Revision 0a2832fb

Added by Jakub Vašta about 4 years ago

Re #8089
- added a new variable to the config file
- reworked the database table named DATASETS
- it now contains "key-name" as the key and "display-name" for display
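
For illustration, here is a minimal pymongo sketch (my addition, not part of the commit) of the reworked DATASETS documents and the check-or-update logic added in DatabaseLoader.py below; the database and collection names are taken from the diff, the dataset values are examples:

# Minimal sketch; assumes pymongo and a local MongoDB instance.
# Names from the diff: database "open-data-db", collection "DATASETS".
from pymongo import MongoClient

datasets = MongoClient("localhost", 27017)["open-data-db"]["DATASETS"]

entry = datasets.find_one({"key-name": "WIFI"})
if entry is None:
    # dataset not registered yet: store the key and its display name
    datasets.insert_one({"key-name": "WIFI", "display-name": "Wi-Fi síť ZČU"})
elif entry["display-name"] != "Wi-Fi síť ZČU":
    # display name changed in the config: update it in place
    datasets.update_one({"key-name": "WIFI"},
                        {"$set": {"display-name": "Wi-Fi síť ZČU"}})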

View differences:

modules/crawler/DatasetConfigs/JIS.yaml
 # jmeno datasetu, pod kterym bude zobrazen v aplikaci
+display-name: Snímače JIS
+# jednoslovný název datasetu, pod kterym bude reprezentovana v architektuře
 dataset-name: JIS
 # root slozka, ktera obsahuje odkazy na dataset
 url: https://openstore.zcu.cz/
modules/crawler/DatasetConfigs/KOLOBEZKY.yaml
 # jmeno datasetu, pod kterym bude zobrazen v aplikaci
+display-name: Půjčování koloběžek
+# jednoslovný název datasetu, pod kterym bude reprezentovana v architektuře
 dataset-name: KOLOBEZKY
 # root slozka, ktera obsahuje odkazy na dataset
 url: https://openstore.zcu.cz/
modules/crawler/DatasetConfigs/WIFI.yaml
 # jmeno datasetu, pod kterym bude zobrazen v aplikaci
+display-name: Wi-Fi síť ZČU
+# jednoslovný název datasetu, pod kterym bude reprezentovana v architektuře
 dataset-name: WIFI
 # root slozka, ktera obsahuje odkazy na dataset
 url: https://openstore.zcu.cz/
modules/crawler/DatasetProcessing/WIFIProcessor.py
                 date_dict[date] = dict()
 
             if name in date_dict[date]:
-                date_dict[date][name].occurrence += int(occurrence)
+                date_dict[date][name].occurrence += max(date_dict[date][name].occurrence,int(occurrence))
             else:
                 date_dict[date][name] = CSVDataLine.CSVDataLine(name, date, int(occurrence))
 
modules/crawler/Pipeline.py
     # get all unprocessed files from dataset
     not_loaded_files = FolderProcessor.list_of_all_files(PROCESSED_DATA_PATH + dataset_path)
 
+    database_connection = DatabaseLoader.create_database_connection()
+
+    DatabaseLoader.check_or_update_datasets_collection(database_connection,config)
+
+    DatabaseLoader.update_devices_collection(database_connection,config)
+
+
+    # load every file
+    for not_loaded_file in not_loaded_files:
+        #check if file is not in database already if it is skip
+        test = DatabaseLoader.check_if_database_doesnt_contain_file(database_connection,dataset_name,not_loaded_file)
+        if test == False:
+            logging.info(dataset_name + " could not load " + not_loaded_file + " to database because he is already there.")
+            FolderProcessor.update_ignore_set(PROCESSED_DATA_PATH + dataset_path, not_loaded_file)
+            continue
+        # load processed data
+        processed_data = DatabaseLoader.get_data_from_file(not_loaded_file, config)
+        # load processed data to database
+        DatabaseLoader.load_data_to_database(database_connection, dataset_name, processed_data, not_loaded_file)
+        FolderProcessor.update_ignore_set(PROCESSED_DATA_PATH + dataset_path, not_loaded_file)
+
+    logging.info(dataset_name + " has loaded to database " + str(len(not_loaded_files)) + " newly processed files.")
+
+
+def load_data_to_database_crone(config):
+    """
+    Goes trough every not loaded file(not contained in ProcessedData/ignore.txt)
+    loads data appends coordination from configurations
+    and exports it into the database
+    After successful exporting updates ignore.txt
+
+    Args:
+        config: loaded configuration file of dataset
+    """
+    dataset_name = config["dataset-name"]
+    dataset_path = dataset_name + '/'
+
+    # get all unprocessed files from dataset
+    not_loaded_files = FolderProcessor.list_of_all_files(PROCESSED_DATA_PATH + dataset_path)
+
+    database_connection = DatabaseLoader.create_database_connection()
+
     # load every file
     for not_loaded_file in not_loaded_files:
         # load processed data
         processed_data = DatabaseLoader.get_data_from_file(not_loaded_file, config)
         # load processed data to database
-        DatabaseLoader.load_data_to_database(dataset_name, processed_data)
+        DatabaseLoader.load_data_to_database(database_connection, dataset_name, processed_data, not_loaded_file)
         FolderProcessor.update_ignore_set(PROCESSED_DATA_PATH + dataset_path, not_loaded_file)
 
-    logging.info(dataset_name + " has loaded to databse " + str(len(not_loaded_files)) + " newly processed files.")
+    logging.info(dataset_name + " has loaded to database " + str(len(not_loaded_files)) + " newly processed files.")
 
 
 def run_full_pipeline(dataset_name):
......
         load_data_to_database(config)
 
 
-
 def run_full_pipeline_crone(dataset_name):
     """
     Loads config file and starts full pipeline
......
         validation_test = validate_process_data(config)
 
         if validation_test:
-            load_data_to_database(config)
+            load_data_to_database_crone(config)
             
modules/crawler/PrepareNewDataset.py
     with open(CONFIG_FILES_PATH + "/" + dataset_name + ".yaml", "w") as file:
         file.write("# jmeno datasetu, pod kterym bude zobrazen v aplikaci\n")
         file.write("dataset-name: " + dataset_name + "\n")
+        file.write("# jednoslovný název datasetu, pod kterym bude reprezentovana v architektuře\n")
+        file.write("dataset-name: " + dataset_name + "\n")
         file.write("# root slozka, ktera obsahuje odkazy na dataset\n")
         file.write("url: ZDE VLOZTE URL\n")
         file.write("# volitelny parameter, ktery specifikuje vzor jmrna datasetu, ktera se budou stahovat\n")
modules/crawler/Utilities/Database/DatabaseLoader.py
 MONGODB_DATA_DATABASE = "open-data-db"
 # mongodb collection with aviable datasets
 MONGODB_DATASET_COLLECTION = "DATASETS"
+# mongodb collection with aviable diveces of datasets
+MONGODB_DATASET_DEVICES_COLLECTION = "DEVICES"
+# mongodb collection with aviable diveces of datasets
+MONGODB_DATASET_LOADED_FILES_COLLECTION = "FILES"
 
 # Path to processed data
 PROCESSED_DATA_PATH = "ProcessedData/"
......
 
         name = csv_column[0]
 
+        if devices[name]["x"] == "SKIP" or devices[name]["y"] == "SKIP":
+            continue
+
         occurrence = csv_column[1]
         date = csv_column[2]
         database_data_line = DatabaseDataLine.DatabaseDataLine(
......
     return date_dict
 
 
-def load_data_to_database(dataset_name, data_dic):
+def load_data_to_database(database_connection,dataset_name, data_dic, file_name):
     """
     Takes data_dic created in method get_data_from_file
     and loads into into database where collection name is dataset_name + data_dic key
     and data lines are line in collection
 
     Args:
-        dataset_name: name of dataset that has existing configuration file
+        database_connection: created connection to a MONGODB
+        config: loaded configuration file of dataset
         data_dic: dictionary of data lines created in get_data_from_file
+        file_name: name of file containing data
+    """
+
+    for date in data_dic:
+        dataset_collections = database_connection[dataset_name]
+        dataset_collections.insert_one({'name': dataset_name + date})
+        date_dataset = database_connection[dataset_name + date]
+        date_dataset.insert_many(data_dic[date])
+
+    collection_loaded_files = database_connection[dataset_name + MONGODB_DATASET_LOADED_FILES_COLLECTION]
+    collection_loaded_files.insert_one({'file': file_name})
+
+
+
+def check_or_update_datasets_collection(database_connection,config):
     """
-    database = create_database_connection()
+    Checks if DATASETS collection contains dataset and if display name was not updated
 
+    Args:
+        database_connection: created connection to a MONGODB
+        config: loaded configuration file of dataset
+    """
     # collection where are specified aviable datasets
-    collection_datasets = database[MONGODB_DATASET_COLLECTION]
+    collection_datasets = database_connection[MONGODB_DATASET_COLLECTION]
+
+    dataset_name = config['dataset-name']
+    display_name = config['display-name']
+
+    query = {'key-name': dataset_name}
 
     # check if newly added data already have a dataset specified in collection
-    dataset_present = collection_datasets.find_one({'name': dataset_name})
+    dataset_present = collection_datasets.find_one(query)
 
     if dataset_present is None:
-        collection_datasets.insert_one({'name': dataset_name})
+        collection_datasets.insert_one({'key-name': dataset_name, 'display-name': display_name})
+    elif dataset_present['display-name'] != display_name:
+        newvalues = { "$set": { 'display-name': display_name } }
+        collection_datasets.update_one(query, newvalues)
 
-    for date in data_dic:
-        dataset_collections = database[dataset_name]
-        dataset_collections.insert_one({'name': dataset_name + date})
-        date_dataset = database[dataset_name + date]
-        date_dataset.insert_many(data_dic[date])
+
+def update_devices_collection(database_connection,config):
+    """
+    Checks if dataset_name collection contains every device with current set up
+
+    Args:
+        database_connection: created connection to a MONGODB
+        config: loaded configuration file of dataset
+    """
+    dataset_name = config['dataset-name']
+
+    collection_devices = database_connection[dataset_name + MONGODB_DATASET_DEVICES_COLLECTION]
+
+    collection_devices.delete_many({})
+
+    devices = config['devices']
+
+    devices_list = list()
+
+    for device in devices.keys():
+        if devices[device]['x'] != "SKIP" or devices[device]['y'] != "SKIP":
+            devices_list.append({'name': device , 'x': devices[device]['x'] , 'y': devices[device]['y'] })
+
+    collection_devices.insert_many(devices_list)
+
+
+def check_if_database_doesnt_contain_file(database_connection,dataset_name,file_name):
+    """
+    Checks if dataset_name collection contains every device with current set up
+
+    Args:
+        database_connection: created connection to a MONGODB
+        filename: checked file name
+    """
+
+    collection_loaded_files = database_connection[dataset_name + MONGODB_DATASET_LOADED_FILES_COLLECTION]
+
+    query = {'file': file_name}
+
+    # check if newly added data already have a dataset specified in collection
+    dataset_present = collection_loaded_files.find_one(query)
+
+    if dataset_present is None:
+        return True
+    else:
+        return False
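
For context, here is a minimal sketch (my addition, not part of the commit) of how the new per-dataset FILES bookkeeping prevents loading the same file twice. It assumes pymongo and a local MongoDB; "open-data-db" and the "FILES" collection suffix come from the diff above, while the helper name and the CSV file name are hypothetical:

# Minimal sketch; assumes pymongo and a local MongoDB instance.
from pymongo import MongoClient

database_connection = MongoClient("localhost", 27017)["open-data-db"]

def database_contains_file(dataset_name, file_name):
    # A file counts as loaded once its name is recorded in <dataset>FILES,
    # mirroring check_if_database_doesnt_contain_file (with the sense flipped).
    loaded_files = database_connection[dataset_name + "FILES"]
    return loaded_files.find_one({"file": file_name}) is not None

# Pipeline.load_data_to_database skips files that are already recorded
# ("WIFI_2019.csv" is a made-up example name):
if database_contains_file("WIFI", "WIFI_2019.csv"):
    print("already loaded, skipping")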
