Revision 0a2832fb
Added by Jakub Vašta almost 5 years ago
modules/crawler/DatasetConfigs/JIS.yaml
@@ -1,4 +1,6 @@
 # name of the dataset, under which it is displayed in the application
+display-name: Snímače JIS
+# single-word name of the dataset, under which it is represented in the architecture
 dataset-name: JIS
 # root folder that contains links to the dataset
 url: https://openstore.zcu.cz/
modules/crawler/DatasetConfigs/KOLOBEZKY.yaml
@@ -1,4 +1,6 @@
 # name of the dataset, under which it is displayed in the application
+display-name: Půjčování koloběžek
+# single-word name of the dataset, under which it is represented in the architecture
 dataset-name: KOLOBEZKY
 # root folder that contains links to the dataset
 url: https://openstore.zcu.cz/
modules/crawler/DatasetConfigs/WIFI.yaml
@@ -1,4 +1,6 @@
 # name of the dataset, under which it is displayed in the application
+display-name: Wi-Fi síť ZČU
+# single-word name of the dataset, under which it is represented in the architecture
 dataset-name: WIFI
 # root folder that contains links to the dataset
 url: https://openstore.zcu.cz/
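Each of these configs is plain YAML that the crawler reads into a dict; the new display-name key is what DatabaseLoader.check_or_update_datasets_collection consumes further below. A minimal reading sketch, assuming PyYAML (the crawler's own loader is not shown in this diff):

import yaml  # PyYAML is an assumption; the crawler's actual loader is not part of this commit

config_text = """\
# name of the dataset, under which it is displayed in the application
display-name: Snímače JIS
# single-word name of the dataset, under which it is represented in the architecture
dataset-name: JIS
# root folder that contains links to the dataset
url: https://openstore.zcu.cz/
"""

config = yaml.safe_load(config_text)
print(config["display-name"])  # Snímače JIS
print(config["dataset-name"])  # JIS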
modules/crawler/DatasetProcessing/WIFIProcessor.py
@@ -32,7 +32,7 @@
             date_dict[date] = dict()

         if name in date_dict[date]:
-            date_dict[date][name].occurrence += int(occurrence)
+            date_dict[date][name].occurrence = max(date_dict[date][name].occurrence, int(occurrence))
         else:
             date_dict[date][name] = CSVDataLine.CSVDataLine(name, date, int(occurrence))

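The hunk above changes how the Wi-Fi processor combines duplicate rows for the same device and date: instead of summing their occurrences, it keeps the largest value seen. A self-contained sketch of that accumulation, with hypothetical sample rows and CSVDataLine reduced to a bare data holder:

# CSVDataLine reduced to a plain data holder for this sketch
class CSVDataLine:
    def __init__(self, name, date, occurrence):
        self.name = name
        self.date = date
        self.occurrence = occurrence

# hypothetical rows in the diff's column order: name, occurrence, date
rows = [("ap-1", "5", "2020-04-01"), ("ap-1", "9", "2020-04-01"), ("ap-2", "3", "2020-04-01")]

date_dict = {}
for name, occurrence, date in rows:
    if date not in date_dict:
        date_dict[date] = dict()

    if name in date_dict[date]:
        # keep the highest occurrence seen for this device on this date
        date_dict[date][name].occurrence = max(date_dict[date][name].occurrence, int(occurrence))
    else:
        date_dict[date][name] = CSVDataLine(name, date, int(occurrence))

print(date_dict["2020-04-01"]["ap-1"].occurrence)  # 9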
modules/crawler/Pipeline.py
@@ -152,15 +152,57 @@
     # get all unprocessed files from dataset
     not_loaded_files = FolderProcessor.list_of_all_files(PROCESSED_DATA_PATH + dataset_path)

+    database_connection = DatabaseLoader.create_database_connection()
+
+    DatabaseLoader.check_or_update_datasets_collection(database_connection, config)
+
+    DatabaseLoader.update_devices_collection(database_connection, config)
+
+
+    # load every file
+    for not_loaded_file in not_loaded_files:
+        # check if the file is already in the database; if so, skip it
+        test = DatabaseLoader.check_if_database_doesnt_contain_file(database_connection, dataset_name, not_loaded_file)
+        if not test:
+            logging.info(dataset_name + " could not load " + not_loaded_file + " to database because it is already there.")
+            FolderProcessor.update_ignore_set(PROCESSED_DATA_PATH + dataset_path, not_loaded_file)
+            continue
+        # load processed data
+        processed_data = DatabaseLoader.get_data_from_file(not_loaded_file, config)
+        # load processed data to database
+        DatabaseLoader.load_data_to_database(database_connection, dataset_name, processed_data, not_loaded_file)
+        FolderProcessor.update_ignore_set(PROCESSED_DATA_PATH + dataset_path, not_loaded_file)
+
+    logging.info(dataset_name + " has loaded to database " + str(len(not_loaded_files)) + " newly processed files.")
+
+
+def load_data_to_database_crone(config):
+    """
+    Goes through every not yet loaded file (not contained in ProcessedData/ignore.txt),
+    loads the data, appends coordinates from the configuration
+    and exports it into the database.
+    After a successful export it updates ignore.txt.
+
+    Args:
+        config: loaded configuration file of dataset
+    """
+    dataset_name = config["dataset-name"]
+    dataset_path = dataset_name + '/'
+
+    # get all unprocessed files from dataset
+    not_loaded_files = FolderProcessor.list_of_all_files(PROCESSED_DATA_PATH + dataset_path)
+
+    database_connection = DatabaseLoader.create_database_connection()
+
     # load every file
     for not_loaded_file in not_loaded_files:
         # load processed data
         processed_data = DatabaseLoader.get_data_from_file(not_loaded_file, config)
         # load processed data to database
-        DatabaseLoader.load_data_to_database(dataset_name, processed_data)
+        DatabaseLoader.load_data_to_database(database_connection, dataset_name, processed_data, not_loaded_file)
         FolderProcessor.update_ignore_set(PROCESSED_DATA_PATH + dataset_path, not_loaded_file)

-    logging.info(dataset_name + " has loaded to databse " + str(len(not_loaded_files)) + " newly processed files.")
+    logging.info(dataset_name + " has loaded to database " + str(len(not_loaded_files)) + " newly processed files.")


 def run_full_pipeline(dataset_name):
@@ -185,7 +227,6 @@
     load_data_to_database(config)


-
 def run_full_pipeline_crone(dataset_name):
     """
     Loads config file and starts full pipeline
@@ -207,5 +248,5 @@
     validation_test = validate_process_data(config)

     if validation_test:
-        load_data_to_database(config)
+        load_data_to_database_crone(config)

modules/crawler/PrepareNewDataset.py
@@ -24,6 +24,8 @@
     with open(CONFIG_FILES_PATH + "/" + dataset_name + ".yaml", "w") as file:
         file.write("# name of the dataset, under which it is displayed in the application\n")
         file.write("display-name: " + dataset_name + "\n")
+        file.write("# single-word name of the dataset, under which it is represented in the architecture\n")
+        file.write("dataset-name: " + dataset_name + "\n")
         file.write("# root folder that contains links to the dataset\n")
         file.write("url: INSERT URL HERE\n")
         file.write("# optional parameter that specifies the name pattern of the dataset files to download\n")
modules/crawler/Utilities/Database/DatabaseLoader.py
@@ -11,6 +11,10 @@
 MONGODB_DATA_DATABASE = "open-data-db"
 # mongodb collection with available datasets
 MONGODB_DATASET_COLLECTION = "DATASETS"
+# mongodb collection with available devices of datasets
+MONGODB_DATASET_DEVICES_COLLECTION = "DEVICES"
+# mongodb collection with files already loaded into the database
+MONGODB_DATASET_LOADED_FILES_COLLECTION = "FILES"

 # Path to processed data
 PROCESSED_DATA_PATH = "ProcessedData/"
@@ -63,6 +67,9 @@

         name = csv_column[0]

+        if devices[name]["x"] == "SKIP" or devices[name]["y"] == "SKIP":
+            continue
+
         occurrence = csv_column[1]
         date = csv_column[2]
         database_data_line = DatabaseDataLine.DatabaseDataLine(
@@ -79,29 +86,98 @@
     return date_dict


-def load_data_to_database(dataset_name, data_dic):
+def load_data_to_database(database_connection, dataset_name, data_dic, file_name):
     """
     Takes data_dic created in method get_data_from_file
     and loads it into the database, where the collection name is dataset_name + data_dic key
     and the data lines are the records in the collection

     Args:
+        database_connection: created connection to a MONGODB
         dataset_name: name of dataset that has existing configuration file
         data_dic: dictionary of data lines created in get_data_from_file
+        file_name: name of the file containing the data
     """
-    database = create_database_connection()
+    for date in data_dic:
+        dataset_collections = database_connection[dataset_name]
+        dataset_collections.insert_one({'name': dataset_name + date})
+        date_dataset = database_connection[dataset_name + date]
+        date_dataset.insert_many(data_dic[date])
+
+    collection_loaded_files = database_connection[dataset_name + MONGODB_DATASET_LOADED_FILES_COLLECTION]
+    collection_loaded_files.insert_one({'file': file_name})
+
+
+def check_or_update_datasets_collection(database_connection, config):
+    """
+    Checks whether the DATASETS collection contains the dataset
+    and whether its display name is up to date

+    Args:
+        database_connection: created connection to a MONGODB
+        config: loaded configuration file of dataset
+    """
     # collection where available datasets are specified
-    collection_datasets = database[MONGODB_DATASET_COLLECTION]
+    collection_datasets = database_connection[MONGODB_DATASET_COLLECTION]
+
+    dataset_name = config['dataset-name']
+    display_name = config['display-name']
+
+    query = {'key-name': dataset_name}

     # check if newly added data already have a dataset specified in collection
-    dataset_present = collection_datasets.find_one({'name': dataset_name})
+    dataset_present = collection_datasets.find_one(query)

     if dataset_present is None:
-        collection_datasets.insert_one({'name': dataset_name})
+        collection_datasets.insert_one({'key-name': dataset_name, 'display-name': display_name})
+    elif dataset_present['display-name'] != display_name:
+        newvalues = {"$set": {'display-name': display_name}}
+        collection_datasets.update_one(query, newvalues)

-    for date in data_dic:
-        dataset_collections = database[dataset_name]
-        dataset_collections.insert_one({'name': dataset_name + date})
-        date_dataset = database[dataset_name + date]
-        date_dataset.insert_many(data_dic[date])
+
+def update_devices_collection(database_connection, config):
+    """
+    Rebuilds the dataset devices collection so that it contains
+    every device from the current configuration
+
+    Args:
+        database_connection: created connection to a MONGODB
+        config: loaded configuration file of dataset
+    """
+    dataset_name = config['dataset-name']
+
+    collection_devices = database_connection[dataset_name + MONGODB_DATASET_DEVICES_COLLECTION]
+
+    collection_devices.delete_many({})
+
+    devices = config['devices']
+
+    devices_list = list()
+
+    for device in devices.keys():
+        if devices[device]['x'] != "SKIP" and devices[device]['y'] != "SKIP":
+            devices_list.append({'name': device, 'x': devices[device]['x'], 'y': devices[device]['y']})
+
+    collection_devices.insert_many(devices_list)
+
+
+def check_if_database_doesnt_contain_file(database_connection, dataset_name, file_name):
+    """
+    Checks whether the database does not already contain the given file
+
+    Args:
+        database_connection: created connection to a MONGODB
+        dataset_name: name of dataset that has existing configuration file
+        file_name: checked file name
+    """
+    collection_loaded_files = database_connection[dataset_name + MONGODB_DATASET_LOADED_FILES_COLLECTION]
+
+    query = {'file': file_name}
+
+    # check if the file has already been loaded into the database
+    dataset_present = collection_loaded_files.find_one(query)
+
+    if dataset_present is None:
+        return True
+    else:
+        return False
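The key-name/display-name handling above is a manual find-then-insert-or-update. A minimal standalone sketch of the same logic against pymongo, assuming a local MongoDB (the URI is an illustration; the real connection comes from create_database_connection()):

from pymongo import MongoClient

# assumed local connection; in the module this comes from create_database_connection()
database_connection = MongoClient("mongodb://localhost:27017/")["open-data-db"]
collection_datasets = database_connection["DATASETS"]

# hypothetical config values mirroring WIFI.yaml above
dataset_name = "WIFI"
display_name = "Wi-Fi síť ZČU"

query = {"key-name": dataset_name}
dataset_present = collection_datasets.find_one(query)

if dataset_present is None:
    # first sighting of this dataset: register both names
    collection_datasets.insert_one({"key-name": dataset_name, "display-name": display_name})
elif dataset_present["display-name"] != display_name:
    # dataset already registered, but the display name changed in the config
    collection_datasets.update_one(query, {"$set": {"display-name": display_name}})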
Re #8089
- added a new variable to the config file
- reworked the DATASETS table in the database
- it now contains "key-name" as the key and "display-name" for display
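For illustration, a document in the reworked DATASETS collection would then look roughly like this (values taken from WIFI.yaml above):

# approximate shape of one DATASETS document after this commit
dataset_document = {
    "key-name": "WIFI",               # single-word name, used as the lookup key
    "display-name": "Wi-Fi síť ZČU",  # human-readable name shown in the application
}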