Revize 753d424e
Přidáno uživatelem Petr Hlaváč asi před 4 lety
modules/crawler/pipeline.py | ||
---|---|---|
86 | 86 |
dataset_name = config["dataset-name"] |
87 | 87 |
dataset_path = dataset_name + '/' |
88 | 88 |
|
89 |
changes_in_devices = database_loader.update_devices_collection(config) |
|
90 |
|
|
91 |
if changes_in_devices == True: |
|
92 |
logging.info(dataset_name + " contains changes in devices configuration. Deleteing old data and preparing new") |
|
93 |
database_loader.reset_dataset_database(dataset_name) |
|
94 |
folder_processor.clean_folder(PROCESSED_DATA_PATH + dataset_path) |
|
95 |
|
|
96 |
|
|
97 | 89 |
process_file_func = __import__(PROCESSOR_LIB_PATH + dataset_name + "_processor", globals(), locals(), |
98 | 90 |
['process_file']).process_file |
99 | 91 |
|
... | ... | |
194 | 186 |
|
195 | 187 |
database_loader.check_or_update_datasets_collection(database_connection,config) |
196 | 188 |
|
189 |
changes_in_devices = database_loader.update_devices_collection(config) |
|
190 |
|
|
191 |
if changes_in_devices == True: |
|
192 |
logging.info(dataset_name + " contains changes in devices configuration. Deleting old data and preparing new") |
|
193 |
database_loader.reset_dataset_database(dataset_name) |
|
194 |
|
|
197 | 195 |
# get all unprocessed files from dataset |
198 | 196 |
ignore_set = database_record_logs.load_ignore_set_loaded(dataset_name) |
199 | 197 |
not_loaded_files = folder_processor.list_of_all_new_files(ignore_set,PROCESSED_DATA_PATH + dataset_path) |
... | ... | |
261 | 259 |
dataset_name: name of dataset that has existing configuration file |
262 | 260 |
""" |
263 | 261 |
logging.info("Starting pipeline for dataset " + dataset_name) |
264 |
print("Processing dataset " + dataset_name + " you can watch the progress in log contained in CrawlerLogs folder")
|
|
262 |
print("Zpracovávám dataset " + dataset_name + " průběh lze sledovat v logu umístěném v in CrawlerLogs folder")
|
|
265 | 263 |
|
266 | 264 |
config = configure_functions.load_configuration(dataset_name) |
267 | 265 |
crawl_data(config) |
Také k dispozici: Unified diff
Předělány komentáře, upravena kontrola datasetu