Revize 27eee939
Přidáno uživatelem Tomáš Ballák před více než 3 roky
modules/crawler/pipeline.py | ||
---|---|---|
298 | 298 |
dataset_name: name of dataset that has existing configuration file |
299 | 299 |
""" |
300 | 300 |
logging.info("Starting pipeline for dataset " + dataset_name) |
301 |
print("Zpracovávám dataset " + dataset_name + |
|
302 |
", průběh lze sledovat v logu umístěném v adresáři CrawlerLogs") |
|
301 |
try: |
|
302 |
print("Zpracovávám dataset " + dataset_name + |
|
303 |
", průběh lze sledovat v logu umístěném v adresáři CrawlerLogs") |
|
303 | 304 |
|
304 |
config = configure_functions.load_configuration(dataset_name) |
|
305 |
crawl_data(config) |
|
306 |
process_data(config) |
|
305 |
config = configure_functions.load_configuration(dataset_name)
|
|
306 |
crawl_data(config)
|
|
307 |
process_data(config)
|
|
307 | 308 |
|
308 |
validation_test = validate_process_data(config) |
|
309 |
validation_test = validate_process_data(config)
|
|
309 | 310 |
|
310 |
if validation_test: |
|
311 |
load_data_to_database(config) |
|
311 |
if validation_test: |
|
312 |
load_data_to_database(config) |
|
313 |
except Exception as e: |
|
314 |
logging.error("Pipeline for " + dataset_name + " failed") |
|
315 |
logging.error(e) |
|
312 | 316 |
|
313 | 317 |
|
314 | 318 |
def run_full_pipeline_crone(dataset_name: str) -> None: |
... | ... | |
323 | 327 |
""" |
324 | 328 |
logging.info("Starting pipeline for dataset " + dataset_name) |
325 | 329 |
|
326 |
config = configure_functions.load_configuration(dataset_name) |
|
327 |
update_test = check_last_update(config) |
|
328 |
if update_test: |
|
329 |
crawl_data(config) |
|
330 |
process_data_crone(config["dataset-name"]) |
|
330 |
try: |
|
331 |
config = configure_functions.load_configuration(dataset_name) |
|
332 |
update_test = check_last_update(config) |
|
333 |
if update_test: |
|
334 |
crawl_data(config) |
|
335 |
process_data_crone(config["dataset-name"]) |
|
331 | 336 |
|
332 |
validation_test = validate_process_data(config) |
|
337 |
validation_test = validate_process_data(config)
|
|
333 | 338 |
|
334 |
if validation_test: |
|
335 |
load_data_to_database_crone(config) |
|
339 |
if validation_test: |
|
340 |
load_data_to_database_crone(config) |
|
341 |
except: |
|
342 |
print("Pipeline for dataset " + dataset_name + " failed") |
Také k dispozici: Unified diff
Fix for a bad dataset configuration: pipeline failures are now caught and reported instead of propagating