from Utilities import FolderProcessor, ConfigureFunctions
from Utilities.Database import DatabaseLoader

import logging
from datetime import date


# Path to crawled data
CRAWLED_DATA_PATH = "CrawledData/"
# Path to processed data
PROCESSED_DATA_PATH = "ProcessedData/"
# Path to crawler logs
CRAWLER_LOGS_PATH = "CrawlerLogs/"
# Module prefix for dataset crawler implementations
CRAWLER_LIB_PATH = "DatasetCrawler."
# Module prefix for dataset processor implementations
PROCESSOR_LIB_PATH = "DatasetProcessing."
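
# Illustrative sketch (assumption, not part of the original module): the functions
# below expect the configuration returned by ConfigureFunctions.load_configuration
# to expose at least the keys used in this file. For a hypothetical dataset named
# "JIS" it might look like:
#
#   {
#       "dataset-name": "JIS",        # locates crawler/processor modules and data folders
#       "update-period": "7",         # days between scheduled updates (parsed with int())
#       "devices": {
#           "device1": {"x": "50.07", "y": "14.43"},  # coordinates; "UNKNOWN!" marks missing values
#       }
#   }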


# Logger configuration: one shared application log file per month
logging.basicConfig(filename=CRAWLER_LOGS_PATH + "CommonRecords/" + 'Applicationlog-' + date.today().strftime("%b-%Y") + '.log',
                    level=logging.INFO,
                    format='%(asctime)s %(message)s'
                    )


def check_last_update(config):
    """
    Loads an integer from updated.txt in CrawlerLogs/"dataset_name"
    representing the number of days since the last update. If that number
    has reached the update period given in the config, the dataset is due
    for an update and the counter is reset to zero; otherwise the counter
    is incremented.

    Arguments:
        config: loaded configuration file of dataset

    Returns:
        True if the dataset should be updated now,
        False if only the day counter was incremented
    """
    dataset_name = config["dataset-name"]

    with open(CRAWLER_LOGS_PATH + dataset_name + "/" + "updated.txt", "r+") as file:
        last_update = int(file.read())
        file.seek(0)

        config_update_period = int(config["update-period"])

        if config_update_period <= last_update:
            logging.info("Dataset " + dataset_name + " is being updated today")
            file.write("0")
            file.truncate()
            return True
        else:
            last_update_days = last_update + 1
            logging.info("Dataset " + dataset_name + " will be updated in " +
                         str(config_update_period - last_update_days) + " days")
            file.write(str(last_update_days))
            file.truncate()
            return False



def crawl_data(config):
    """
    Imports the dataset crawler from DatasetCrawler/"dataset_name"Crawler.py
    and runs its crawl function.

    Args:
        config: loaded configuration file of dataset
    """
    dataset_name = config["dataset-name"]

    crawl_func = __import__(CRAWLER_LIB_PATH + dataset_name + "Crawler", globals(), locals(), ['crawl']).crawl
    crawl_func(config)
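
# Illustrative sketch (assumption, not from the original module) of the interface
# crawl_data expects a crawler module to expose, for a hypothetical dataset "JIS":
#
#   # DatasetCrawler/JISCrawler.py
#   def crawl(config):
#       """Download new raw files into CrawledData/JIS/."""
#       ...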


def process_data(dataset_name):
    """
    Goes through every unprocessed file (i.e. not listed in CrawledData/"dataset_name"/ignore.txt),
    imports the dataset processor from DatasetProcessing/"dataset_name"Processor.py
    and runs it on every such file.
    After successful processing updates ignore.txt.

    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    dataset_path = dataset_name + '/'

    process_file_func = __import__(PROCESSOR_LIB_PATH + dataset_name + "Processor", globals(), locals(),
                                   ['process_file']).process_file

    not_processed_files = FolderProcessor.list_of_all_files(CRAWLED_DATA_PATH + dataset_path)
    logging.info(dataset_name + " has downloaded " + str(len(not_processed_files)) + " new files")

    for not_processed_file in not_processed_files:
        process_file_func(CRAWLED_DATA_PATH + dataset_path + not_processed_file)
        FolderProcessor.update_ignore_set(CRAWLED_DATA_PATH + dataset_path, not_processed_file)

    logging.info(dataset_name + " has processed " + str(len(not_processed_files)) + " newly crawled files")
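
# Illustrative sketch (assumption, not from the original module) of the interface
# process_data expects a processor module to expose, for a hypothetical dataset "JIS":
#
#   # DatasetProcessing/JISProcessor.py
#   def process_file(file_path):
#       """Parse one crawled file and write its processed form into ProcessedData/JIS/."""
#       ...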


def validate_process_data(config):
    """
    Goes through the newly processed data and checks its status.

    Args:
        config: loaded configuration file of dataset

    Returns:
        True if the data was processed correctly,
        False on wrong format or new unknown devices
    """
    processed_devices_set = FolderProcessor.get_devices_set(PROCESSED_DATA_PATH + config["dataset-name"] + '/')
    unknown_devices_set = FolderProcessor.get_unknown_devices_set(config, processed_devices_set)
    unknown_devices_size = len(unknown_devices_set)

    if unknown_devices_size != 0:
        logging.info("There are " + str(unknown_devices_size) + " unknown devices")
        logging.info("Adding devices to " + config["dataset-name"] + " config file")
        ConfigureFunctions.update_configuration(config["dataset-name"], unknown_devices_set)
        return False

    for device in config["devices"]:
        device = config["devices"][device]
        if device["x"] == "UNKNOWN!" or device["y"] == "UNKNOWN!":
            logging.info(config["dataset-name"] + " config file contains devices with UNKNOWN! values, please update them!")
            return False

    return True


def load_data_to_database(config):
    """
    Goes through every file that has not been loaded yet (i.e. not listed in ProcessedData/ignore.txt),
    loads its data, appends coordinates from the configuration
    and exports it into the database.
    After successful exporting updates ignore.txt.

    Args:
        config: loaded configuration file of dataset
    """
    dataset_name = config["dataset-name"]
    dataset_path = dataset_name + '/'

    # Get all files from the dataset that have not been loaded yet
    not_loaded_files = FolderProcessor.list_of_all_files(PROCESSED_DATA_PATH + dataset_path)

    # Load every file
    for not_loaded_file in not_loaded_files:
        # Load processed data
        processed_data = DatabaseLoader.get_data_from_file(not_loaded_file, config)
        # Load processed data into the database
        DatabaseLoader.load_data_to_database(dataset_name, processed_data)
        FolderProcessor.update_ignore_set(PROCESSED_DATA_PATH + dataset_path, not_loaded_file)

    logging.info(dataset_name + " has loaded " + str(len(not_loaded_files)) + " newly processed files to the database.")


def run_full_pipeline(dataset_name):
    """
    Loads the config file and runs the full pipeline:
    - crawl data
    - process data
    - load data to database

    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    logging.info("Starting pipeline for dataset " + dataset_name)

    config = ConfigureFunctions.load_configuration(dataset_name)
    crawl_data(config)
    process_data(config["dataset-name"])

    validation_test = validate_process_data(config)

    if validation_test:
        load_data_to_database(config)


def run_full_pipeline_crone(dataset_name):
    """
    Intended for scheduled (cron) runs. Loads the config file, checks the update
    counter and, if the dataset is due for an update, runs the full pipeline:
    - crawl data
    - process data
    - load data to database

    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    logging.info("Starting pipeline for dataset " + dataset_name)

    config = ConfigureFunctions.load_configuration(dataset_name)
    update_test = check_last_update(config)
    if update_test:
        crawl_data(config)
        process_data(config["dataset-name"])

        validation_test = validate_process_data(config)

        if validation_test:
            load_data_to_database(config)
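

# Illustrative entry point (assumption, not part of the original module): these
# pipeline functions are presumably invoked from an external script or scheduler.
# "JIS" below is a hypothetical dataset name that would need an existing
# configuration file.
if __name__ == "__main__":
    run_full_pipeline("JIS")
    # For scheduled runs that honour the configured update-period:
    # run_full_pipeline_crone("JIS")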