Revision 34baf808
Added by Petr Hlaváč almost 5 years ago

modules/crawler/CrawledData/JIS/ignore.txt

    ignore.txt
    OD_ZCU_JIS_10_2019.CSV
    OD_ZCU_JIS_03_2020.CSV
    OD_ZCU_JIS_02_2020.CSV
    OD_ZCU_JIS_00_2019.CSV
    OD_ZCU_JIS_08_2019.CSV
    OD_ZCU_JIS_12_2019.CSV
    OD_ZCU_JIS_09_2019.CSV
    OD_ZCU_JIS_01_2020.CSV
    OD_ZCU_JIS_06_2019.CSV
    OD_ZCU_JIS_11_2019.CSV
    OD_ZCU_JIS_07_2019.CSV

modules/crawler/CrawledData/KOLOBEZKY/ignore.txt

    ignore.txt
    OD_ZCU_KOLOBEZKY_00_2019.CSV
    OD_ZCU_KOLOBEZKY_06_2019.CSV
    OD_ZCU_KOLOBEZKY_07_2019.CSV
    OD_ZCU_KOLOBEZKY_08_2019.CSV

modules/crawler/CrawledData/WIFI/ignore.txt

    ignore.txt
    OD_ZCU_WIFI_07_2019.CSV
    OD_ZCU_WIFI_00_2019.CSV
    OD_ZCU_WIFI_06_2019.CSV
    OD_ZCU_WIFI_08_2019.CSV

modules/crawler/CrawlerLogs/JIS/ignore.txt

    https://openstore.zcu.cz/OD_ZCU_00_2019/OD_ZCU_JIS_00_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_01_2020/OD_ZCU_JIS_01_2020_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_02_2020/OD_ZCU_JIS_02_2020_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_03_2020/OD_ZCU_JIS_03_2020_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_06_2019/OD_ZCU_JIS_06_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_07_2019/OD_ZCU_JIS_07_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_08_2019/OD_ZCU_JIS_08_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_09_2019/OD_ZCU_JIS_09_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_10_2019/OD_ZCU_JIS_10_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_11_2019/OD_ZCU_JIS_11_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_12_2019/OD_ZCU_JIS_12_2019_CSV.zip

modules/crawler/CrawlerLogs/JIS/updated.txt

    0

modules/crawler/CrawlerLogs/KOLOBEZKY/ignore.txt

    https://openstore.zcu.cz/OD_ZCU_00_2019/OD_ZCU_KOLOBEZKY_00_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_06_2019/OD_ZCU_KOLOBEZKY_06_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_07_2019/OD_ZCU_KOLOBEZKY_07_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_08_2019/OD_ZCU_KOLOBEZKY_08_2019_CSV.zip

modules/crawler/CrawlerLogs/KOLOBEZKY/updated.txt

    0

modules/crawler/CrawlerLogs/WIFI/ignore.txt

    https://openstore.zcu.cz/OD_ZCU_00_2019/OD_ZCU_WIFI_00_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_06_2019/OD_ZCU_WIFI_06_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_07_2019/OD_ZCU_WIFI_07_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_08_2019/OD_ZCU_WIFI_08_2019_CSV.zip

modules/crawler/CrawlerLogs/WIFI/updated.txt

    0

modules/crawler/CroneUpdateScript.py

    import Pipeline
    import os

    # Path to configuration files
    CONFIG_FILES_PATH = "DatasetConfigs/"


    def run_pipeline_for_all_datasets():
        """
        Runs the whole DataScript pipeline for every dataset that has an existing configuration file
        """
        files_in_dir = os.listdir(CONFIG_FILES_PATH)

        for file in files_in_dir:
            name = file.split('.')
            Pipeline.run_full_pipeline_crone(name[0])


    run_pipeline_for_all_datasets()

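CroneUpdateScript.py is meant to be invoked once a day by the scheduler (for example a crontab entry such as 0 3 * * * cd modules/crawler && python3 CroneUpdateScript.py; the exact schedule is an assumption and not part of this commit), so that check_last_update in Pipeline.py can count whole days between updates. A small illustrative sketch of how a config filename is turned into the dataset name passed to run_full_pipeline_crone; the filename and its extension are assumptions:

    # Hypothetical example of the name derivation used in the loop above.
    file = "JIS.yaml"                  # assumed name of a file in DatasetConfigs/
    dataset_name = file.split('.')[0]  # -> "JIS", passed to run_full_pipeline_crone
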
modules/crawler/ForceUpdateDataset.py

    import Pipeline
    import os

    print("Enter the name of the dataset you want to update:\n")
    Pipeline.run_full_pipeline(input())

modules/crawler/ForceUpdateDatasets.py

    import Pipeline
    import os

    # Path to configuration files
    CONFIG_FILES_PATH = "DatasetConfigs/"


    def run_pipeline_for_all_datasets():
        """
        Runs the whole DataScript pipeline for every dataset that has an existing configuration file
        """
        files_in_dir = os.listdir(CONFIG_FILES_PATH)

        for file in files_in_dir:
            name = file.split('.')
            Pipeline.run_full_pipeline(name[0])


    run_pipeline_for_all_datasets()

modules/crawler/Pipeline.py

     CRAWLED_DATA_PATH = "CrawledData/"
     # Path to processed data
     PROCESSED_DATA_PATH = "ProcessedData/"
    +# Path to crawler logs
    +CRAWLER_LOGS_PATH = "CrawlerLogs/"
     # Path to dataset crawler implementations
     CRAWLER_LIB_PATH = "DatasetCrawler."
     # Path to dataset processor implementations
     PROCESSOR_LIB_PATH = "DatasetProcessing."


    +def check_last_update(config):
    +    """
    +    Loads the integer from updated.txt in CrawlerLogs/"dataset_name",
    +    i.e. the number of days since the last update. If it has reached the
    +    update period given in the config, the counter is reset to zero and
    +    the dataset is updated; otherwise the counter is incremented.
    +
    +    Arguments:
    +        config: loaded configuration file of the dataset
    +
    +    Returns:
    +        True if the dataset is being updated,
    +        False if only the day counter was incremented
    +    """
    +    dataset_name = config["dataset-name"]
    +
    +    with open(CRAWLER_LOGS_PATH + dataset_name + "/" + "updated.txt", "r+") as file:
    +        last_update = file.read()
    +        last_update = int(last_update)
    +        file.seek(0)
    +
    +        config_update_period = int(config["update-period"])
    +
    +        if config_update_period <= last_update:
    +            print("Dataset " + dataset_name + " is being updated")
    +            file.write("0")
    +            file.truncate()
    +            return True
    +        else:
    +            last_update_days = last_update + 1
    +            print("Dataset " + dataset_name + " will be updated in " + str(config_update_period - last_update_days) + " days")
    +            file.write(str(last_update_days))
    +            file.truncate()
    +            return False
    +
    +
     def crawl_data(config):
         """
         Imports dataset crawler in DatasetCrawler/"dataset_name"Crawler.py
    ...

         if validation_test:
             load_data_to_database(config)
    +        print("Dataset " + dataset_name + " has been successfully updated\n")
    +
    +
    +def run_full_pipeline_crone(dataset_name):
    +    """
    +    Loads the config file and starts the full pipeline:
    +    - crawl data
    +    - process data
    +    - load data to database
    +
    +    Args:
    +        dataset_name: name of a dataset that has an existing configuration file
    +    """
    +    config = ConfigureFunctions.load_configuration(dataset_name)
    +    update_test = check_last_update(config)
    +    if update_test:
    +        crawl_data(config)
    +        process_data(config["dataset-name"])
    +
    +        validation_test = validate_process_data(config)

    +        if validation_test:
    +            load_data_to_database(config)

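A small illustrative sketch (not part of the commit) of the day counter that check_last_update keeps in CrawlerLogs/"dataset_name"/updated.txt; the period of 7 days and the current counter value are made-up numbers:

    # Illustrative only: semantics of the counter stored in updated.txt.
    update_period = 7        # assumed value of config["update-period"]
    days_since_update = 6    # assumed current content of updated.txt

    if update_period <= days_since_update:
        days_since_update = 0    # the full pipeline runs and the counter is reset
    else:
        days_since_update += 1   # no update today, just count another day
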
modules/crawler/PrepareNewDataset.py

             if text is not None:
                 file.write(text + "\n")

    +def create_updated_file(path):
    +    """
    +    Creates updated file
    +    Args:
    +        path: path to directory for creating updated.txt
    +    """
    +    with open(path + "/updated.txt", "w") as file:
    +        file.write(str(0) + "\n")
    +

     def prepare_dataset_structure(dataset_name):
         """
    ...
         Args:
             dataset_name: Name of newly created dataset
         """
    -    jump_folder = "../"

         # create folder for crawled data
         try:
    ...
             path = CRAWLER_LOGS_PATH + dataset_name
             os.mkdir(path)
             create_ignore_file(path, None)
    +        create_updated_file(path)
         except OSError:
             print("Creation of the directory %s failed" % path)

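For orientation, a standalone sketch (not part of the diff) of what create_updated_file writes for a freshly prepared dataset; the JIS path is only an example chosen to match the CrawlerLogs/ files added in this commit:

    import os

    path = "CrawlerLogs/JIS"                      # example dataset log directory
    os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, "updated.txt"), "w") as file:
        file.write("0\n")                         # zero days since the last update
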
modules/crawler/ProcessedData/JIS/ignore.txt

    ignore.txt

modules/crawler/ProcessedData/KOLOBEZKY/ignore.txt

    ignore.txt
    OD_ZCU_KOLOBEZKY_08_2019.CSV
    OD_ZCU_KOLOBEZKY_00_2019.CSV
    OD_ZCU_KOLOBEZKY_07_2019.CSV
    OD_ZCU_KOLOBEZKY_06_2019.CSV

modules/crawler/ProcessedData/WIFI/ignore.txt

    ignore.txt

modules/crawler/ResetDataset.py

     from Utilities import FolderProcessor

     # Path to crawled data
    -CRAWLED_DATA_PATH = "../CrawledData/"
    +CRAWLED_DATA_PATH = "CrawledData/"
     # Path to processed data
    -PROCESSED_DATA_PATH = "../ProcessedData/"
    +PROCESSED_DATA_PATH = "ProcessedData/"
     # Path to crawler logs
    -CRAWLER_LOGS_PATH = "../CrawlerLogs/"
    +CRAWLER_LOGS_PATH = "CrawlerLogs/"
     # Path to dataset configuration files
    -CONFIG_FILES_PATH = "../DatasetConfigs"
    +CONFIG_FILES_PATH = "DatasetConfigs"


     def create_ignore_file(path, text):
    ...
                 file.write(text + "\n")


    +def create_updated_file(path):
    +    """
    +    Creates updated file
    +    Args:
    +        path: path to directory for creating updated.txt
    +    """
    +    with open(path + "/updated.txt", "w") as file:
    +        file.write(str(0) + "\n")
    +
    +
     def reset_dataset(dataset_name):
         """
         Resets all saved data in dataset except config and implementation
    ...
         path = CRAWLER_LOGS_PATH + dataset_name + "/"
         FolderProcessor.clean_folder(path)
         create_ignore_file(path, None)
    +    create_updated_file(path)


     def reset_all_datasets():

modules/crawler/main.py

    import Pipeline
    import os

    # Path to configuration files
    CONFIG_FILES_PATH = "DatasetConfigs/"


    def run_pipeline_for_all_datasets():
        """
        Runs the whole DataScript pipeline for every dataset that has an existing configuration file
        """
        files_in_dir = os.listdir(CONFIG_FILES_PATH)

        for file in files_in_dir:
            name = file.split('.')
            Pipeline.run_full_pipeline(name[0])


    def run_pipeline_for_one_dataset(dataset_name):
        """
        Runs the whole DataScript pipeline for only one dataset

        Args:
            dataset_name: name of a dataset that has an existing configuration file
        """
        Pipeline.run_full_pipeline(dataset_name)


    run_pipeline_for_all_datasets()

Re #7965
Implement a cron script that runs the dataset update according to the config