Revision 728f8c5d
Added by Petr Hlaváč almost 5 years ago

hot fix

modules/crawler/CrawledData/JIS/ignore.txt

    ignore.txt
    OD_ZCU_JIS_10_2019.CSV
    OD_ZCU_JIS_03_2020.CSV
    OD_ZCU_JIS_02_2020.CSV
    OD_ZCU_JIS_00_2019.CSV
    OD_ZCU_JIS_08_2019.CSV
    OD_ZCU_JIS_12_2019.CSV
    OD_ZCU_JIS_09_2019.CSV
    OD_ZCU_JIS_01_2020.CSV
    OD_ZCU_JIS_06_2019.CSV
    OD_ZCU_JIS_11_2019.CSV
    OD_ZCU_JIS_07_2019.CSV

modules/crawler/CrawledData/KOLOBEZKY/ignore.txt

    ignore.txt
    OD_ZCU_KOLOBEZKY_00_2019.CSV
    OD_ZCU_KOLOBEZKY_06_2019.CSV
    OD_ZCU_KOLOBEZKY_07_2019.CSV
    OD_ZCU_KOLOBEZKY_08_2019.CSV

modules/crawler/CrawledData/WIFI/ignore.txt

    ignore.txt
    OD_ZCU_WIFI_07_2019.CSV
    OD_ZCU_WIFI_00_2019.CSV
    OD_ZCU_WIFI_06_2019.CSV
    OD_ZCU_WIFI_08_2019.CSV

modules/crawler/CrawlerLogs/JIS/ignore.txt

    https://openstore.zcu.cz/OD_ZCU_00_2019/OD_ZCU_JIS_00_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_01_2020/OD_ZCU_JIS_01_2020_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_02_2020/OD_ZCU_JIS_02_2020_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_03_2020/OD_ZCU_JIS_03_2020_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_06_2019/OD_ZCU_JIS_06_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_07_2019/OD_ZCU_JIS_07_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_08_2019/OD_ZCU_JIS_08_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_09_2019/OD_ZCU_JIS_09_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_10_2019/OD_ZCU_JIS_10_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_11_2019/OD_ZCU_JIS_11_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_12_2019/OD_ZCU_JIS_12_2019_CSV.zip

modules/crawler/CrawlerLogs/KOLOBEZKY/ignore.txt

    https://openstore.zcu.cz/OD_ZCU_00_2019/OD_ZCU_KOLOBEZKY_00_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_06_2019/OD_ZCU_KOLOBEZKY_06_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_07_2019/OD_ZCU_KOLOBEZKY_07_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_08_2019/OD_ZCU_KOLOBEZKY_08_2019_CSV.zip

modules/crawler/CrawlerLogs/WIFI/ignore.txt

    https://openstore.zcu.cz/OD_ZCU_00_2019/OD_ZCU_WIFI_00_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_06_2019/OD_ZCU_WIFI_06_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_07_2019/OD_ZCU_WIFI_07_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_08_2019/OD_ZCU_WIFI_08_2019_CSV.zip

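The CrawlerLogs ignore.txt files above record archive URLs that have already been downloaded. A minimal sketch of how a crawler could use such a file to skip previously fetched links; the helper name and its usage are assumptions, since the project's own helpers in Utilities/Crawler/BasicCrawlerFunctions.py are not shown in this revision:

    def filter_downloaded(urls, ignore_file_path):
        """Return only the URLs not yet listed in ignore.txt (hypothetical helper)."""
        with open(ignore_file_path, "r") as file:
            already_downloaded = {line.strip() for line in file if line.strip()}
        return [url for url in urls if url not in already_downloaded]
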
modules/crawler/DatasetConfigs/JIS.yaml

Unchanged context (lines 8-10 on both sides):

    update-period: 24
    # pozice jednotlivych zarizeni, ktera jsou v datasetu
    devices:

Old lines 11-194 are removed and new lines 11-194 are added. Both blocks contain the same 46 device entries, each of the form

    - <device name>:
        x: UNKNOWN!
        y: UNKNOWN!

Removed block, in its original order (note the two mis-encoded names "US 005 - z?vora vjezd" and "US 005 - m?? vjezd"): US 005 - z?vora vjezd, STUD_KL20, US 005 - m?? vjezd, Menza4-kasa3, NTIS-BUFET, Parkoviste-vjezd, Menza4-kasa1, Menza1-kasa-p, Menza4-kasa2, L2, EP-BUFET, KolaBory-vnitrni, Menza1-kasa-l, Zavora-FEL, Zavora-NTIS-vjezd, L-Posilovna, STUD_ST407, M16, Zavora-Kaplirova, Zavora-FDU, KolaBory-vnejsi, STUD_VC53, Menza4-kasa4, KL-Posilovna, VC-VYJEZD, EXT/kola-B, Menza4-kasa5, L1L2-vchod, VC-VJEZD, A3, STUD_UB113, STUD_CHEB, STUD_PRA1, L1, STUD_UB211, A1, EXT/kola, STUD_KL87, UV1-Bufet, M14, Zavora-NTIS-vyjezd, B3-kolarna, B3-LEVY, MenzaKL-vydej, A2-Hlavni vchod, Parkoviste-vyjezd.

Added block, in its new order, with the encoding of the two names repaired ("US 005 - závora vjezd", "US 005 - mříž vjezd"): Menza4-kasa5, Zavora-FDU, STUD_PRA1, B3-LEVY, KolaBory-vnejsi, VC-VYJEZD, L-Posilovna, A3, Menza4-kasa1, US 005 - závora vjezd, EP-BUFET, Zavora-FEL, US 005 - mříž vjezd, STUD_VC53, NTIS-BUFET, Zavora-NTIS-vjezd, EXT/kola, VC-VJEZD, Zavora-Kaplirova, M16, Menza1-kasa-p, STUD_CHEB, M14, L2, STUD_UB113, B3-kolarna, L1, Menza1-kasa-l, Zavora-NTIS-vyjezd, Menza4-kasa4, MenzaKL-vydej, Parkoviste-vjezd, KL-Posilovna, A1, KolaBory-vnitrni, EXT/kola-B, A2-Hlavni vchod, STUD_ST407, STUD_KL87, Menza4-kasa2, L1L2-vchod, Menza4-kasa3, Parkoviste-vyjezd, STUD_KL20, UV1-Bufet, STUD_UB211.
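
All coordinates in the regenerated config remain UNKNOWN! placeholders. As a sketch only, a filled-in entry would keep the same key layout; the numeric values below are hypothetical and not taken from this revision:

    devices:
        - Zavora-NTIS-vjezd:
            x: 49.72    # hypothetical coordinate, not in the revision
            y: 13.35    # hypothetical coordinate, not in the revision
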
modules/crawler/PrepareNewDataset.py

    import os

    # Path to crawled data
    CRAWLED_DATA_PATH = "../CrawledData/"
    # Path to processed data
    PROCESSED_DATA_PATH = "../ProcessedData/"
    # Path to crawler logs
    CRAWLER_LOGS_PATH = "../CrawlerLogs/"
    # Path for DatasetCrawlers implementations
    CRAWLER_PROGRAM_PATH = "../DatasetCrawler"
    # Path for DatasetProcessors implementations
    PROCESSOR_PROGRAM_PATH = "../DatasetProcessing"
    # Path to dataset configuration files
    CONFIG_FILES_PATH = "../DatasetConfigs"


    def create_default_config_file(dataset_name):
        """
        Creates default config file

        Args:
            dataset_name: Name of newly created dataset
        """
        with open(CONFIG_FILES_PATH + "/" + dataset_name + ".yaml", "w") as file:
            file.write("# jmeno datasetu, pod kterym bude zobrazen v aplikaci\n")
            file.write("dataset-name: " + dataset_name + "\n")
            file.write("# root slozka, ktera obsahuje odkazy na dataset\n")
            file.write("url: ZDE VLOZTE URL\n")
            file.write("# volitelny parameter, ktery specifikuje vzor jmrna datasetu, ktera se budou stahovat\n")
            file.write("regex: ZDE VLOZTE REGEX\n")
            file.write("# volitelny parametr, ktery udava jak casto se budou hledat nove datasety, pokud prazdne, "
                       "tak defaultni hodnota (dny)\n")
            file.write("update-period: ZDE VLOZTE HODNOTU\n")
            file.write("# pozice jednotlivych zarizeni, ktera jsou v datasetu\n")
            file.write("devices:\n")


    def create_default_processor(dataset_name):
        """
        Creates default processor for dataset

        Args:
            dataset_name: Name of newly created dataset
        """
        with open(PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "Processor.py", "w") as file:
            file.write("from Utilities.CSV import CSVDataLine, CSVutils")
            file.write("\n")
            file.write("\n")
            file.write("def process_file(filename):\n")
            file.write("    \"\"\"\n")
            file.write("    Method that take path to crawled file and outputs date dictionary using method:\n")
            file.write("    CSVutils.export_data_to_csv(filename, date_dict)\n")
            file.write("    Date dictionary is a dictionary where keys are dates in format ddmmYYYYhh (0804201815)\n")
            file.write("    and value is dictionary where keys devices (specified in configuration file)\n")
            file.write("    and value is CSVDataLine.CSVDataLine with device,date and occurrence\n")
            file.write("\n")
            file.write("    Args:\n")
            file.write("    filename: name of processed file\n")
            file.write("\n")
            file.write("    Returns:\n")
            file.write("    False if not implemented\n")
            file.write("    True when implemented\n")
            file.write("    \"\"\"\n")
            file.write("    print(\"You must implements process_file method first!\")\n")
            file.write("    #CSVutils.export_data_to_csv(filename, date_dict)\n")
            file.write("    return False\n")


    def create_default_crawler(dataset_name):
        """
        Creates default crawler for dataset

        Args:
            dataset_name: Name of newly created dataset
        """

        with open(CRAWLER_PROGRAM_PATH + "/" + dataset_name + "Crawler.py", "w") as file:
            file.write("# Path to crawled data\n")
            file.write("CRAWLED_DATA_PATH = \"CrawledData/\"\n")
            file.write("\n")
            file.write("\n")
            file.write("def crawl(config):\n")
            file.write("    \"\"\"\n")
            file.write("    Implement crawl method that downloads new data to path_for_files\n")
            file.write("    For keeping the project structure\n")
            file.write("    url , regex, and dataset_name from config\n")
            file.write("    You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py\n")
            file.write("\n")
            file.write("    Args:\n")
            file.write("    config: loaded configuration file of dataset\n")
            file.write("    \"\"\"\n")
            file.write("    dataset_name = config[\"dataset-name\"]\n")
            file.write("    url = config['url']\n")
            file.write("    regex = config['regex']\n")
            file.write("    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'\n")
            file.write("    print(\"You must implements Crawl method first!\")\n")


    def create_ignore_file(path, text):
        """
        Creates ignore file
        Args:
            path: path to directory for creating ignore.txt
            text: text that will be on first line of ignore.txt can be None
        """
        with open(path + "/ignore.txt", "w") as file:
            if text is not None:
                file.write(text + "\n")


    def prepare_dataset_structure(dataset_name):
        """
        Prepares folders for new dataset
        Args:
            dataset_name: Name of newly created dataset
        """
        jump_folder = "../"

        # create folder for crawled data
        try:
            path = CRAWLED_DATA_PATH + dataset_name
            os.mkdir(path)
            create_ignore_file(path, "ignore.txt")
        except os.error as e:
            print(e)
            print("Creation of the directory %s failed" % path)

        # create folder for processed data
        try:
            path = PROCESSED_DATA_PATH + dataset_name
            os.mkdir(path)
            create_ignore_file(path, "ignore.txt")
        except OSError:
            print("Creation of the directory %s failed" % path)

        # create folder for crawler logs
        try:
            path = CRAWLER_LOGS_PATH + dataset_name
            os.mkdir(path)
            create_ignore_file(path, None)
        except OSError:
            print("Creation of the directory %s failed" % path)

        create_default_crawler(dataset_name)
        create_default_processor(dataset_name)
        create_default_config_file(dataset_name)


    prepare_dataset_structure("TEST")

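The generated Processor stub above only documents a contract. A minimal sketch of a filled-in process_file under stated assumptions: the crawled CSV row layout (device;ddmmYYYYhh;...) and the CSVDataLine.CSVDataLine(device, date, occurrence) constructor with an occurrence attribute are both assumptions; only the ddmmYYYYhh date keys and the export call come from the stub's docstring.

    from Utilities.CSV import CSVDataLine, CSVutils


    def process_file(filename):
        date_dict = {}
        with open(filename, "r") as file:
            for line in file:
                # assumed row layout: device;ddmmYYYYhh;...
                device, date = line.strip().split(";")[:2]
                devices = date_dict.setdefault(date, {})
                if device not in devices:
                    # assumed constructor signature (device, date, occurrence)
                    devices[device] = CSVDataLine.CSVDataLine(device, date, 0)
                devices[device].occurrence += 1  # assumed attribute name
        CSVutils.export_data_to_csv(filename, date_dict)
        return True
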
modules/crawler/ProcessedData/KOLOBEZKY/ignore.txt

    ignore.txt
    OD_ZCU_KOLOBEZKY_08_2019.CSV
    OD_ZCU_KOLOBEZKY_00_2019.CSV
    OD_ZCU_KOLOBEZKY_07_2019.CSV
    OD_ZCU_KOLOBEZKY_06_2019.CSV

modules/crawler/RemoveDataset.py

    import os
    import shutil

    # Path to crawled data
    CRAWLED_DATA_PATH = "CrawledData/"
    # Path to processed data
    PROCESSED_DATA_PATH = "ProcessedData/"
    # Path to crawler logs
    CRAWLER_LOGS_PATH = "CrawlerLogs/"
    # Path to dataset configuration files
    CONFIG_FILES_PATH = "DatasetConfigs"
    # Path for DatasetCrawlers implementations
    CRAWLER_PROGRAM_PATH = "DatasetCrawler"
    # Path for DatasetProcessors implementations
    PROCESSOR_PROGRAM_PATH = "DatasetProcessing"


    def remove_dataset(dataset_name):
        """
        Remove dataset
        Args:
            dataset_name: name of dataset that has existing configuration file
        """
        shutil.rmtree(CRAWLED_DATA_PATH + dataset_name + "/")
        shutil.rmtree(PROCESSED_DATA_PATH + dataset_name + "/")
        shutil.rmtree(CRAWLER_LOGS_PATH + dataset_name + "/")

        os.remove(CONFIG_FILES_PATH + "/" + dataset_name + ".yaml")
        os.remove(CRAWLER_PROGRAM_PATH + "/" + dataset_name + "Crawler.py")
        os.remove(PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "Processor.py")

        print("Dataset: " + dataset_name + " removed")

    remove_dataset("TEST")

modules/crawler/RemoveDatasetDatabase.py

    from Utilities.Database import DatabaseLoader


    def remove_dataset_database(dataset_name):
        """
        Removes dataset entries from database
        Args:
            dataset_name: name of dataset that has existing configuration file
        """
        # Creating connection
        mydb = DatabaseLoader.create_database_connection()

        # collection where are specified aviable datasets
        collection_datasets = mydb[DatabaseLoader.MONGODB_DATASET_COLLECTION]

        collection_datasets.delete_one({"name": dataset_name})
        print("Removing record from DATASETS collection")


        # Retrieve list of all collections
        collections = mydb.list_collection_names()

        # Drop of all collections
        for name in collections:
            if name.startswith(dataset_name):
                mydb[name].drop()
                print("Dropping: " + name)

        print("Database Cleaned")


    remove_dataset_database("KOLOBEZKY")

modules/crawler/ResetDatabaseData.py

    from Utilities.Database import DatabaseLoader


    def clean_database():
        """
        Deletes all collections from database
        """
        # Create connection
        mydb = DatabaseLoader.create_database_connection()

        # Retrieve list of all collections
        collections = mydb.list_collection_names()

        # Drop of all collections
        for name in collections:
            mydb[name].drop()

        print("Database Cleaned")


    clean_database()

modules/crawler/ResetDataset.py

    import os
    from Utilities import FolderProcessor

    # Path to crawled data
    CRAWLED_DATA_PATH = "../CrawledData/"
    # Path to processed data
    PROCESSED_DATA_PATH = "../ProcessedData/"
    # Path to crawler logs
    CRAWLER_LOGS_PATH = "../CrawlerLogs/"
    # Path to dataset configuration files
    CONFIG_FILES_PATH = "../DatasetConfigs"


    def create_ignore_file(path, text):
        """
        Creates ignore file
        Args:
            path: path to directory for creating ignore.txt
            text: text that will be on first line of ignore.txt can be None
        """
        with open(path + "/ignore.txt", "w") as file:
            if text is not None:
                file.write(text + "\n")


    def reset_dataset(dataset_name):
        """
        Resets all saved data in dataset except config and implementation
        Args:
            dataset_name: name of dataset that has existing configuration file
        """
        path = CRAWLED_DATA_PATH + dataset_name + "/"
        FolderProcessor.clean_folder(path)
        create_ignore_file(path, "ignore.txt")

        path = PROCESSED_DATA_PATH + dataset_name + "/"
        FolderProcessor.clean_folder(path)
        create_ignore_file(path, "ignore.txt")

        path = CRAWLER_LOGS_PATH + dataset_name + "/"
        FolderProcessor.clean_folder(path)
        create_ignore_file(path, None)


    def reset_all_datasets():
        """
        Resets all saved data in all datasets with config file except configs and implementation
        """
        datasets = os.listdir(CONFIG_FILES_PATH)

        for dataset in datasets:
            reset_dataset(dataset.split('.')[0])


    reset_all_datasets()

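FolderProcessor.clean_folder is not part of this revision. A plausible implementation, consistent with how reset_dataset uses it (empty the directory, then recreate ignore.txt inside it), might look like the following sketch; this is an assumption about the project's own helper, not its actual code:

    import os
    import shutil


    def clean_folder(path):
        """Remove everything inside path, keeping the directory itself (hypothetical)."""
        for entry in os.listdir(path):
            full = os.path.join(path, entry)
            if os.path.isdir(full):
                shutil.rmtree(full)  # remove nested folders
            else:
                os.remove(full)      # remove files, including the old ignore.txt
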
modules/crawler/Scripts/PrepareNewDataset.py

    (identical to modules/crawler/PrepareNewDataset.py above)

modules/crawler/Scripts/RemoveDataset.py

    import os
    import shutil

    # Path to crawled data
    CRAWLED_DATA_PATH = "../CrawledData/"
    # Path to processed data
    PROCESSED_DATA_PATH = "../ProcessedData/"
    # Path to crawler logs
    CRAWLER_LOGS_PATH = "../CrawlerLogs/"
    # Path to dataset configuration files
    CONFIG_FILES_PATH = "../DatasetConfigs"
    # Path for DatasetCrawlers implementations
    CRAWLER_PROGRAM_PATH = "../DatasetCrawler"
    # Path for DatasetProcessors implementations
    PROCESSOR_PROGRAM_PATH = "../DatasetProcessing"


    def remove_dataset(dataset_name):
        """
        Remove dataset
        Args:
            dataset_name: name of dataset that has existing configuration file
        """
        shutil.rmtree(CRAWLED_DATA_PATH + dataset_name + "/")
        shutil.rmtree(PROCESSED_DATA_PATH + dataset_name + "/")
        shutil.rmtree(CRAWLER_LOGS_PATH + dataset_name + "/")

        os.remove(CONFIG_FILES_PATH + "/" + dataset_name + ".yaml")
        os.remove(CRAWLER_PROGRAM_PATH + "/" + dataset_name + "Crawler.py")
        os.remove(PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "Processor.py")

        print("Dataset: " + dataset_name + " removed");

    remove_dataset("TEST");

modules/crawler/Scripts/RemoveDatasetDatabase.py

    from Utilities.Database import DatabaseLoader


    def remove_dataset_database(dataset_name):
        """
        Removes dataset entries from database
        Args:
            dataset_name: name of dataset that has existing configuration file
        """
        # Creating connection
        mydb = DatabaseLoader.create_database_connection();

        # collection where are specified aviable datasets
        collection_datasets = mydb[DatabaseLoader.MONGODB_DATASET_COLLECTION]

        collection_datasets.delete_one({"name": dataset_name})
        print("Removing record from DATASETS collection")


        # Retrieve list of all collections
        collections = mydb.list_collection_names()

        # Drop of all collections
        for name in collections:
            if name.startswith(dataset_name):
                mydb[name].drop()
                print("Dropping: " + name)

        print("Database Cleaned")


    remove_dataset_database("KOLOBEZKY")

modules/crawler/Scripts/ResetDatabaseData.py

    from Utilities.Database import DatabaseLoader


    def clean_database():
        """
        Deletes all collections from database
        """
        # Create connection
        mydb = DatabaseLoader.create_database_connection()

        # Retrieve list of all collections
        collections = mydb.list_collection_names()

        # Drop of all collections
        for name in collections:
            mydb[name].drop()

        print("Database Cleaned")


    clean_database();

modules/crawler/Scripts/ResetDataset.py

    (identical to modules/crawler/ResetDataset.py above)

modules/crawler/Utilities/Database/DatabaseLoader.py

     import pymongo

     # specify mongodb connection
    -MONGODB_CONNECTION = "mongodb://localhost:27017/"
    +MONGODB_CONNECTION = "mongodb://root:root@database"
     # mongodb account name
     MONGODB_ACC_NAME = "root"
     # mongodb account password

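The connection string now points at a host named "database" with root credentials instead of a local MongoDB, which suggests a containerized setup. create_database_connection itself is not shown in this revision; a minimal sketch of how the constant is presumably consumed, with the database name as a hypothetical placeholder:

    import pymongo

    MONGODB_CONNECTION = "mongodb://root:root@database"


    def create_database_connection():
        # connect to the MongoDB service reachable as "database"
        # (e.g. a docker-compose service name)
        client = pymongo.MongoClient(MONGODB_CONNECTION)
        return client["open-data-db"]  # hypothetical database name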