import os

# Path to crawled data
CRAWLED_DATA_PATH = "../CrawledData/"
# Path to processed data
PROCESSED_DATA_PATH = "../ProcessedData/"
# Path to crawler logs
CRAWLER_LOGS_PATH = "../CrawlerLogs/"
# Path for DatasetCrawler implementations
CRAWLER_PROGRAM_PATH = "../DatasetCrawler"
# Path for DatasetProcessing implementations
PROCESSOR_PROGRAM_PATH = "../DatasetProcessing"
# Path to dataset configuration files
CONFIG_FILES_PATH = "../DatasetConfigs"
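
# Expected sibling directory layout relative to this script, derived from the
# constants above and the functions below:
#   ../CrawledData/        downloaded raw files, one subfolder per dataset
#   ../ProcessedData/      processed output, one subfolder per dataset
#   ../CrawlerLogs/        ignore.txt and updated.txt, one subfolder per dataset
#   ../DatasetCrawler/     generated <dataset>Crawler.py stubs
#   ../DatasetProcessing/  generated <dataset>Processor.py stubs
#   ../DatasetConfigs/     generated <dataset>.yaml config files
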
def create_default_config_file(dataset_name):
    """
    Creates a default config file.

    Args:
        dataset_name: Name of the newly created dataset
    """
    with open(CONFIG_FILES_PATH + "/" + dataset_name + ".yaml", "w") as file:
        file.write("# dataset name, under which it will be shown in the application\n")
        file.write("dataset-name: " + dataset_name + "\n")
        file.write("# root folder that contains links to the dataset\n")
        file.write("url: INSERT URL HERE\n")
        file.write("# optional parameter specifying the name pattern of the dataset files to download\n")
        file.write("regex: INSERT REGEX HERE\n")
        file.write("# optional parameter specifying how often to look for new datasets; if empty, "
                   "the default value is used (days)\n")
        file.write("update-period: INSERT VALUE HERE\n")
        file.write("# positions of the individual devices present in the dataset\n")
        file.write("devices:\n")
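

# Illustrative only: the generated YAML can be read back into the dict that
# crawl(config) expects. This sketch assumes PyYAML is installed; the helper
# name load_config_example is hypothetical, not part of the project.
def load_config_example(dataset_name):
    import yaml  # assumption: PyYAML is available in the project environment
    with open(CONFIG_FILES_PATH + "/" + dataset_name + ".yaml", "r") as file:
        return yaml.safe_load(file)
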
def create_default_processor(dataset_name):
    """
    Creates a default processor for the dataset.

    Args:
        dataset_name: Name of the newly created dataset
    """
    with open(PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "Processor.py", "w") as file:
        file.write("from Utilities.CSV import CSVDataLine, CSVutils\n")
        file.write("\n")
        file.write("\n")
        file.write("def process_file(filename):\n")
        file.write("    \"\"\"\n")
        file.write("    Method that takes a path to a crawled file and outputs a date dictionary using:\n")
        file.write("    CSVutils.export_data_to_csv(filename, date_dict)\n")
        file.write("    The date dictionary maps dates in the format ddmmYYYYhh (e.g. 0804201815)\n")
        file.write("    to dictionaries whose keys are devices (specified in the configuration file)\n")
        file.write("    and whose values are CSVDataLine.CSVDataLine instances holding device, date and occurrence\n")
        file.write("\n")
        file.write("    Args:\n")
        file.write("        filename: name of the processed file\n")
        file.write("\n")
        file.write("    Returns:\n")
        file.write("        False if not implemented\n")
        file.write("        True when implemented\n")
        file.write("    \"\"\"\n")
        file.write("    print(\"You must implement the process_file method first!\")\n")
        file.write("    # CSVutils.export_data_to_csv(filename, date_dict)\n")
        file.write("    return False\n")
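

# Illustrative only: a self-contained sketch of an implemented process_file.
# _ExampleDataLine stands in for CSVDataLine.CSVDataLine, whose real
# constructor is not shown in this module, and the "device;date" input line
# format is an assumption made purely for illustration.
class _ExampleDataLine:
    def __init__(self, device, date, occurrence):
        self.device = device
        self.date = date
        self.occurrence = occurrence


def _example_process_file(filename):
    # date_dict maps date (ddmmYYYYhh) -> {device -> data line}
    date_dict = {}
    with open(filename, "r") as file:
        for line in file:
            device, date = line.strip().split(";")
            devices = date_dict.setdefault(date, {})
            if device in devices:
                devices[device].occurrence += 1
            else:
                devices[device] = _ExampleDataLine(device, date, 1)
    return date_dict
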
def create_default_crawler(dataset_name):
    """
    Creates a default crawler for the dataset.

    Args:
        dataset_name: Name of the newly created dataset
    """
    with open(CRAWLER_PROGRAM_PATH + "/" + dataset_name + "Crawler.py", "w") as file:
        file.write("# Path to crawled data\n")
        file.write("CRAWLED_DATA_PATH = \"CrawledData/\"\n")
        file.write("\n")
        file.write("\n")
        file.write("def crawl(config):\n")
        file.write("    \"\"\"\n")
        file.write("    Implement a crawl method that downloads new data to path_for_files.\n")
        file.write("    To keep the project structure, use url, regex, and dataset_name from config.\n")
        file.write("    You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py\n")
        file.write("\n")
        file.write("    Args:\n")
        file.write("        config: loaded configuration file of dataset\n")
        file.write("    \"\"\"\n")
        file.write("    dataset_name = config[\"dataset-name\"]\n")
        file.write("    url = config['url']\n")
        file.write("    regex = config['regex']\n")
        file.write("    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'\n")
        file.write("    print(\"You must implement the crawl method first!\")\n")
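

# Illustrative only: a standard-library sketch of what an implemented crawl
# might do. A real implementation would likely use the helpers in
# Utilities/Crawler/BasicCrawlerFunctions.py, whose API is not shown here;
# the link-matching and file-naming logic below are assumptions.
def _example_crawl(config):
    import re
    import urllib.request

    url = config["url"]
    regex = config["regex"]
    path_for_files = "CrawledData/" + config["dataset-name"] + "/"

    # download the root page and fetch every link matching the configured regex
    page = urllib.request.urlopen(url).read().decode("utf-8")
    for link in re.findall(regex, page):
        urllib.request.urlretrieve(link, path_for_files + os.path.basename(link))
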
def create_ignore_file(path, text):
    """
    Creates an ignore file.

    Args:
        path: path to the directory in which to create ignore.txt
        text: text for the first line of ignore.txt; can be None
    """
    with open(path + "/ignore.txt", "w") as file:
        if text is not None:
            file.write(text + "\n")


def create_updated_file(path):
    """
    Creates an updated file.

    Args:
        path: path to the directory in which to create updated.txt
    """
    with open(path + "/updated.txt", "w") as file:
        file.write(str(0) + "\n")

def prepare_dataset_structure(dataset_name):
    """
    Prepares folders for a new dataset.

    Args:
        dataset_name: Name of the newly created dataset
    """
    # create folder for crawled data
    try:
        path = CRAWLED_DATA_PATH + dataset_name
        os.mkdir(path)
        create_ignore_file(path, "ignore.txt")
    except OSError as e:
        print(e)
        print("Creation of the directory %s failed" % path)

    # create folder for processed data
    try:
        path = PROCESSED_DATA_PATH + dataset_name
        os.mkdir(path)
        create_ignore_file(path, "ignore.txt")
    except OSError:
        print("Creation of the directory %s failed" % path)

    # create folder for crawler logs
    try:
        path = CRAWLER_LOGS_PATH + dataset_name
        os.mkdir(path)
        create_ignore_file(path, None)
        create_updated_file(path)
    except OSError:
        print("Creation of the directory %s failed" % path)

    create_default_crawler(dataset_name)
    create_default_processor(dataset_name)
    create_default_config_file(dataset_name)


if __name__ == "__main__":
    prepare_dataset_structure("TEST")