
Revision 04a2b5a4

Added by Petr Hlaváč about 4 years ago

Re #7939
- added documentation for methods and classes
- fixed errors in variable names
- added information for the generated scripts

View differences:

python-module/DatasetConfigs/JIS.yaml
192 192
      x: UNKNOWN!
193 193
      y: UNKNOWN!
194 194

  
195
  - US 005 - závora vjezd:
196
      x: UNKNOWN!
197
      y: UNKNOWN!
198

  
199
  - US 005 - m?? vjezd:
200
      x: UNKNOWN!
201
      y: UNKNOWN!
202

  
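These UNKNOWN! placeholders are what ConfigureFunctions.update_configuration (further down in this revision) appends for devices that appear in the data but are missing from the configuration; the x and y map coordinates are meant to be filled in by hand before the data is loaded into the database. As a rough sketch of the resulting structure, assuming the entries sit under a top-level devices key and with a shortened name and invented coordinates for illustration, such an entry parses with the same PyYAML call the loader uses:

import yaml

# Hypothetical completed device entry; the name is shortened and the
# coordinates are invented for illustration only.
EXAMPLE = """
devices:
  - US 005 - vjezd:
      x: 13.3731
      y: 49.7231
"""

config = yaml.load(EXAMPLE, Loader=yaml.FullLoader)
print(config["devices"][0])
# {'US 005 - vjezd': {'x': 13.3731, 'y': 49.7231}}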
python-module/DatasetCrawler/JISCrawler.py
1 1
from Utilities import FolderProcessor
2 2
from Utilities.Crawler import BasicCrawler
3 3

  
4
# Path to crawled data
5
CRAWLED_DATA_PATH = "CrawledData/"
4 6

  
5
def crawl(config):
6 7

  
8
def crawl(config):
9
    """
10
    Implement crawl method that downloads new data to path_for_files
11
    For keeping the project structure
12
    url , regex, and dataset_name from config
13
    You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py
14

  
15
    Args:
16
        config: loaded configuration file of dataset
17
    """
7 18
    dataset_name = config["dataset-name"]
8 19
    url = config['url']
9 20
    regex = config['regex']
21
    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'
10 22

  
11 23
    first_level_links = BasicCrawler.get_all_links(url)
12 24
    filtered_first_level_links = BasicCrawler.filter_links(first_level_links, "^OD_ZCU")
......
24 36
            files.append(file_link)
25 37

  
26 38
    for file in files:
27
        BasicCrawler.download_file_from_url(file, "CrawledData/" + dataset_name + "/", dataset_name)
39
        BasicCrawler.download_file_from_url(file, path_for_files, dataset_name)
28 40

  
29
    FolderProcessor.unzip_all_csv_zip_files_in_folder("CrawledData/" + dataset_name + "/")
41
    FolderProcessor.unzip_all_csv_zip_files_in_folder(path_for_files)
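For reference, the crawl routine in this and the two crawlers below chains the helper functions named in the new docstring: collect links from the archive page, keep the ones matching the dataset regex, make them absolute, skip what was already downloaded, download the rest and unzip it. The middle of the function is elided from the hunk above, so the following is only a sketch of that traversal; it mirrors the calls that are visible (including the BasicCrawler import, even though this revision moves the helpers to BasicCrawlerFunctions.py) and the "^OD_ZCU" first-level filter.

from Utilities import FolderProcessor
from Utilities.Crawler import BasicCrawler

# Path to crawled data
CRAWLED_DATA_PATH = "CrawledData/"


def crawl(config):
    dataset_name = config["dataset-name"]
    url = config['url']
    regex = config['regex']
    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'

    # First level: archive folders (OD_ZCU_...), second level: files matching
    # the dataset regex.  This traversal is an assumption; the committed code
    # in the elided part of the hunk may differ.
    first_level_links = BasicCrawler.get_all_links(url)
    filtered_first_level_links = BasicCrawler.filter_links(first_level_links, "^OD_ZCU")
    absolute_first_level_links = BasicCrawler.create_absolute_links(filtered_first_level_links, url)

    files = []
    for link in absolute_first_level_links:
        second_level_links = BasicCrawler.get_all_links(link)
        filtered_second_level_links = BasicCrawler.filter_links(second_level_links, regex)
        for file_link in BasicCrawler.create_absolute_links(filtered_second_level_links, link):
            files.append(file_link)

    # Drop links recorded in CrawlerLogs/<dataset>/ignore.txt by earlier runs.
    files = BasicCrawler.remove_downloaded_links(files, dataset_name)

    for file in files:
        BasicCrawler.download_file_from_url(file, path_for_files, dataset_name)

    FolderProcessor.unzip_all_csv_zip_files_in_folder(path_for_files)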
python-module/DatasetCrawler/KOLOBEZKYCrawler.py
1 1
from Utilities import FolderProcessor
2 2
from Utilities.Crawler import BasicCrawler
3 3

  
4
# Path to crawled data
5
CRAWLED_DATA_PATH = "CrawledData/"
4 6

  
5
def crawl(config):
6 7

  
8
def crawl(config):
9
    """
10
    Implement crawl method that downloads new data to path_for_files
11
    For keeping the project structure
12
    url , regex, and dataset_name from config
13
    You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py
14

  
15
    Args:
16
        config: loaded configuration file of dataset
17
    """
7 18
    dataset_name = config["dataset-name"]
8 19
    url = config['url']
9 20
    regex = config['regex']
21
    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'
10 22

  
11 23
    first_level_links = BasicCrawler.get_all_links(url)
12 24
    filtered_first_level_links = BasicCrawler.filter_links(first_level_links, "^OD_ZCU")
python-module/DatasetCrawler/WIFICrawler.py
1 1
from Utilities import FolderProcessor
2 2
from Utilities.Crawler import BasicCrawler
3 3

  
4
# Path to crawled data
5
CRAWLED_DATA_PATH = "CrawledData/"
4 6

  
5
def crawl(config):
6 7

  
8
def crawl(config):
9
    """
10
    Implement crawl method that downloads new data to path_for_files
11
    For keeping the project structure
12
    url , regex, and dataset_name from config
13
    You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py
14

  
15
    Args:
16
        config: loaded configuration file of dataset
17
    """
7 18
    dataset_name = config["dataset-name"]
8 19
    url = config['url']
9 20
    regex = config['regex']
21
    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'
10 22

  
11 23
    first_level_links = BasicCrawler.get_all_links(url)
12 24
    filtered_first_level_links = BasicCrawler.filter_links(first_level_links, "^OD_ZCU")
......
24 36
            files.append(file_link)
25 37

  
26 38
    for file in files:
27
        BasicCrawler.download_file_from_url(file, "CrawledData/" + dataset_name + "/", dataset_name)
39
        BasicCrawler.download_file_from_url(file, path_for_files, dataset_name)
28 40

  
29
    FolderProcessor.unzip_all_csv_zip_files_in_folder("CrawledData/" + dataset_name + "/")
41
    FolderProcessor.unzip_all_csv_zip_files_in_folder(path_for_files)
python-module/DatasetProcessing/JISProcessor.py
3 3

  
4 4

  
5 5
def process_file(filename):
6

  
6
    """
7
    Method that take path to crawled file and outputs date dictionary using method:
8
    CSVutils.export_data_to_csv(filename, date_dict)
9
    Date dictionary is a dictionary where keys are dates in format ddmmYYYYhh (0804201815)
10
    and value is dictionary where keys devices (specified in configuration file)
11
    and value is CSVDataLine.CSVDataLine with device,date and occurrence
12

  
13
    Args:
14
        filename: name of processed file
15

  
16
    Returns:
17
        False if not implemented
18
        True when implemented
19
    """
7 20
    with open(filename, "r", encoding="utf-8") as file:
8 21

  
9 22
        date_dict = dict()
......
12 25

  
13 26
            array = line.split(";")
14 27

  
15
            date = DateFormating.date_time_formater(array[1][1:-1])
28
            date = DateFormating.date_time_formatter(array[1][1:-1])
16 29
            name = array[0][1:-1]
17 30
            occurence = array[2][:-1]
18 31

  
......
24 37
            else:
25 38
                date_dict[date][name] = CSVDataLine.CSVDataLine(name, date, int(occurence))
26 39

  
27
        CSVutils.export_data_to_csv(filename, date_dict)
40
    CSVutils.export_data_to_csv(filename, date_dict)
41
    return True
42

  
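The date dictionary described in the new docstring (here and in the two processors below) is keyed by ddmmYYYYhh strings, and each value maps a device name to one CSVDataLine. A minimal sketch of the shape handed to CSVutils.export_data_to_csv, with a device name and counts invented for illustration and assuming the project packages are importable:

from Utilities.CSV import CSVDataLine

# date_dict[date][device] -> CSVDataLine(device, date, occurrence)
date_dict = {
    "0804201815": {
        "US 005 - vjezd": CSVDataLine.CSVDataLine("US 005 - vjezd", "0804201815", 3),
    },
    "0804201816": {
        "US 005 - vjezd": CSVDataLine.CSVDataLine("US 005 - vjezd", "0804201816", 1),
    },
}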
python-module/DatasetProcessing/KOLOBEZKYProcessor.py
3 3

  
4 4

  
5 5
def process_file(filename):
6

  
6
    """
7
    Method that take path to crawled file and outputs date dictionary using method:
8
    CSVutils.export_data_to_csv(filename, date_dict)
9
    Date dictionary is a dictionary where keys are dates in format ddmmYYYYhh (0804201815)
10
    and value is dictionary where keys devices (specified in configuration file)
11
    and value is CSVDataLine.CSVDataLine with device,date and occurrence
12

  
13
    Args:
14
        filename: name of processed file
15

  
16
    Returns:
17
        False if not implemented
18
        True when implemented
19
    """
7 20
    with open(filename, "r") as file:
8 21

  
9 22
        date_dict = dict()
......
12 25

  
13 26
            array = line.split(";")
14 27

  
15
            date = DateFormating.date_time_formater(array[0][1:-1])
28
            date = DateFormating.date_time_formatter(array[0][1:-1])
16 29
            name = array[1][1:-1]
17 30

  
18 31
            if date not in date_dict:
......
23 36
            else:
24 37
                date_dict[date][name] = CSVDataLine.CSVDataLine(name, date, 1)
25 38

  
26
        CSVutils.export_data_to_csv(filename, date_dict)
39
    CSVutils.export_data_to_csv(filename, date_dict)
40
    return True
41

  
python-module/DatasetProcessing/WIFIProcessor.py
3 3

  
4 4

  
5 5
def process_file(filename):
6

  
6
    """
7
    Method that take path to crawled file and outputs date dictionary using method:
8
    CSVutils.export_data_to_csv(filename, date_dict)
9
    Date dictionary is a dictionary where keys are dates in format ddmmYYYYhh (0804201815)
10
    and value is dictionary where keys devices (specified in configuration file)
11
    and value is CSVDataLine.CSVDataLine with device,date and occurrence
12

  
13
    Args:
14
        filename: name of processed file
15

  
16
    Returns:
17
        False if not implemented
18
        True when implemented
19
    """
7 20
    with open(filename, "r", encoding="utf-8") as file:
8 21

  
9 22
        date_dict = dict()
......
12 25

  
13 26
            array = line.split(";")
14 27

  
15
            date = DateFormating.date_time_formater(array[4][1:-2])
28
            date = DateFormating.date_time_formatter(array[4][1:-2])
16 29
            name = array[1][1:-1]
17 30
            occurence = array[0]
18 31

  
......
24 37
            else:
25 38
                date_dict[date][name] = CSVDataLine.CSVDataLine(name, date, int(occurence))
26 39

  
27
        CSVutils.export_data_to_csv(filename, date_dict)
40
    CSVutils.export_data_to_csv(filename, date_dict)
41
    return True
42

  
python-module/Pipeline.py
1 1
from Utilities import FolderProcessor, ConfigureFunctions
2 2
from Utilities.Database import DatabaseLoader
3 3

  
4

  
5
CONFIG_FILES_PATH = "DatasetConfigs/"
4
# Path to crawled data
6 5
CRAWLED_DATA_PATH = "CrawledData/"
6
# Path to processed data
7 7
PROCESSED_DATA_PATH = "ProcessedData/"
8
# Path to dataset crawler implementations
8 9
CRAWLER_LIB_PATH = "DatasetCrawler."
10
# Path to dataset processor implementations
9 11
PROCESSOR_LIB_PATH = "DatasetProcessing."
10 12

  
11 13

  
12 14
def crawl_data(config):
15
    """
16
      Imports dataset crawler in DatasetCrawler/"dataset_name"Crawler.py
17
      runs crawler.
13 18

  
19
    Args:
20
        config: loaded configuration file of dataset
21
    """
14 22
    dataset_name = config["dataset-name"]
15 23

  
16
    my_function = __import__(CRAWLER_LIB_PATH + dataset_name + "Crawler",globals(),locals(),['crawl']).crawl
24
    my_function = __import__(CRAWLER_LIB_PATH + dataset_name + "Crawler", globals(), locals(), ['crawl']).crawl
17 25
    my_function(config)
18 26

  
19 27
    dataset_name += '/'
20 28

  
21 29

  
22 30
def process_data(dataset_name):
31
    """
32
    Goes trough every not processed file(not contained in CrawledData/"dataset_name"/ignore.txt)
33
    Imports dataset processor in DatasetProcessing/"dataset_name"Processor.py
34
    Runs processor on every file
35
    After successful processing updates ignore.txt
36

  
37
    Args:
38
        dataset_name: name of dataset that has existing configuration file
39
    """
23 40
    dataset_path = dataset_name + '/'
24 41

  
25
    process_file_func = __import__(PROCESSOR_LIB_PATH + dataset_name + "Processor",globals(),locals(),
42
    process_file_func = __import__(PROCESSOR_LIB_PATH + dataset_name + "Processor", globals(), locals(),
26 43
                                   ['process_file']).process_file
27 44

  
28
    # get all not processed files from dataset
29 45
    not_processed_files = FolderProcessor.list_of_all_files(CRAWLED_DATA_PATH + dataset_path)
30 46

  
31
    # process every file
32 47
    for not_processed_file in not_processed_files:
33
        # call processing for one file in dataset
34 48
        process_file_func(CRAWLED_DATA_PATH + dataset_path + not_processed_file)
35 49
        FolderProcessor.update_ignore_set(CRAWLED_DATA_PATH + dataset_path, not_processed_file)
36 50

  
37 51

  
38 52
def validate_process_data(config):
53
    """
54
    Function goes through newly processed data and checks theirs status
55

  
56
    Args:
57
        config: loaded configuration file of dataset
58

  
59
    Returns:
60
        boolean variable TRUE/FALSE.
61
        Data processed correctly - TRUE
62
        Wrong format or NEW unknown devices - FALSE
63
    """
39 64
    processed_devices_set = FolderProcessor.get_devices_set(PROCESSED_DATA_PATH + config["dataset-name"] + '/')
40
    unknown_devices_set = FolderProcessor.get_unknown_devices_set(config,processed_devices_set)
65
    unknown_devices_set = FolderProcessor.get_unknown_devices_set(config, processed_devices_set)
41 66
    unknown_devices_size = len(unknown_devices_set)
42 67

  
43 68
    if unknown_devices_size != 0:
44 69
        print("There is " + str(unknown_devices_size) + " unknown devies")
45
        ConfigureFunctions.update_configuration(CONFIG_FILES_PATH + config["dataset-name"] + ".yaml", unknown_devices_set)
70
        ConfigureFunctions.update_configuration(config["dataset-name"], unknown_devices_set)
46 71
        return False
47 72

  
48 73

  
49 74
def load_data_to_database(config):
50

  
75
    """
76
    Goes trough every not loaded file(not contained in ProcessedData/ignore.txt)
77
    loads data appends coordination from configurations
78
    and exports it into the database
79
    After successful exporting updates ignore.txt
80

  
81
    Args:
82
        config: loaded configuration file of dataset
83
    """
51 84
    dataset_name = config["dataset-name"]
52 85
    dataset_path = dataset_name + '/'
53 86

  
......
57 90
    # load every file
58 91
    for not_loaded_file in not_loaded_files:
59 92
        # load processed data
60
        processed_data = DatabaseLoader.get_data_from_file(PROCESSED_DATA_PATH + dataset_path + not_loaded_file,
61
                                                           config["devices"])
93
        processed_data = DatabaseLoader.get_data_from_file(not_loaded_file, config)
62 94
        # load processed data to database
63 95
        DatabaseLoader.load_data_to_database(dataset_name, processed_data)
64 96
        FolderProcessor.update_ignore_set(PROCESSED_DATA_PATH + dataset_path, not_loaded_file)
65 97

  
66 98

  
67 99
def run_full_pipeline(dataset_name):
68
    config = ConfigureFunctions.load_configuration(CONFIG_FILES_PATH + dataset_name)
100
    """
101
    Loads config file and starts full pipeline
102
    -crawl data
103
    -process data
104
    -load data to database
105

  
106
    Args:
107
        dataset_name: name of dataset that has existing configuration file
108
    """
109
    config = ConfigureFunctions.load_configuration(dataset_name)
69 110
    crawl_data(config)
70 111
    process_data(config["dataset-name"])
71 112

  
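crawl_data and process_data resolve the per-dataset module purely by name, so adding a dataset never touches Pipeline.py: the crawler for "JIS" is looked up as DatasetCrawler.JISCrawler and its crawl function is called with the loaded config. The __import__ call used above is roughly equivalent to this importlib sketch (standard library API; "JIS" is just an example name):

import importlib

CRAWLER_LIB_PATH = "DatasetCrawler."
dataset_name = "JIS"

# Roughly equivalent to:
#   __import__(CRAWLER_LIB_PATH + dataset_name + "Crawler", globals(), locals(), ['crawl']).crawl
crawler_module = importlib.import_module(CRAWLER_LIB_PATH + dataset_name + "Crawler")
crawl = crawler_module.crawl   # DatasetCrawler/JISCrawler.py must define crawl(config)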
python-module/Scripts/PrepareNewDataset.py
1 1
import os
2 2

  
3
# Path to crawled data
3 4
CRAWLED_DATA_PATH = "../CrawledData/"
5
# Path to processed data
4 6
PROCESSED_DATA_PATH = "../ProcessedData/"
7
# Path to crawler logs
5 8
CRAWLER_LOGS_PATH = "../CrawlerLogs/"
9
# Path for DatasetCrawlers implementations
6 10
CRAWLER_PROGRAM_PATH = "../DatasetCrawler"
11
# Path for DatasetProcessors implementations
7 12
PROCESSOR_PROGRAM_PATH = "../DatasetProcessing"
13
# Path to dataset configuration files
8 14
CONFIG_FILES_PATH = "../DatasetConfigs"
9 15

  
10 16

  
11 17
def create_default_config_file(dataset_name):
18
    """
19
    Creates default config file
12 20

  
21
    Args:
22
        dataset_name: Name of newly created dataset
23
    """
13 24
    with open(CONFIG_FILES_PATH + "/" + dataset_name + ".yaml", "w") as file:
14 25
        file.write("# jmeno datasetu, pod kterym bude zobrazen v aplikaci\n")
15 26
        file.write("dataset-name: " + dataset_name + "\n")
16 27
        file.write("# root slozka, ktera obsahuje odkazy na dataset\n")
17
        file.write("url: ZDE VLOZTE URL/\n")
28
        file.write("url: ZDE VLOZTE URL\n")
18 29
        file.write("# volitelny parameter, ktery specifikuje vzor jmrna datasetu, ktera se budou stahovat\n")
19 30
        file.write("regex: ZDE VLOZTE REGEX\n")
20 31
        file.write("# volitelny parametr, ktery udava jak casto se budou hledat nove datasety, pokud prazdne, "
......
25 36

  
26 37

  
27 38
def create_default_processor(dataset_name):
39
    """
40
    Creates default processor for dataset
41

  
42
    Args:
43
        dataset_name: Name of newly created dataset
44
    """
28 45
    with open(PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "Processor.py", "w") as file:
46
        file.write("from Utilities.CSV import CSVDataLine, CSVutils")
47
        file.write("\n")
48
        file.write("\n")
29 49
        file.write("def process_file(filename):\n")
50
        file.write("    \"\"\"\n")
51
        file.write("    Method that take path to crawled file and outputs date dictionary using method:\n")
52
        file.write("    CSVutils.export_data_to_csv(filename, date_dict)\n")
53
        file.write("    Date dictionary is a dictionary where keys are dates in format ddmmYYYYhh (0804201815)\n")
54
        file.write("    and value is dictionary where keys devices (specified in configuration file)\n")
55
        file.write("    and value is CSVDataLine.CSVDataLine with device,date and occurrence\n")
56
        file.write("\n")
57
        file.write("    Args:\n")
58
        file.write("    filename: name of processed file\n")
59
        file.write("\n")
60
        file.write("    Returns:\n")
61
        file.write("    False if not implemented\n")
62
        file.write("    True when implemented\n")
63
        file.write("    \"\"\"\n")
30 64
        file.write("    print(\"You must implements process_file method first!\")\n")
65
        file.write("    #CSVutils.export_data_to_csv(filename, date_dict)\n")
66
        file.write("    return False\n")
31 67

  
32 68

  
33 69
def create_default_crawler(dataset_name):
70
    """
71
    Creates default crawler for dataset
72

  
73
    Args:
74
        dataset_name: Name of newly created dataset
75
    """
34 76

  
35 77
    with open(CRAWLER_PROGRAM_PATH + "/" + dataset_name + "Crawler.py", "w") as file:
78
        file.write("# Path to crawled data\n")
79
        file.write("CRAWLED_DATA_PATH = \"CrawledData/\"\n")
80
        file.write("\n")
81
        file.write("\n")
36 82
        file.write("def crawl(config):\n")
83
        file.write("    \"\"\"\n")
84
        file.write("    Implement crawl method that downloads new data to path_for_files\n")
85
        file.write("    For keeping the project structure\n")
86
        file.write("    url , regex, and dataset_name from config\n")
87
        file.write("    You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py\n")
88
        file.write("\n")
89
        file.write("    Args:\n")
90
        file.write("        config: loaded configuration file of dataset\n")
91
        file.write("    \"\"\"\n")
92
        file.write("    dataset_name = config[\"dataset-name\"]\n")
93
        file.write("    url = config['url']\n")
94
        file.write("    regex = config['regex']\n")
95
        file.write("    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'\n")
37 96
        file.write("    print(\"You must implements Crawl method first!\")\n")
38 97

  
39 98

  
40
def create_ignore_file(path,text):
41

  
99
def create_ignore_file(path, text):
100
    """
101
    Creates ignore file
102
    Args:
103
        path: path to directory for creating ignore.txt
104
        text: text that will be on first line of ignore.txt can be None
105
    """
42 106
    with open(path + "/ignore.txt", "w") as file:
43 107
        if text is not None:
44 108
            file.write(text + "\n")
45 109

  
46 110

  
47 111
def prepare_dataset_structure(dataset_name):
112
    """
113
    Prepares folders for new dataset
114
    Args:
115
        dataset_name: Name of newly created dataset
116
    """
48 117
    jump_folder = "../"
49 118

  
50 119
    # create folder for crawled data
51 120
    try:
52 121
        path = CRAWLED_DATA_PATH+dataset_name
53 122
        os.mkdir(path)
54
        create_ignore_file(path,"ignore.txt")
123
        create_ignore_file(path, "ignore.txt")
55 124
    except os.error as e:
56 125
        print(e)
57 126
        print("Creation of the directory %s failed" % path)
......
77 146
    create_default_config_file(dataset_name)
78 147

  
79 148

  
80
prepare_dataset_structure("WIFI")
149
prepare_dataset_structure("TEST")
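The script scaffolds everything a new dataset needs: folders with ignore.txt files, a default YAML configuration, and skeleton crawler and processor modules whose generated bodies match the docstrings added elsewhere in this revision. A hedged usage sketch; it has to be run from python-module/Scripts/ so the "../" paths resolve, and PARKING is a made-up dataset name:

import PrepareNewDataset   # importing already runs prepare_dataset_structure("TEST") in this revision

# Scaffold an additional, hypothetical dataset.
PrepareNewDataset.prepare_dataset_structure("PARKING")
# Expected outputs (some are created in the part of the file elided from this hunk):
# CrawledData/PARKING/, ProcessedData/PARKING/, DatasetCrawler/PARKINGCrawler.py,
# DatasetProcessing/PARKINGProcessor.py and DatasetConfigs/PARKING.yaml.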
python-module/Utilities/CSV/CSVDataLine.py
1 1
class CSVDataLine:
2

  
3
    def __init__(self, name, date, occurence):
2
    """
3
    Class that specifies the look of data line in processed csv file
4
    prepared for database
5
    """
6
    def __init__(self, name, date, occurrence):
4 7
        self.name = name
5 8
        self.date = date
6
        self.occurence = occurence
9
        self.occurrence = occurrence
7 10

  
8 11
    def to_csv(self):
9
        return self.name + ";" + str(self.occurence) + ";" + self.date
12
        return self.name + ";" + str(self.occurrence) + ";" + self.date
10 13

  
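to_csv serialises a line as name;occurrence;date, which is the column order the processors write and DatabaseLoader.get_data_from_file reads back. A small usage sketch with invented values:

from Utilities.CSV import CSVDataLine

line = CSVDataLine.CSVDataLine("US 005 - vjezd", "0804201815", 3)
print(line.to_csv())   # -> US 005 - vjezd;3;0804201815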
python-module/Utilities/CSV/CSVutils.py
1
# Path to processed data
1 2
PROCESSED_DATA_PATH = "ProcessedData/"
2 3

  
4

  
3 5
def get_unique_names_from_file(filename, column_number):
6
    """
7

  
8
    Args:
9
        filename:
10
        column_number:
11

  
12
    Returns:
13

  
14
    """
4 15
    f = open(filename, "r")
5 16

  
6 17
    # create set of unique names
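The docstring of get_unique_names_from_file is still empty and its body is cut off in this hunk; judging by the call CSVutils.get_unique_names_from_file(path + file_path, 0) in FolderProcessor.get_devices_set, it collects the unique values of one column of a semicolon-separated file. A hedged sketch of that behaviour, not the committed implementation:

def get_unique_names_from_file(filename, column_number):
    unique_names = set()
    with open(filename, "r") as f:
        for line in f:
            unique_names.add(line.split(";")[column_number])
    return unique_names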
python-module/Utilities/ConfigureFunctions.py
1 1
import yaml
2 2

  
3
# Path to dataset configuration files
4
CONFIG_FILES_PATH = "DatasetConfigs/"
5
# Config file type
6
CONFIG_FILE_TYPE = ".yaml"
3 7

  
4
def load_configuration(configure_file_name):
5 8

  
6
    with open(configure_file_name) as f:
9
def load_configuration(dataset_name):
10
    """
11
    Loads yaml configuration file into memory
12

  
13
    Args:
14
        dataset_name: name of dataset that has existing configuration file
15

  
16
    Returns:
17
        yaml configuration file as dictionary
18
    """
19
    with open(CONFIG_FILES_PATH + dataset_name + CONFIG_FILE_TYPE, "r") as f:
7 20
        data = yaml.load(f, Loader=yaml.FullLoader)
8 21

  
9 22
    devices_dic = dict()
......
17 30
    return data
18 31

  
19 32

  
20
def update_configuration(configure_file_name, new_devices):
33
def update_configuration(dataset_name, new_devices):
34
    """
35
    Open dataset and appends new_devices to the end
21 36

  
22
    with open(configure_file_name, "a") as file:
37
    Args:
38
        dataset_name: name of dataset that has existing configuration file
39
        new_devices: list or set of new devices for dataset
40
    """
41
    with open(CONFIG_FILES_PATH + dataset_name + CONFIG_FILE_TYPE, "a") as file:
23 42
        for device in new_devices:
24 43
            file.write("  - "+device+":\n")
25 44
            file.write("      x: UNKNOWN!\n")
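With the new CONFIG_FILES_PATH and CONFIG_FILE_TYPE constants, callers pass only the dataset name: load_configuration("JIS") reads DatasetConfigs/JIS.yaml and update_configuration("JIS", ...) appends placeholder entries like the ones in the JIS.yaml hunk at the top of this revision. A short usage sketch, assuming it runs from the python-module root so the relative path resolves; the added device name is hypothetical:

from Utilities import ConfigureFunctions

config = ConfigureFunctions.load_configuration("JIS")   # reads DatasetConfigs/JIS.yaml
print(config["dataset-name"], len(config["devices"]))

# Appends "  - <device>:" blocks with x and y set to UNKNOWN! to the YAML file.
ConfigureFunctions.update_configuration("JIS", {"US 999 - novy senzor"})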
python-module/Utilities/Crawler/BasicCrawler.py
1
import requests
2
import re
3
from Utilities import FolderProcessor
4
from bs4 import BeautifulSoup
5

  
6

  
7
def get_all_links(url):
8
    # create response object
9
    r = requests.get(url)
10

  
11
    # create beautiful-soup object
12
    soup = BeautifulSoup(r.content, 'html5lib')
13
    links = []
14

  
15
    for link in soup.findAll('a'):
16
        links.append(link.get('href'))
17

  
18
    return links
19

  
20

  
21
def filter_links(links, regex):
22
    fitlered_links = []
23

  
24
    for link in links:
25
        if re.search(regex,link):
26
            fitlered_links.append(link)
27

  
28
    return fitlered_links
29

  
30

  
31
def create_absolute_links(links, archive):
32
    absolute_links = []
33

  
34
    for link in links:
35
        absolute_links.append(archive + link)
36

  
37
    return absolute_links
38

  
39

  
40
def remove_downloaded_links(links,dataset_name):
41

  
42
    downloaded_links = FolderProcessor.load_ignore_set("CrawlerLogs/" + dataset_name + "/")
43
    final_links = set(links) - downloaded_links
44

  
45
    return final_links
46

  
47

  
48
def download_file_from_url(url,path, dataset_name):
49
    r = requests.get(url, stream=True)
50

  
51
    url_parts = url.split("/")
52
    file_name = url_parts[len(url_parts)-1]
53

  
54
    with open(path + file_name, "wb") as file:
55
        for chunk in r.iter_content(chunk_size=1024):
56

  
57
            # writing one chunk at a time to pdf file
58
            if chunk:
59
                file.write(chunk)
60

  
61
    FolderProcessor.update_ignore_set("CrawlerLogs/" + dataset_name + "/", url)
python-module/Utilities/Crawler/BasicCrawlerFunctions.py
1
import requests
2
import re
3
from Utilities import FolderProcessor
4
from bs4 import BeautifulSoup
5

  
6
# Path to crawler logs
7
CRAWLER_LOGS_PATH = "CrawlerLogs/"
8

  
9

  
10
def get_all_links(url):
11
    """
12
    Sends http request to url, downloads all data,
13
    extract links
14

  
15
    Args:
16
        url: url of website we want to search
17

  
18
    Returns:
19
        list of all links
20
    """
21
    # create response object
22
    r = requests.get(url)
23

  
24
    # create beautiful-soup object
25
    soup = BeautifulSoup(r.content, 'html5lib')
26
    links = []
27

  
28
    for link in soup.findAll('a'):
29
        links.append(link.get('href'))
30

  
31
    return links
32

  
33

  
34
def filter_links(links, regex):
35
    """
36
    Filters list of links using regex
37

  
38
    Args:
39
        links: list of links
40
        regex: regex used for filtering
41

  
42
    Returns:
43
        filtered list of links
44
    """
45
    filtered_links = []
46

  
47
    for link in links:
48
        if re.search(regex, link):
49
            filtered_links.append(link)
50

  
51
    return filtered_links
52

  
53

  
54
def create_absolute_links(links, archive):
55
    """
56
        Appends archive path to every link in links
57
    Args:
58
        links: list of relative links
59
        archive: archive url
60

  
61
    Returns:
62
        list of absolute links
63
    """
64
    absolute_links = []
65

  
66
    for link in links:
67
        absolute_links.append(archive + link)
68

  
69
    return absolute_links
70

  
71

  
72
def remove_downloaded_links(links, dataset_name):
73
    """
74
    Loads already downloaded links from CRAWLER_LOGS_PATH ignore.txt
75
    Args:
76
        links: list of links
77
        dataset_name: name of dataset that has existing configuration file
78

  
79
    Returns:
80
        List of links without already downloaded links
81
    """
82
    downloaded_links = FolderProcessor.load_ignore_set(CRAWLER_LOGS_PATH + dataset_name + "/")
83
    final_links = set(links) - downloaded_links
84

  
85
    return final_links
86

  
87

  
88
def download_file_from_url(url, dataset_name):
89
    """
90
    Downloads file on provided url and saves it to path
91
    Args:
92
        url: url file we want to download
93
        dataset_name: name of dataset that has existing configuration file
94
    """
95
    r = requests.get(url, stream=True)
96

  
97
    # splits url and extract last part that contains filename
98
    url_parts = url.split("/")
99
    file_name = url_parts[len(url_parts)-1]
100

  
101
    path = CRAWLER_LOGS_PATH + dataset_name + '/'
102

  
103
    # download file chunk by chunk so we can download large files
104
    with open(path + file_name, "wb") as file:
105
        for chunk in r.iter_content(chunk_size=1024):
106

  
107
            # writing one chunk at a time to file
108
            if chunk:
109
                file.write(chunk)
110

  
111
    # after successful download update list of already downloaded files
112
    FolderProcessor.update_ignore_set(CRAWLER_LOGS_PATH + dataset_name + "/", url)
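Compared with the removed BasicCrawler.py, download_file_from_url now derives the target folder from dataset_name itself (it builds the path from CRAWLER_LOGS_PATH), so callers pass only the url and the dataset name. A short hedged sketch against the new signatures; the archive URL and file name are illustrative, and it assumes CrawlerLogs/JIS/ignore.txt already exists:

from Utilities.Crawler import BasicCrawlerFunctions

links = BasicCrawlerFunctions.filter_links(
    ["OD_ZCU_JIS_2018_08.zip", "index.html"], "^OD_ZCU")       # keeps only the OD_ZCU_ entry
links = BasicCrawlerFunctions.create_absolute_links(links, "https://example.org/archiv/")
links = BasicCrawlerFunctions.remove_downloaded_links(links, "JIS")

for link in links:
    BasicCrawlerFunctions.download_file_from_url(link, "JIS")  # new two-argument signature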
python-module/Utilities/Database/DatabaseDataLine.py
1 1
class DatabaseDataLine:
2

  
3
    def __init__(self, name, longitude, latitude, date, occurence):
2
    """
3
    Class that specifies the look of data line in database
4
    """
5
    def __init__(self, name, longitude, latitude, date, occurrence):
4 6
        self.name = name
5 7
        self.latitude = latitude
6 8
        self.longitude = longitude
7 9
        self.date = date
8
        self.occurence = occurence
10
        self.occurrence = occurrence
9 11

  
10 12
    def to_dictionary(self):
11
        return {"place": self.name, "x": self.longitude, "y": self.latitude, "number": self.occurence, "date": self.date}
13
        return {"place": self.name, "x": self.longitude, "y": self.latitude, "number": self.occurrence, "date": self.date}
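to_dictionary produces the document shape that DatabaseLoader inserts into MongoDB. A small sketch with invented coordinates:

from Utilities.Database import DatabaseDataLine

line = DatabaseDataLine.DatabaseDataLine("US 005 - vjezd", 13.3731, 49.7231, "0804201815", 3)
print(line.to_dictionary())
# {'place': 'US 005 - vjezd', 'x': 13.3731, 'y': 49.7231, 'number': 3, 'date': '0804201815'}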
python-module/Utilities/Database/DatabaseLoader.py
1 1
from Utilities.Database import DatabaseDataLine
2 2
import pymongo
3 3

  
4
# specify mongodb connection
5
MONGODB_CONNECTION = "mongodb://localhost:27017/"
6
# mongodb account name
7
MONGODB_ACC_NAME = "root"
8
# mongodb account password
9
MONGODB_ACC_PASSWORD = "root"
10
# mongodb data database
11
MONGODB_DATA_DATABASE = "DATA"
12
# mongodb collection with aviable datasets
13
MONGODB_DATASET_COLLECTION = "DATASETS"
14

  
15
# Path to processed data
16
PROCESSED_DATA_PATH = "ProcessedData/"
17

  
18

  
19
def create_database_connection():
20
    """
21
    Creates connection to mongoDB
22
    
23
    Returns:
24
        Connection to mongoDB
25
    """
26
    client = pymongo.MongoClient(MONGODB_CONNECTION)
4 27

  
5
def get_data_from_file(filename, devices):
6
    f = open(filename, "r")
28
    # Authenticating
29
    client.admin.authenticate(MONGODB_ACC_NAME, MONGODB_ACC_PASSWORD)
30

  
31
    database = client[MONGODB_DATA_DATABASE]
32

  
33
    return database
34

  
35

  
36
def get_data_from_file(filename, config):
37
    """
38
        Opens processed file, reads it line by line
39
        name, ocurrence, date
40
        searches name in config and adds device map coordinates
41
        than creates a dictionary with date without hours as key
42
        and list of data lines as value.
43
    Args:
44
        filename: name of processed file
45
        config: loaded configuration file of dataset
46

  
47
    Returns:
48
        dictionary with date without hours as key
49
        and list of Datalines as value
50
    """
51
    dataset_name = config["dataset-name"]
52
    dataset_path = PROCESSED_DATA_PATH + dataset_name + '/'
7 53

  
54
    f = open(dataset_path + filename, "r")
55

  
56
    devices = config["devices"]
8 57
    date_dict = dict()
9 58

  
10 59
    for line in f:
11
        # remove \n
12 60
        line = line[:-1]
13
        # split by csv splitter ;
14

  
15
        csv_collum = line.split(";")
16 61

  
17
        name = csv_collum[0]
18
        occurence = csv_collum[1]
19
        date = csv_collum[2]
62
        csv_column = line.split(";")
20 63

  
21
        date_without_hours = date[:-2]
64
        name = csv_column[0]
65
        occurrence = csv_column[1]
66
        date = csv_column[2]
22 67

  
23 68
        database_data_line = DatabaseDataLine.DatabaseDataLine(name, devices[name]["x"]
24
                                                               , devices[name]["y"], date, occurence)
69
                                                               , devices[name]["y"], date, occurrence)
25 70

  
71
        # if you want to change table split by hours or months change this
72
        date_without_hours = date[:-2]
26 73
        if date_without_hours not in date_dict:
27 74
            date_dict[date_without_hours] = list()
28 75

  
29
        date_dict[date_without_hours].append(database_data_line.to_dictionary())
76
        date_dict[date_without_hours].append(database_data_line.to_dictionary)
30 77

  
31 78
    return date_dict
32 79

  
33 80

  
34 81
def load_data_to_database(dataset_name, data_dic):
35
    myclient = pymongo.MongoClient("mongodb://localhost:27017/");
36

  
37
    # Authenticating
38
    myclient.admin.authenticate('root', 'root');
82
    """
83
    Takes data_dic created in method get_data_from_file
84
    and loads into into database where collection name is dataset_name + data_dic key
85
    and data lines are line in collection
39 86

  
40
    # Database DATA
41
    mydb = myclient["DATA"]
87
    Args:
88
        dataset_name: name of dataset that has existing configuration file
89
        data_dic: dictionary of data lines created in get_data_from_file
90
    """
91
    database = create_database_connection()
42 92

  
43
    # Collection Datasets
44
    collection_datasets = mydb["DATASETS"]
93
    # collection where are specified aviable datasets
94
    collection_datasets = database[MONGODB_DATASET_COLLECTION]
45 95

  
96
    # check if newly added data already have a dataset specified in collection
46 97
    dataset_present = collection_datasets.find_one({}, {'name': dataset_name})
47 98

  
48 99
    if dataset_present is None:
49 100
        collection_datasets.insert_one({'name': dataset_name})
50 101

  
51 102
    for date in data_dic:
52
        dataset_collections = mydb[dataset_name]
103
        dataset_collections = database[dataset_name]
53 104
        dataset_collections.insert_one({'name': dataset_name+date})
54
        date_dataset = mydb[dataset_name + date]
105
        date_dataset = database[dataset_name + date]
55 106
        date_dataset.insert_many(data_dic[date])
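load_data_to_database registers each dataset in the DATASETS collection and then writes one collection per dataset and day, named dataset_name + ddmmYYYY. A hedged sketch of reading that layout back with pymongo, using the same connection constants and credentials introduced above; the collection name uses an example date:

import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017/")
client.admin.authenticate("root", "root")        # same call and credentials the loader uses

database = client["DATA"]
print(database["DATASETS"].distinct("name"))     # registered dataset names, e.g. ['JIS']

for document in database["JIS" + "08042018"].find():   # one collection per dataset and day
    print(document["place"], document["date"], document["number"])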
python-module/Utilities/DateFormating.py
1
def date_formater(string_date):
1
def date_formatter(string_date):
2
    """
3

  
4
    Args:
5
        string_date: string containing date in format 22.08.2018 12:27:00
6

  
7
    Returns:
8
        string of date in format 0804201814 ddmmYYYY
9
    """
2 10
    if string_date[11].isspace():
3 11
        pos = 0
4 12
        srr = ""
......
18 26
    return return_date
19 27

  
20 28

  
21
def date_time_formater(string_date):
29
def date_time_formatter(string_date):
30
    """
31
    Converts one type of date format "dd.mm.yyyy hh.mm.ss" to date format ddmmYYYYhh
32
    Args:
33
        string_date: string containing date in format 22.08.2018 12:27:00
34

  
35
    Returns:
36
        string of date in format 0804201814 ddmmYYYYhh
37
    """
22 38
    if string_date[11].isspace():
23 39
        pos = 0
24 40
        srr = ""
......
35 51

  
36 52
    return_date = string_date[:2] + string_date[3:5] + string_date[6:10] + string_date[11:13]
37 53

  
38
    return return_date
54
    return return_date
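For an input whose day already has two digits the character positions line up directly, so the renamed date_time_formatter just rearranges the slices shown above. A quick check of the documented behaviour (assumes the Utilities package is importable):

from Utilities import DateFormating

print(DateFormating.date_time_formatter("22.08.2018 12:27:00"))   # should print 2208201812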
python-module/Utilities/FolderProcessor.py
1 1
import os
2 2
import zipfile
3
from CSV import CSVutils
3 4

  
4 5

  
5 6
def list_of_all_files(path):
7
    """
8
    Get all files from directory and all files written in ignore.txt
9
    and return the difference
10
    Args:
11
        path: path to Directory
12

  
13
    Returns:
14
        list with names of all files in directory
15
    """
6 16
    files_in_dir = os.listdir(path)
7 17

  
8 18
    ignore_set = load_ignore_set(path)
......
11 21

  
12 22

  
13 23
def load_ignore_set(path):
24
    """
25
    Reads ignore.txt line by line and add it to a set
26
    Args:
27
        path: Path to directory containing ignore.txt file
28

  
29
    Returns:
30
        list of names contained in ignore.txt file
31
    """
14 32
    ignore_set = set()
15 33

  
16 34
    with open(path + "ignore.txt", "r") as file:
......
21 39
    return ignore_set
22 40

  
23 41

  
24
def update_ignore_set(path,file_name):
25

  
42
def update_ignore_set(path, file_name):
43
    """
44
    Adds file_name to the ignore file
45
    Args:
46
        path: Path to directory containing ignore.txt file
47
        file_name: name of file you want to add to ignore file
48
    """
26 49
    with open(path + "ignore.txt", "a") as file:
27 50
        file.write(file_name + '\n')
28 51

  
29 52

  
30
def get_devices_set(folder):
53
def get_devices_set(path):
54
    """
55
     Goes trough every not loaded file(not contained in ProcessedData/ignore.txt)
56
     Extracts names from not loaded file which should be in first column
57
     Creates set of unique devices_names
58

  
59
    Args:
60
        path: Path to Processed directory containing ignore.txt file
31 61

  
32
    files_in_dir = list_of_all_files(folder)
62
    Returns:
63
        set of unique names contained in not loaded files
64
    """
65
    files_in_dir = list_of_all_files(path)
33 66

  
34 67
    unique_names = set()
35 68

  
36 69
    for file_path in files_in_dir:
37
        with open(folder+file_path) as file:
38
            for line in file:
39
                array = line.split(";")
40
                name = array[0]
41
                unique_names.add(name)
70
        unique_names.add(CSVutils.get_unique_names_from_file(path+file_path, 0))
42 71

  
43 72
    return unique_names
44 73

  
45 74

  
46
def get_unknown_devices_set(config,devices):
75
def get_unknown_devices_set(config, devices):
76
    """
77
    Compares config and devices a return difference
78

  
79
    Args:
80
        config:  loaded configuration file of dataset
81
        devices: set of unique devices contained in dataset
82

  
83
    Returns:
84

  
85
    """
47 86
    devices_set = set(config["devices"].keys())
48 87
    unknown_devices_set = devices.difference(devices_set)
49 88

  
50 89
    return unknown_devices_set
51 90

  
52 91

  
53
def unzip_all_csv_zip_files_in_folder(folder):
54

  
55
    files_in_dir = os.listdir(folder)
92
def unzip_all_csv_zip_files_in_folder(path):
93
    """
94
    Load all files from directory and unzip those which end by .zip
95
    After unziping deletes the zip file
96
    Args:
97
        path: Path to CrawledData directory containing ignore.txt file
98
    """
99
    files_in_dir = os.listdir(path)
56 100
    zips = []
57 101

  
58 102
    for file in files_in_dir:
59 103
        if file.endswith(".zip"):
60
            zips.append(folder + file)
104
            zips.append(path + file)
61 105

  
62 106
    for zip_file in zips:
63 107

  
64 108
        with zipfile.ZipFile(zip_file, "r") as unziped_file:
65
            unziped_file.extractall(folder)
109
            unziped_file.extractall(path)
66 110

  
67 111
        os.remove(zip_file)
68 112

  
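The ignore.txt files are what make the pipeline incremental: every processed or loaded file name gets appended to them, and list_of_all_files returns the directory listing minus that set, so nothing is handled twice (the ignore file lists its own name as its first line, which keeps it out of processing). An illustration of the set arithmetic with invented file names:

# What list_of_all_files effectively computes for, say, CrawledData/JIS/:
files_in_dir = {"OD_ZCU_JIS_01.csv", "OD_ZCU_JIS_02.csv", "ignore.txt"}
ignore_set = {"ignore.txt", "OD_ZCU_JIS_01.csv"}   # lines of ignore.txt after one earlier run

print(files_in_dir - ignore_set)                   # {'OD_ZCU_JIS_02.csv'}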
python-module/main.py
1 1
import Pipeline
2 2
import os
3 3

  
4
# Path to configuration files
4 5
CONFIG_FILES_PATH = "DatasetConfigs/"
5 6

  
6 7

  
7 8
def run_pipeline_for_all_datasets():
9
    """
10
    Runs whole DataScript pipeline for every dataset that has existing configuration file
11
    """
8 12
    files_in_dir = os.listdir(CONFIG_FILES_PATH)
9 13

  
10 14
    for file in files_in_dir:
11
        Pipeline.run_full_pipeline(file)
15
        name = file.split('.')
16
        Pipeline.run_full_pipeline(name[0])
12 17

  
13 18

  
14 19
def run_pipeline_for_one_dataset(dataset_name):
20
    """
21
    Runs whole DataScript pipeline for only one dataset
22

  
23
    Args:
24
        dataset_name: name of dataset that has existing configuration file
25
    """
15 26
    Pipeline.run_full_pipeline(dataset_name)
16 27

  
17 28

  
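With the name-splitting fix above, every DatasetConfigs/*.yaml file is passed to the pipeline by its bare dataset name. The usual entry point for a single dataset, run from the python-module root so the relative paths resolve ("JIS" is one of the configured datasets):

import Pipeline

# Crawls, processes and loads one dataset, as described by run_full_pipeline's docstring.
Pipeline.run_full_pipeline("JIS")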

Also available: Unified diff