Revision af7609b5
Added by Tomáš Ballák about 4 years ago
modules/crawler/DatasetCrawler/JIS_crawler.py | ||
---|---|---|
1 | 1 |
from Utilities import folder_processor |
2 | 2 |
from Utilities.Crawler import basic_crawler_functions |
3 |
|
|
3 |
from shared_types import ConfigType |
|
4 | 4 |
# Path to crawled data |
5 | 5 |
CRAWLED_DATA_PATH = "CrawledData/" |
6 | 6 |
|
7 | 7 |
|
8 |
def crawl(config): |
|
8 |
def crawl(config: ConfigType):
|
|
9 | 9 |
""" |
10 | 10 |
Implement crawl method that downloads new data to path_for_files |
11 | 11 |
For keeping the project structure |
... | ... | |
21 | 21 |
path_for_files = CRAWLED_DATA_PATH + dataset_name + '/' |
22 | 22 |
|
23 | 23 |
first_level_links = basic_crawler_functions.get_all_links(url) |
24 |
filtered_first_level_links = basic_crawler_functions.filter_links(first_level_links, "^OD_ZCU") |
|
25 |
absolute_first_level_links = basic_crawler_functions.create_absolute_links(filtered_first_level_links, url) |
|
24 |
filtered_first_level_links = basic_crawler_functions.filter_links( |
|
25 |
first_level_links, "^OD_ZCU") |
|
26 |
absolute_first_level_links = basic_crawler_functions.create_absolute_links( |
|
27 |
filtered_first_level_links, url) |
|
26 | 28 |
|
27 | 29 |
files = [] |
28 | 30 |
|
29 | 31 |
for link in absolute_first_level_links: |
30 | 32 |
second_level_links = basic_crawler_functions.get_all_links(link) |
31 |
filtered_second_level_links = basic_crawler_functions.filter_links(second_level_links, regex) |
|
32 |
absolute_second_level_links = basic_crawler_functions.create_absolute_links(filtered_second_level_links, link) |
|
33 |
filtered_second_level_links = basic_crawler_functions.filter_links( |
|
34 |
second_level_links, regex) |
|
35 |
absolute_second_level_links = basic_crawler_functions.create_absolute_links( |
|
36 |
filtered_second_level_links, link) |
|
33 | 37 |
|
34 | 38 |
for file_link in absolute_second_level_links: |
35 | 39 |
files.append(file_link) |
36 | 40 |
|
37 |
files = basic_crawler_functions.remove_downloaded_links(files, dataset_name) |
|
41 |
files = basic_crawler_functions.remove_downloaded_links( |
|
42 |
files, dataset_name) |
|
38 | 43 |
|
39 | 44 |
for file in files: |
40 | 45 |
basic_crawler_functions.download_file_from_url(file, dataset_name) |
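Note: the crawlers now import ConfigType from a new shared_types module that is not part of this revision. A minimal sketch of what that module plausibly contains, inferred from how the aliases are used across the diff (an assumption, the real file may differ):

```python
# shared_types.py -- hypothetical sketch; the actual module is not shown in this revision.
# Inferred from usage: a loaded YAML config is a plain dict, a processed "date dictionary"
# maps date strings to per-device CSV lines, and the ignore sets are sets of strings.
from typing import Any, Dict, Set

ConfigType = Dict[str, Any]            # parsed DatasetConfigs/<name>.yaml
DateDict = Dict[str, Dict[str, Any]]   # "YYYY-mm-dd-hh" -> device name -> CSVDataLine
StringSetType = Set[str]               # e.g. already downloaded/processed file names
```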
modules/crawler/DatasetCrawler/KOLOBEZKY_crawler.py | ||
---|---|---|
1 | 1 |
from Utilities import folder_processor |
2 | 2 |
from Utilities.Crawler import basic_crawler_functions |
3 |
|
|
3 |
from shared_types import ConfigType |
|
4 | 4 |
# Path to crawled data |
5 | 5 |
CRAWLED_DATA_PATH = "CrawledData/" |
6 | 6 |
|
7 | 7 |
|
8 |
def crawl(config): |
|
8 |
def crawl(config: ConfigType):
|
|
9 | 9 |
""" |
10 | 10 |
Implement crawl method that downloads new data to path_for_files |
11 | 11 |
For keeping the project structure |
... | ... | |
21 | 21 |
path_for_files = CRAWLED_DATA_PATH + dataset_name + '/' |
22 | 22 |
|
23 | 23 |
first_level_links = basic_crawler_functions.get_all_links(url) |
24 |
filtered_first_level_links = basic_crawler_functions.filter_links(first_level_links, "^OD_ZCU") |
|
25 |
absolute_first_level_links = basic_crawler_functions.create_absolute_links(filtered_first_level_links, url) |
|
24 |
filtered_first_level_links = basic_crawler_functions.filter_links( |
|
25 |
first_level_links, "^OD_ZCU") |
|
26 |
absolute_first_level_links = basic_crawler_functions.create_absolute_links( |
|
27 |
filtered_first_level_links, url) |
|
26 | 28 |
|
27 | 29 |
files = [] |
28 | 30 |
|
29 | 31 |
for link in absolute_first_level_links: |
30 | 32 |
second_level_links = basic_crawler_functions.get_all_links(link) |
31 |
filtered_second_level_links = basic_crawler_functions.filter_links(second_level_links, regex) |
|
32 |
absolute_second_level_links = basic_crawler_functions.create_absolute_links(filtered_second_level_links, link) |
|
33 |
filtered_second_level_links = basic_crawler_functions.filter_links( |
|
34 |
second_level_links, regex) |
|
35 |
absolute_second_level_links = basic_crawler_functions.create_absolute_links( |
|
36 |
filtered_second_level_links, link) |
|
33 | 37 |
|
34 | 38 |
for file_link in absolute_second_level_links: |
35 | 39 |
files.append(file_link) |
36 | 40 |
|
37 |
files = basic_crawler_functions.remove_downloaded_links(files, dataset_name) |
|
41 |
files = basic_crawler_functions.remove_downloaded_links( |
|
42 |
files, dataset_name) |
|
38 | 43 |
|
39 | 44 |
for file in files: |
40 | 45 |
basic_crawler_functions.download_file_from_url(file, dataset_name) |
modules/crawler/DatasetCrawler/OBSAZENIMISTNOSTI_crawler.py | ||
---|---|---|
1 | 1 |
from Utilities import folder_processor |
2 | 2 |
from Utilities.Crawler import basic_crawler_functions |
3 |
from shared_types import ConfigType |
|
3 | 4 |
|
4 | 5 |
# Path to crawled data |
5 | 6 |
CRAWLED_DATA_PATH = "CrawledData/" |
6 | 7 |
|
7 | 8 |
|
8 |
def crawl(config): |
|
9 |
def crawl(config: ConfigType):
|
|
9 | 10 |
""" |
10 | 11 |
Implement crawl method that downloads new data to path_for_files |
11 | 12 |
For keeping the project structure |
modules/crawler/DatasetCrawler/WIFI_crawler.py | ||
---|---|---|
1 | 1 |
from Utilities import folder_processor |
2 | 2 |
from Utilities.Crawler import basic_crawler_functions |
3 |
from shared_types import ConfigType |
|
3 | 4 |
|
4 | 5 |
# Path to crawled data |
5 | 6 |
CRAWLED_DATA_PATH = "CrawledData/" |
6 | 7 |
|
7 | 8 |
|
8 |
def crawl(config): |
|
9 |
def crawl(config: ConfigType):
|
|
9 | 10 |
""" |
10 | 11 |
Implement crawl method that downloads new data to path_for_files |
11 | 12 |
For keeping the project structure |
... | ... | |
21 | 22 |
path_for_files = CRAWLED_DATA_PATH + dataset_name + '/' |
22 | 23 |
|
23 | 24 |
first_level_links = basic_crawler_functions.get_all_links(url) |
24 |
filtered_first_level_links = basic_crawler_functions.filter_links(first_level_links, "^OD_ZCU") |
|
25 |
absolute_first_level_links = basic_crawler_functions.create_absolute_links(filtered_first_level_links, url) |
|
25 |
filtered_first_level_links = basic_crawler_functions.filter_links( |
|
26 |
first_level_links, "^OD_ZCU") |
|
27 |
absolute_first_level_links = basic_crawler_functions.create_absolute_links( |
|
28 |
filtered_first_level_links, url) |
|
26 | 29 |
|
27 | 30 |
files = [] |
28 | 31 |
|
29 | 32 |
for link in absolute_first_level_links: |
30 | 33 |
second_level_links = basic_crawler_functions.get_all_links(link) |
31 |
filtered_second_level_links = basic_crawler_functions.filter_links(second_level_links, regex) |
|
32 |
absolute_second_level_links = basic_crawler_functions.create_absolute_links(filtered_second_level_links, link) |
|
34 |
filtered_second_level_links = basic_crawler_functions.filter_links( |
|
35 |
second_level_links, regex) |
|
36 |
absolute_second_level_links = basic_crawler_functions.create_absolute_links( |
|
37 |
filtered_second_level_links, link) |
|
33 | 38 |
|
34 | 39 |
for file_link in absolute_second_level_links: |
35 | 40 |
files.append(file_link) |
36 | 41 |
|
37 |
files = basic_crawler_functions.remove_downloaded_links(files, dataset_name) |
|
42 |
files = basic_crawler_functions.remove_downloaded_links( |
|
43 |
files, dataset_name) |
|
38 | 44 |
|
39 | 45 |
for file in files: |
40 | 46 |
basic_crawler_functions.download_file_from_url(file, dataset_name) |
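All three dataset crawlers in this revision (JIS, KOLOBEZKY, WIFI) get the same treatment: a ConfigType hint and the long helper calls wrapped over two lines. For reference, a condensed sketch of the shared two-level crawl flow; the config keys (dataset-name, url, regex) follow the generated crawler template shown later in this revision, and the helper bodies are assumed to behave as their names and docstrings describe.

```python
from Utilities.Crawler import basic_crawler_functions
from shared_types import ConfigType

CRAWLED_DATA_PATH = "CrawledData/"


def crawl(config: ConfigType) -> None:
    dataset_name = config["dataset-name"]
    url = config["url"]
    regex = config["regex"]

    # first level: archive folders whose names match ^OD_ZCU
    first_level = basic_crawler_functions.get_all_links(url)
    first_level = basic_crawler_functions.filter_links(first_level, "^OD_ZCU")
    first_level = basic_crawler_functions.create_absolute_links(first_level, url)

    # second level: files inside each folder matching the dataset regex
    files = []
    for link in first_level:
        second_level = basic_crawler_functions.get_all_links(link)
        second_level = basic_crawler_functions.filter_links(second_level, regex)
        second_level = basic_crawler_functions.create_absolute_links(second_level, link)
        files.extend(second_level)

    # skip links already recorded in the database, then download the rest
    files = basic_crawler_functions.remove_downloaded_links(files, dataset_name)
    for file in files:
        basic_crawler_functions.download_file_from_url(file, dataset_name)
```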
modules/crawler/DatasetProcessing/JIS_processor.py | ||
---|---|---|
1 | 1 |
from Utilities.CSV import csv_data_line |
2 | 2 |
from Utilities import date_formating |
3 | 3 |
|
4 |
from shared_types import DateDict |
|
4 | 5 |
|
5 |
def process_file(filename): |
|
6 |
|
|
7 |
def process_file(filename: str) -> DateDict: |
|
6 | 8 |
""" |
7 | 9 |
Method that take path to crawled file and outputs date dictionary: |
8 | 10 |
Date dictionary is a dictionary where keys are dates in format YYYY-mm-dd-hh (2018-04-08-15) |
... | ... | |
16 | 18 |
None if not implemented |
17 | 19 |
date_dict when implemented |
18 | 20 |
""" |
19 |
date_dict = dict()
|
|
21 |
date_dict = {}
|
|
20 | 22 |
|
21 | 23 |
with open(filename, "r", encoding="utf-8") as file: |
22 | 24 |
|
... | ... | |
29 | 31 |
occurrence = array[2][:-1] |
30 | 32 |
|
31 | 33 |
if date not in date_dict: |
32 |
date_dict[date] = dict()
|
|
34 |
date_dict[date] = {}
|
|
33 | 35 |
|
34 | 36 |
if name in date_dict[date]: |
35 | 37 |
date_dict[date][name].occurrence += int(occurrence) |
36 | 38 |
else: |
37 |
date_dict[date][name] = csv_data_line.CSVDataLine(name, date, occurrence) |
|
39 |
date_dict[date][name] = csv_data_line.CSVDataLine( |
|
40 |
name, date, occurrence) |
|
38 | 41 |
|
39 | 42 |
return date_dict |
40 |
|
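The processors (JIS, KOLOBEZKY, OBSAZENIMISTNOSTI, WIFI) share the same aggregation into the returned DateDict. A tiny self-contained illustration of that shape with a made-up device name; the real processors fill it while parsing the crawled file:

```python
from Utilities.CSV import csv_data_line
from shared_types import DateDict


def add_record(date_dict: DateDict, date: str, name: str, occurrence: int) -> None:
    # group by date key "YYYY-mm-dd-hh", then by device name
    if date not in date_dict:
        date_dict[date] = {}
    if name in date_dict[date]:
        date_dict[date][name].occurrence += occurrence
    else:
        date_dict[date][name] = csv_data_line.CSVDataLine(name, date, occurrence)


date_dict: DateDict = {}
add_record(date_dict, "2018-04-08-15", "JIS-turnstile-1", 3)  # hypothetical device
add_record(date_dict, "2018-04-08-15", "JIS-turnstile-1", 2)  # merges into occurrence 5
```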
modules/crawler/DatasetProcessing/KOLOBEZKY_processor.py | ||
---|---|---|
1 | 1 |
from Utilities.CSV import csv_data_line |
2 | 2 |
from Utilities import date_formating |
3 | 3 |
|
4 |
from shared_types import DateDict |
|
4 | 5 |
|
5 |
def process_file(filename): |
|
6 |
|
|
7 |
def process_file(filename: str) -> DateDict: |
|
6 | 8 |
""" |
7 | 9 |
Method that take path to crawled file and outputs date dictionary: |
8 | 10 |
Date dictionary is a dictionary where keys are dates in format YYYY-mm-dd-hh (2018-04-08-15) |
... | ... | |
16 | 18 |
None if not implemented |
17 | 19 |
date_dict when implemented |
18 | 20 |
""" |
19 |
date_dict = dict()
|
|
21 |
date_dict = {}
|
|
20 | 22 |
|
21 | 23 |
with open(filename, "r") as file: |
22 | 24 |
|
... | ... | |
28 | 30 |
name = array[1][1:-1] |
29 | 31 |
|
30 | 32 |
if date not in date_dict: |
31 |
date_dict[date] = dict()
|
|
33 |
date_dict[date] = {}
|
|
32 | 34 |
|
33 | 35 |
if name in date_dict[date]: |
34 | 36 |
date_dict[date][name].occurrence += 1 |
35 | 37 |
else: |
36 |
date_dict[date][name] = csv_data_line.CSVDataLine(name, date, 1) |
|
38 |
date_dict[date][name] = csv_data_line.CSVDataLine( |
|
39 |
name, date, 1) |
|
37 | 40 |
|
38 | 41 |
return date_dict |
39 |
|
modules/crawler/DatasetProcessing/OBSAZENIMISTNOSTI_processor.py | ||
---|---|---|
5 | 5 |
import time |
6 | 6 |
import datetime |
7 | 7 |
|
8 |
from shared_types import DateDict |
|
9 |
|
|
8 | 10 |
logging.basicConfig(filename='../../CrawlerLogs' + 'Crawlerlog-' + |
9 | 11 |
date.today().strftime("%b-%Y") + '.log', |
10 | 12 |
level=logging.INFO, |
11 | 13 |
format='%(asctime)s %(message)s') |
12 | 14 |
|
13 | 15 |
|
14 |
def process_file(filename):
|
|
16 |
def process_file(filename: str) -> DateDict:
|
|
15 | 17 |
""" |
16 | 18 |
Method that take path to crawled file and outputs date dictionary: |
17 | 19 |
Date dictionary is a dictionary where keys are dates in format YYYY-mm-dd-hh (2018-04-08-15) |
... | ... | |
25 | 27 |
None if not implemented |
26 | 28 |
date_dict when implemented |
27 | 29 |
""" |
28 |
date_dict = dict()
|
|
30 |
date_dict = {}
|
|
29 | 31 |
|
30 | 32 |
with open(filename, "r") as file: |
31 | 33 |
|
modules/crawler/DatasetProcessing/WIFI_processor.py | ||
---|---|---|
1 | 1 |
from Utilities.CSV import csv_data_line |
2 | 2 |
from Utilities import date_formating |
3 |
from shared_types import DateDict |
|
3 | 4 |
|
4 | 5 |
|
5 |
def process_file(filename):
|
|
6 |
def process_file(filename: str) -> DateDict:
|
|
6 | 7 |
""" |
7 | 8 |
Method that take path to crawled file and outputs date dictionary: |
8 | 9 |
Date dictionary is a dictionary where keys are dates in format YYYY-mm-dd-hh (2018-04-08-15) |
modules/crawler/Utilities/CSV/csv_data_line.py | ||
---|---|---|
3 | 3 |
Class that specifies the look of data line in processed csv file |
4 | 4 |
prepared for database |
5 | 5 |
""" |
6 |
|
|
7 |
def __init__(self, name, date, occurrence): |
|
6 |
def __init__(self, name: str, date: str, occurrence: int) -> None: |
|
8 | 7 |
try: |
9 | 8 |
test_val = int(occurrence) |
10 | 9 |
except ValueError: |
11 | 10 |
print("Occurence should be and integer value!") |
12 | 11 |
|
13 | 12 |
if len(date) != 13: |
14 |
raise ValueError("Invalid date format YYYY-dd-mm-hh expected!")
|
|
13 |
raise ValueError("Invalid date format YYYY-dd-mm-hh expected!") |
|
15 | 14 |
|
16 | 15 |
self.name = name |
17 | 16 |
self.date = date |
18 | 17 |
self.occurrence = test_val |
19 | 18 |
|
20 |
def to_csv(self): |
|
19 |
def to_csv(self) -> str:
|
|
21 | 20 |
return self.name + ";" + str(self.occurrence) + ";" + self.date |
22 |
|
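csv_data_line.py now types the constructor and to_csv(). Two things worth flagging: on a non-integer occurrence the current code only prints and then reads an unbound test_val, and the error message says YYYY-dd-mm-hh while the processors document YYYY-mm-dd-hh. A sketch that keeps the new signatures but raises on bad input:

```python
class CSVDataLine:
    """One line of the processed CSV prepared for the database."""

    def __init__(self, name: str, date: str, occurrence: int) -> None:
        try:
            self.occurrence = int(occurrence)
        except ValueError:
            raise ValueError("Occurrence should be an integer value!")

        # date keys are 13 characters long: YYYY-mm-dd-hh
        if len(date) != 13:
            raise ValueError("Invalid date format, YYYY-mm-dd-hh expected!")

        self.name = name
        self.date = date

    def to_csv(self) -> str:
        return self.name + ";" + str(self.occurrence) + ";" + self.date
```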
modules/crawler/Utilities/CSV/csv_utils.py | ||
---|---|---|
1 | 1 |
import inspect |
2 |
from shared_types import StringSetType |
|
2 | 3 |
from Utilities.CSV import csv_data_line |
3 | 4 |
|
4 | 5 |
# Path to processed data |
5 | 6 |
PROCESSED_DATA_PATH = "ProcessedData/" |
6 | 7 |
|
7 | 8 |
|
8 |
def get_unique_names_from_file(filename, column_number): |
|
9 |
def get_unique_names_from_file(filename: str, |
|
10 |
column_number: int) -> StringSetType: |
|
9 | 11 |
""" |
10 | 12 |
Extract set of unique names from file |
11 | 13 |
Args: |
... | ... | |
29 | 31 |
return name_set |
30 | 32 |
|
31 | 33 |
|
32 |
def export_data_to_csv(filename, data_dict):
|
|
34 |
def export_data_to_csv(filename: str, data_dict) -> None:
|
|
33 | 35 |
""" |
34 | 36 |
Takes data_dict and export it into a csv file |
35 | 37 |
Args: |
... | ... | |
40 | 42 |
|
41 | 43 |
for date in data_dict: |
42 | 44 |
if len(date) != 13: |
43 |
raise ValueError("Invalid date format for key value --> YYYY-mm-dd-hh expected!") |
|
45 |
raise ValueError( |
|
46 |
"Invalid date format for key value --> YYYY-mm-dd-hh expected!" |
|
47 |
) |
|
44 | 48 |
for data in data_dict[date]: |
45 | 49 |
csv_line = data_dict[date][data] |
46 |
if not isinstance(csv_line,csv_data_line.CSVDataLine): |
|
47 |
raise ValueError("data_dict is expected to have CSVDataLine as values") |
|
50 |
if not isinstance(csv_line, csv_data_line.CSVDataLine): |
|
51 |
raise ValueError( |
|
52 |
"data_dict is expected to have CSVDataLine as values") |
|
48 | 53 |
file.write(csv_line.to_csv() + '\n') |
modules/crawler/Utilities/Crawler/basic_crawler_functions.py | ||
---|---|---|
3 | 3 |
from Utilities import folder_processor |
4 | 4 |
from Utilities.Database import database_record_logs |
5 | 5 |
from bs4 import BeautifulSoup |
6 |
from typing import List |
|
6 | 7 |
|
7 | 8 |
# Path to crawler logs |
8 | 9 |
CRAWLER_LOGS_PATH = "CrawlerLogs/" |
9 | 10 |
# Path to crawled data |
10 | 11 |
CRAWLED_DATA_PATH = "CrawledData/" |
12 |
LinksType = List[str] |
|
11 | 13 |
|
12 | 14 |
|
13 |
def get_all_links(url):
|
|
15 |
def get_all_links(url: str) -> LinksType:
|
|
14 | 16 |
""" |
15 | 17 |
Sends http request to url, downloads all data, |
16 | 18 |
extract links |
... | ... | |
34 | 36 |
return links |
35 | 37 |
|
36 | 38 |
|
37 |
def filter_links(links, regex):
|
|
39 |
def filter_links(links: LinksType, regex: str) -> LinksType:
|
|
38 | 40 |
""" |
39 | 41 |
Filters list of links using regex |
40 | 42 |
|
... | ... | |
54 | 56 |
return filtered_links |
55 | 57 |
|
56 | 58 |
|
57 |
def create_absolute_links(links, archive):
|
|
59 |
def create_absolute_links(links: LinksType, archive: str) -> LinksType:
|
|
58 | 60 |
""" |
59 | 61 |
Appends archive path to every link in links |
60 | 62 |
Args: |
... | ... | |
72 | 74 |
return absolute_links |
73 | 75 |
|
74 | 76 |
|
75 |
def remove_downloaded_links(links, dataset_name):
|
|
77 |
def remove_downloaded_links(links: LinksType, dataset_name: str) -> LinksType:
|
|
76 | 78 |
""" |
77 | 79 |
Loads already downloaded links from CRAWLER_LOGS_PATH ignore.txt |
78 | 80 |
Args: |
... | ... | |
88 | 90 |
return final_links |
89 | 91 |
|
90 | 92 |
|
91 |
def download_file_from_url(url, dataset_name):
|
|
93 |
def download_file_from_url(url: str, dataset_name: str) -> None:
|
|
92 | 94 |
""" |
93 | 95 |
Downloads file on provided url and saves it to path |
94 | 96 |
Args: |
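basic_crawler_functions.py introduces the LinksType = List[str] alias and types every helper; the bodies themselves are unchanged and mostly outside this diff. A minimal sketch of what filter_links() and create_absolute_links() plausibly do under those signatures (an assumption based on their docstrings, not the project code):

```python
import re
from typing import List

LinksType = List[str]


def filter_links(links: LinksType, regex: str) -> LinksType:
    # keep only links whose text matches the given pattern, e.g. "^OD_ZCU"
    return [link for link in links if re.search(regex, link)]


def create_absolute_links(links: LinksType, archive: str) -> LinksType:
    # prepend the archive URL so relative hrefs become downloadable URLs
    return [archive + link for link in links]
```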
modules/crawler/Utilities/Database/database_data_line.py | ||
---|---|---|
1 |
from typing import Dict |
|
2 |
|
|
3 |
|
|
1 | 4 |
class DatabaseDataLine: |
2 | 5 |
""" |
3 | 6 |
Class that specifies the look of data line in database |
4 | 7 |
""" |
5 |
def __init__(self, name, longitude, latitude, date, occurrence): |
|
8 |
def __init__(self, name: str, longitude: float, latitude: float, date: str, |
|
9 |
occurrence: int): |
|
6 | 10 |
self.name = name |
7 | 11 |
self.latitude = latitude |
8 | 12 |
self.longitude = longitude |
9 | 13 |
self.date = date |
10 | 14 |
self.occurrence = occurrence |
11 | 15 |
|
12 |
def to_dictionary(self): |
|
13 |
return {"place": self.name, "x": self.longitude, "y": self.latitude, "number": self.occurrence, |
|
14 |
"date": self.date} |
|
16 |
def to_dictionary(self) -> Dict[str, any]: |
|
17 |
return { |
|
18 |
"place": self.name, |
|
19 |
"x": self.longitude, |
|
20 |
"y": self.latitude, |
|
21 |
"number": self.occurrence, |
|
22 |
"date": self.date |
|
23 |
} |
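database_data_line.py reflows to_dictionary() into one entry per line. One note: Dict[str, any] annotates with the built-in any() function rather than typing.Any, which is what pylint and mypy expect; the sketch below uses Any.

```python
from typing import Any, Dict


class DatabaseDataLine:
    """Shape of one database record for a device measurement."""

    def __init__(self, name: str, longitude: float, latitude: float, date: str,
                 occurrence: int) -> None:
        self.name = name
        self.latitude = latitude
        self.longitude = longitude
        self.date = date
        self.occurrence = occurrence

    def to_dictionary(self) -> Dict[str, Any]:
        return {
            "place": self.name,
            "x": self.longitude,
            "y": self.latitude,
            "number": self.occurrence,
            "date": self.date,
        }
```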
modules/crawler/Utilities/Database/database_loader.py | ||
---|---|---|
1 | 1 |
from Utilities.Database import database_data_line, database_record_logs |
2 | 2 |
from Utilities import configure_functions |
3 | 3 |
from Utilities.helpers import should_skip, detect_change |
4 |
from shared_types import ConfigType |
|
5 |
from typing import Dict |
|
4 | 6 |
import pymongo |
5 | 7 |
import re |
6 | 8 |
|
... | ... | |
20 | 22 |
# Path to processed data |
21 | 23 |
PROCESSED_DATA_PATH = "ProcessedData/" |
22 | 24 |
|
25 |
DatabaseConnectionType = Dict[str, any] |
|
23 | 26 |
|
24 |
def create_database_connection(): |
|
27 |
|
|
28 |
def create_database_connection() -> pymongo.database.Database: |
|
25 | 29 |
""" |
26 | 30 |
Creates connection to mongoDB |
27 | 31 |
|
... | ... | |
38 | 42 |
return database |
39 | 43 |
|
40 | 44 |
|
41 |
def get_data_from_file(filename, config):
|
|
45 |
def get_data_from_file(filename: str, config: ConfigType) -> Dict[str, any]:
|
|
42 | 46 |
""" |
43 | 47 |
Opens processed file, reads it line by line |
44 | 48 |
name, ocurrence, date |
... | ... | |
59 | 63 |
f = open(dataset_path + filename, "r") |
60 | 64 |
|
61 | 65 |
devices = config["devices"] |
62 |
date_dict = dict()
|
|
66 |
date_dict = {}
|
|
63 | 67 |
|
64 | 68 |
for line in f: |
65 | 69 |
line = line[:-1] |
... | ... | |
86 | 90 |
return date_dict |
87 | 91 |
|
88 | 92 |
|
89 |
def load_data_to_database(database_connection, dataset_name, data_dic, |
|
90 |
file_name): |
|
93 |
def load_data_to_database(database_connection: DatabaseConnectionType, |
|
94 |
dataset_name: str, data_dic: Dict[str, any], |
|
95 |
file_name: str) -> None: |
|
91 | 96 |
""" |
92 | 97 |
Takes data_dic created in method get_data_from_file |
93 | 98 |
and loads into into database where collection name is dataset_name + data_dic key |
... | ... | |
107 | 112 |
date_dataset.insert_many(data_dic[date]) |
108 | 113 |
|
109 | 114 |
|
110 |
def check_or_update_datasets_collection(database_connection, config): |
|
115 |
def check_or_update_datasets_collection( |
|
116 |
database_connection: DatabaseConnectionType, config: ConfigType): |
|
111 | 117 |
""" |
112 | 118 |
Checks if DATASETS collection contains dataset and if display name was not updated |
113 | 119 |
|
... | ... | |
116 | 122 |
config: loaded configuration file of dataset |
117 | 123 |
""" |
118 | 124 |
# collection where are specified aviable datasets |
119 |
compareKeys = ['display-name', |
|
120 |
'display-color'] |
|
125 |
compareKeys = ['display-name', 'display-color'] |
|
121 | 126 |
collection_datasets = database_connection[MONGODB_DATASET_COLLECTION] |
122 | 127 |
|
123 | 128 |
query = {'key-name': config['dataset-name']} |
... | ... | |
139 | 144 |
collection_datasets.update_one(query, {"$set": newVal}) |
140 | 145 |
|
141 | 146 |
|
142 |
def update_devices_collection(config): |
|
147 |
def update_devices_collection(config: ConfigType):
|
|
143 | 148 |
""" |
144 | 149 |
Checks if there are any changes in devices specified in config file against |
145 | 150 |
devices processed and loaded into the database |
... | ... | |
164 | 169 |
|
165 | 170 |
devices_cursor = collection_devices.find() |
166 | 171 |
|
167 |
db_device_dict = dict()
|
|
172 |
db_device_dict = {}
|
|
168 | 173 |
|
169 | 174 |
for device in devices_cursor: |
170 | 175 |
name = device['name'] |
... | ... | |
208 | 213 |
return change_in_devices |
209 | 214 |
|
210 | 215 |
|
211 |
def remove_dataset_database(dataset_name): |
|
216 |
def remove_dataset_database(dataset_name: str):
|
|
212 | 217 |
""" |
213 | 218 |
Removes dataset entries from database |
214 | 219 |
Args: |
... | ... | |
221 | 226 |
collection_datasets = mydb[MONGODB_DATASET_COLLECTION] |
222 | 227 |
|
223 | 228 |
collection_datasets.delete_one({"key-name": dataset_name}) |
224 |
print("Removing record from DATASETS collection")
|
|
229 |
print("Odstraňování záznamu z DATASETS kolekce")
|
|
225 | 230 |
|
226 | 231 |
# Retrieve list of all collections |
227 | 232 |
collections = mydb.list_collection_names() |
... | ... | |
230 | 235 |
for name in collections: |
231 | 236 |
if name.startswith(dataset_name): |
232 | 237 |
mydb[name].drop() |
233 |
print("Dropping: " + name)
|
|
238 |
print("Odstraňuji: " + name)
|
|
234 | 239 |
|
235 | 240 |
|
236 |
def reset_dataset_database(dataset_name): |
|
241 |
def reset_dataset_database(dataset_name: str):
|
|
237 | 242 |
""" |
238 | 243 |
Reset dataset in database |
239 | 244 |
- delete everything from except crawled links and mention in DATASETS collection |
... | ... | |
252 | 257 |
for name in collections: |
253 | 258 |
if pattern.match(name): |
254 | 259 |
mydb[name].drop() |
255 |
print("Dropping: " + name)
|
|
260 |
print("Odstraňuji: " + name)
|
|
256 | 261 |
|
257 | 262 |
database_record_logs.reset_ignore_set_loaded(dataset_name) |
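database_loader.py adds a DatabaseConnectionType = Dict[str, any] alias while annotating create_database_connection() as returning pymongo.database.Database; the alias is only a loose stand-in for the same object. A minimal sketch of the connection helper under the new annotation; the URI and database name are assumptions, they are not part of this diff:

```python
import pymongo
from pymongo.database import Database

MONGODB_URI = "mongodb://localhost:27017/"   # assumed default, not in this diff
MONGODB_DATABASE = "open-data-db"            # hypothetical database name


def create_database_connection() -> Database:
    """Creates a connection to mongoDB and returns a handle to the database."""
    client = pymongo.MongoClient(MONGODB_URI)
    return client[MONGODB_DATABASE]
```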
modules/crawler/Utilities/Database/database_record_logs.py | ||
---|---|---|
1 | 1 |
from Utilities.Database import database_loader |
2 |
|
|
2 |
from shared_types import StringSetType |
|
3 | 3 |
# mongodb collection with with already downloaded links |
4 | 4 |
MONGODB_DATASET_LINK_COLLECTION = "LINKS" |
5 | 5 |
# mongodb collection with with already processed files |
... | ... | |
10 | 10 |
MONGODB_DATASET_COLLECTION = "DATASETS" |
11 | 11 |
|
12 | 12 |
|
13 |
def load_ignore_set_links(dataset_name):
|
|
13 |
def load_ignore_set_links(dataset_name: str) -> StringSetType:
|
|
14 | 14 |
""" |
15 | 15 |
Loades from database links of already downloaded files by crawler |
16 | 16 |
|
... | ... | |
32 | 32 |
return ignore_set |
33 | 33 |
|
34 | 34 |
|
35 |
def update_ignore_set_links(dataset_name,link):
|
|
35 |
def update_ignore_set_links(dataset_name: str, link: str) -> None:
|
|
36 | 36 |
""" |
37 | 37 |
Adds links of newly crawled files to the database |
38 | 38 |
|
... | ... | |
44 | 44 |
|
45 | 45 |
my_col = connection[dataset_name + MONGODB_DATASET_LINK_COLLECTION] |
46 | 46 |
|
47 |
my_col.insert({ "name": link})
|
|
47 |
my_col.insert({"name": link}) |
|
48 | 48 |
|
49 | 49 |
|
50 |
def reset_ignore_set_links(dataset_name):
|
|
50 |
def reset_ignore_set_links(dataset_name: str) -> None:
|
|
51 | 51 |
""" |
52 | 52 |
Drops collection of already downloaded links |
53 | 53 |
|
... | ... | |
62 | 62 |
my_col.drop() |
63 | 63 |
|
64 | 64 |
|
65 |
|
|
66 |
def load_ignore_set_processed(dataset_name): |
|
65 |
def load_ignore_set_processed(dataset_name: str) -> StringSetType: |
|
67 | 66 |
""" |
68 | 67 |
Loads from database set of already processed files |
69 | 68 |
|
... | ... | |
85 | 84 |
return ignore_set |
86 | 85 |
|
87 | 86 |
|
88 |
def update_ignore_set_processed(dataset_name,filename):
|
|
87 |
def update_ignore_set_processed(dataset_name: str, filename: str) -> None:
|
|
89 | 88 |
""" |
90 | 89 |
Adds files of newly processed files to the database |
91 | 90 |
|
... | ... | |
97 | 96 |
|
98 | 97 |
my_col = connection[dataset_name + MONGODB_DATASET_PROCESSED_COLLECTION] |
99 | 98 |
|
100 |
my_col.insert({ "name": filename}) |
|
101 |
|
|
99 |
my_col.insert({"name": filename}) |
|
102 | 100 |
|
103 | 101 |
|
104 |
def reset_ignore_set_processed(dataset_name):
|
|
102 |
def reset_ignore_set_processed(dataset_name: str) -> None:
|
|
105 | 103 |
""" |
106 | 104 |
Drops collection of already processed files |
107 | 105 |
|
... | ... | |
116 | 114 |
my_col.drop() |
117 | 115 |
|
118 | 116 |
|
119 |
|
|
120 |
def load_ignore_set_loaded(dataset_name): |
|
117 |
def load_ignore_set_loaded(dataset_name: str) -> StringSetType: |
|
121 | 118 |
""" |
122 | 119 |
Loads from database set of already loaded files in database |
123 | 120 |
|
... | ... | |
139 | 136 |
return ignore_set |
140 | 137 |
|
141 | 138 |
|
142 |
|
|
143 |
def update_ignore_set_loaded(dataset_name,filename): |
|
139 |
def update_ignore_set_loaded(dataset_name: str, filename: str) -> None: |
|
144 | 140 |
""" |
145 | 141 |
Adds files of newly loaded files to the database |
146 | 142 |
|
... | ... | |
152 | 148 |
|
153 | 149 |
my_col = connection[dataset_name + MONGODB_DATASET_LOADED_COLLECTION] |
154 | 150 |
|
155 |
my_col.insert({ "name": filename})
|
|
151 |
my_col.insert({"name": filename}) |
|
156 | 152 |
|
157 | 153 |
|
158 |
def reset_ignore_set_loaded(dataset_name):
|
|
154 |
def reset_ignore_set_loaded(dataset_name: str) -> None:
|
|
159 | 155 |
""" |
160 | 156 |
Drops collection of already loaded files |
161 | 157 |
|
... | ... | |
170 | 166 |
my_col.drop() |
171 | 167 |
|
172 | 168 |
|
173 |
def load_updated(dataset_name):
|
|
169 |
def load_updated(dataset_name: str) -> int:
|
|
174 | 170 |
""" |
175 | 171 |
Loads value of (days from last update) from db |
176 | 172 |
|
... | ... | |
184 | 180 |
|
185 | 181 |
my_col = connection[MONGODB_DATASET_COLLECTION] |
186 | 182 |
|
187 |
data = my_col.find_one({'key-name': dataset_name},{'updated'}) |
|
183 |
data = my_col.find_one({'key-name': dataset_name}, {'updated'})
|
|
188 | 184 |
|
189 | 185 |
updated = int(data['updated']) |
190 | 186 |
|
191 | 187 |
return updated |
192 | 188 |
|
193 | 189 |
|
194 |
def update_updated(dataset_name,value):
|
|
190 |
def update_updated(dataset_name: str, value: int):
|
|
195 | 191 |
""" |
196 | 192 |
Updates value of (days from last update) in db |
197 | 193 |
|
... | ... | |
203 | 199 |
|
204 | 200 |
my_col = connection[MONGODB_DATASET_COLLECTION] |
205 | 201 |
|
206 |
myquery = { 'key-name': dataset_name }
|
|
207 |
new_values = { "$set": { "updated": value } }
|
|
202 |
myquery = {'key-name': dataset_name}
|
|
203 |
new_values = {"$set": {"updated": value}}
|
|
208 | 204 |
|
209 |
my_col.update_one(myquery,new_values) |
|
205 |
my_col.update_one(myquery, new_values) |
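database_record_logs.py mostly gains type hints and spacing fixes. The query/update pattern used by update_updated() after the reformat is shown below; note that the ignore-set writers still call my_col.insert(...), which PyMongo deprecated in favour of insert_one() (and removed in PyMongo 4), so the sketch uses the newer call. The connection setup and database name here are assumptions:

```python
import pymongo

MONGODB_DATASET_COLLECTION = "DATASETS"
MONGODB_DATASET_LINK_COLLECTION = "LINKS"

# assumed connection; in the project it comes from database_loader.create_database_connection()
connection = pymongo.MongoClient()["open-data-db"]


def update_updated(dataset_name: str, value: int) -> None:
    """Updates the days-since-last-update counter for one dataset."""
    my_col = connection[MONGODB_DATASET_COLLECTION]
    myquery = {'key-name': dataset_name}
    new_values = {"$set": {"updated": value}}
    my_col.update_one(myquery, new_values)


def update_ignore_set_links(dataset_name: str, link: str) -> None:
    """Remembers a newly crawled link; insert_one() replaces the deprecated insert()."""
    my_col = connection[dataset_name + MONGODB_DATASET_LINK_COLLECTION]
    my_col.insert_one({"name": link})
```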
modules/crawler/Utilities/configure_functions.py | ||
---|---|---|
1 | 1 |
import yaml |
2 | 2 |
import os |
3 |
from typing import Dict, Set |
|
4 |
from shared_types import StringSetType |
|
3 | 5 |
from Utilities.Database import database_record_logs |
4 | 6 |
from Utilities.helpers import should_skip |
5 | 7 |
|
... | ... | |
9 | 11 |
CONFIG_FILE_TYPE = ".yaml" |
10 | 12 |
|
11 | 13 |
|
12 |
def load_configuration(dataset_name):
|
|
14 |
def load_configuration(dataset_name: str) -> Dict[str, any]:
|
|
13 | 15 |
""" |
14 | 16 |
Loads yaml configuration file into memory |
15 | 17 |
|
... | ... | |
22 | 24 |
with open(CONFIG_FILES_PATH + dataset_name + CONFIG_FILE_TYPE, "r") as f: |
23 | 25 |
data = yaml.load(f, Loader=yaml.FullLoader) |
24 | 26 |
|
25 |
devices_dic = dict()
|
|
27 |
devices_dic = {}
|
|
26 | 28 |
|
27 | 29 |
if data["devices"] is not None: |
28 | 30 |
for item in data["devices"]: |
... | ... | |
33 | 35 |
return data |
34 | 36 |
|
35 | 37 |
|
36 |
def update_configuration(dataset_name, new_devices): |
|
38 |
def update_configuration(dataset_name: str, |
|
39 |
new_devices: StringSetType) -> None: |
|
37 | 40 |
""" |
38 | 41 |
Open dataset and appends new_devices to the end |
39 | 42 |
|
... | ... | |
53 | 56 |
file.write("\n") |
54 | 57 |
|
55 | 58 |
|
56 |
def check_if_there_is_a_config_file(dataset_name):
|
|
59 |
def check_if_there_is_a_config_file(dataset_name: str) -> bool:
|
|
57 | 60 |
""" |
58 | 61 |
Goes trough all config files (represeting valid dataset in database) |
59 | 62 |
and checks if dataset_name is there |
... | ... | |
75 | 78 |
return False |
76 | 79 |
|
77 | 80 |
|
78 |
def return_dictionary_of_valid_devices(devices): |
|
81 |
def return_dictionary_of_valid_devices( |
|
82 |
devices: Dict[str, any]) -> Dict[str, Dict[str, any]]: |
|
79 | 83 |
""" |
80 | 84 |
Iterates over all devices specified in config file |
81 | 85 |
|
... | ... | |
87 | 91 |
Returns: |
88 | 92 |
Dictonary containing only valid devices |
89 | 93 |
""" |
90 |
valid_devices = dict()
|
|
94 |
valid_devices = {}
|
|
91 | 95 |
|
92 | 96 |
for device in devices.keys(): |
93 | 97 |
if not should_skip(devices[device]): |
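configure_functions.py types load_configuration() and swaps dict() for {}. The loop over data["devices"] is elided in this diff, so the normalisation into devices_dic below is an assumption; the rest mirrors the visible lines:

```python
from typing import Any, Dict
import yaml

CONFIG_FILES_PATH = "DatasetConfigs/"
CONFIG_FILE_TYPE = ".yaml"


def load_configuration(dataset_name: str) -> Dict[str, Any]:
    """Loads the dataset's yaml configuration file into memory."""
    with open(CONFIG_FILES_PATH + dataset_name + CONFIG_FILE_TYPE, "r") as f:
        data = yaml.load(f, Loader=yaml.FullLoader)

    devices_dic = {}
    if data["devices"] is not None:
        for item in data["devices"]:
            devices_dic.update(item)   # assumed: each list item is a one-key mapping
    data["devices"] = devices_dic      # assumed normalisation; loop body elided in diff

    return data
```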
modules/crawler/Utilities/date_formating.py | ||
---|---|---|
1 |
def date_formatter(string_date):
|
|
1 |
def date_formatter(string_date: str) -> str:
|
|
2 | 2 |
""" |
3 | 3 |
|
4 | 4 |
Args: |
... | ... | |
21 | 21 |
|
22 | 22 |
string_date = srr |
23 | 23 |
|
24 |
return_date = string_date[6:10] + '-' + string_date[3:5] + '-' + string_date[:2] |
|
24 |
return_date = string_date[6:10] + '-' + string_date[ |
|
25 |
3:5] + '-' + string_date[:2] |
|
25 | 26 |
|
26 | 27 |
return return_date |
27 | 28 |
|
28 | 29 |
|
29 |
def date_time_formatter(string_date):
|
|
30 |
def date_time_formatter(string_date: str) -> str:
|
|
30 | 31 |
""" |
31 | 32 |
Converts one type of date format "dd.mm.yyyy hh.mm.ss" to date format YYYY-mm-dd-hh |
32 | 33 |
Args: |
... | ... | |
49 | 50 |
|
50 | 51 |
string_date = srr |
51 | 52 |
|
52 |
return_date = string_date[6:10] + '-' + string_date[3:5] + '-' + string_date[:2] + '-' + string_date[11:13] |
|
53 |
return_date = string_date[6:10] + '-' + string_date[ |
|
54 |
3:5] + '-' + string_date[:2] + '-' + string_date[11:13] |
|
53 | 55 |
|
54 | 56 |
return return_date |
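date_formating.py only rewraps the long slicing expressions. What the date_time_formatter() slices actually do, shown on a literal (the padding branch via srr that precedes them is skipped here):

```python
string_date = "08.04.2018 15.30.00"          # input format dd.mm.yyyy hh.mm.ss
return_date = string_date[6:10] + '-' + string_date[
    3:5] + '-' + string_date[:2] + '-' + string_date[11:13]
assert return_date == "2018-04-08-15"        # matches the YYYY-mm-dd-hh key format
```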
modules/crawler/Utilities/folder_processor.py | ||
---|---|---|
1 | 1 |
import os |
2 | 2 |
import zipfile |
3 |
from shared_types import ConfigType, StringSetType |
|
3 | 4 |
from Utilities.CSV import csv_utils |
4 | 5 |
from Utilities.Database import database_record_logs |
5 | 6 |
|
6 | 7 |
|
7 |
def list_of_all_new_files(ignore_set,path): |
|
8 |
def list_of_all_new_files(ignore_set: StringSetType, |
|
9 |
path: str) -> StringSetType: |
|
8 | 10 |
""" |
9 | 11 |
Get all files from directory and all files written in ignore.txt |
10 | 12 |
and return the difference |
... | ... | |
16 | 18 |
""" |
17 | 19 |
files_in_dir = os.listdir(path) |
18 | 20 |
|
19 |
|
|
20 | 21 |
return set(files_in_dir).difference(ignore_set) |
21 | 22 |
|
22 | 23 |
|
23 |
|
|
24 |
def get_devices_set(dataset_name,path): |
|
24 |
def get_devices_set(dataset_name: str, path: str) -> StringSetType: |
|
25 | 25 |
""" |
26 | 26 |
Goes trough every not loaded file(not contained in ProcessedData/ignore.txt) |
27 | 27 |
Extracts names from not loaded file which should be in first column |
... | ... | |
34 | 34 |
set of unique names contained in not loaded files |
35 | 35 |
""" |
36 | 36 |
ignore_set = database_record_logs.load_ignore_set_loaded(dataset_name) |
37 |
files_in_dir = list_of_all_new_files(ignore_set,path) |
|
37 |
files_in_dir = list_of_all_new_files(ignore_set, path)
|
|
38 | 38 |
|
39 | 39 |
unique_names = set() |
40 | 40 |
|
41 | 41 |
for file_path in files_in_dir: |
42 |
unique_names.update(csv_utils.get_unique_names_from_file(path+file_path, 0)) |
|
42 |
unique_names.update( |
|
43 |
csv_utils.get_unique_names_from_file(path + file_path, 0)) |
|
43 | 44 |
|
44 | 45 |
return unique_names |
45 | 46 |
|
46 | 47 |
|
47 |
def get_unknown_devices_set(config, devices): |
|
48 |
def get_unknown_devices_set(config: ConfigType, |
|
49 |
devices: StringSetType) -> StringSetType: |
|
48 | 50 |
""" |
49 | 51 |
Compares config and devices a return difference |
50 | 52 |
|
... | ... | |
61 | 63 |
return unknown_devices_set |
62 | 64 |
|
63 | 65 |
|
64 |
def unzip_all_csv_zip_files_in_folder(path):
|
|
66 |
def unzip_all_csv_zip_files_in_folder(path: str) -> None:
|
|
65 | 67 |
""" |
66 | 68 |
Load all files from directory and unzip those which end by .zip |
67 | 69 |
After unziping deletes the zip file |
... | ... | |
83 | 85 |
os.remove(zip_file) |
84 | 86 |
|
85 | 87 |
|
86 |
def clean_folder(path):
|
|
88 |
def clean_folder(path: str) -> None:
|
|
87 | 89 |
""" |
88 | 90 |
Deletes all files in folder |
89 | 91 |
|
... | ... | |
93 | 95 |
files = os.listdir(path) |
94 | 96 |
|
95 | 97 |
for file in files: |
96 |
os.remove(path+file) |
|
98 |
os.remove(path + file) |
modules/crawler/Utilities/helpers.py | ||
---|---|---|
4 | 4 |
UNKNOWN = "UNKNOWN!" |
5 | 5 |
|
6 | 6 |
|
7 |
def should_skip(device) -> bool: |
|
7 |
def should_skip(device: Dict[str, str]) -> bool:
|
|
8 | 8 |
return device['x'] == SKIP or device['y'] == SKIP or device[ |
9 | 9 |
'x'] == UNKNOWN or device['y'] == UNKNOWN |
10 | 10 |
|
11 | 11 |
|
12 |
def detect_change(first: Dict[str, str], second: Dict[str, str], compareKeys: [str]) -> bool: |
|
12 |
def detect_change(first: Dict[str, str], second: Dict[str, str], |
|
13 |
compareKeys: [str]) -> bool: |
|
13 | 14 |
"""Detects change between two dictonaries |
14 | 15 |
|
15 | 16 |
Args: |
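helpers.py gets a typed should_skip() and a wrapped detect_change() signature. should_skip() is fully visible in the diff; detect_change()'s body is not, so the comparison below is an assumption, and the SKIP sentinel is likewise assumed (only UNKNOWN appears in this hunk). Note also that compareKeys: [str] is a literal list annotation rather than typing.List[str]:

```python
from typing import Dict, List

SKIP = "SKIP"          # assumed sentinel; only UNKNOWN is visible in this hunk
UNKNOWN = "UNKNOWN!"


def should_skip(device: Dict[str, str]) -> bool:
    # skip devices without usable coordinates
    return device['x'] == SKIP or device['y'] == SKIP or device[
        'x'] == UNKNOWN or device['y'] == UNKNOWN


def detect_change(first: Dict[str, str], second: Dict[str, str],
                  compareKeys: List[str]) -> bool:
    """Detects a change between two dictionaries on the given keys (assumed body)."""
    return any(first.get(key) != second.get(key) for key in compareKeys)
```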
modules/crawler/crone_update_script.py | ||
---|---|---|
5 | 5 |
CONFIG_FILES_PATH = "DatasetConfigs/" |
6 | 6 |
|
7 | 7 |
|
8 |
def run_pipeline_for_all_datasets(): |
|
8 |
def run_pipeline_for_all_datasets() -> None:
|
|
9 | 9 |
""" |
10 | 10 |
Runs whole DataScript pipeline for every dataset that has existing configuration file |
11 | 11 |
""" |
... | ... | |
16 | 16 |
pipeline.run_full_pipeline_crone(name) |
17 | 17 |
|
18 | 18 |
|
19 |
run_pipeline_for_all_datasets() |
|
19 |
def main() -> None: |
|
20 |
run_pipeline_for_all_datasets() |
|
21 |
|
|
22 |
|
|
23 |
if __name__ == "__main__": |
|
24 |
main() |
modules/crawler/docker_prepare_structure.py | ||
---|---|---|
10 | 10 |
CRAWLER_LOGS_PATH = "CrawlerLogs/" |
11 | 11 |
|
12 | 12 |
|
13 |
def prepare_strucure_for_all_datasets(): |
|
13 |
def prepare_strucure_for_all_datasets() -> None:
|
|
14 | 14 |
""" |
15 | 15 |
Prepares folders that are necessery but does not contain code so they are excluded from gitlab by gitignore |
16 | 16 |
""" |
17 | 17 |
|
18 |
if not os.path.isdir(CRAWLED_DATA_PATH) :
|
|
18 |
if not os.path.isdir(CRAWLED_DATA_PATH):
|
|
19 | 19 |
try: |
20 | 20 |
os.mkdir(CRAWLED_DATA_PATH) |
21 | 21 |
except os.error as e: |
22 | 22 |
print(e) |
23 |
print("Creation of the directory %s failed" % CRAWLED_DATA_PATH)
|
|
23 |
print("Nelze vytvořit adresář %s" % CRAWLED_DATA_PATH)
|
|
24 | 24 |
|
25 |
if not os.path.isdir(PROCESSED_DATA_PATH) :
|
|
25 |
if not os.path.isdir(PROCESSED_DATA_PATH):
|
|
26 | 26 |
try: |
27 | 27 |
os.mkdir(PROCESSED_DATA_PATH) |
28 | 28 |
except os.error as e: |
29 | 29 |
print(e) |
30 |
print("Creation of the directory %s failed" % PROCESSED_DATA_PATH)
|
|
31 |
|
|
32 |
if not os.path.isdir(CRAWLER_LOGS_PATH) :
|
|
30 |
print("Nelze vytvořit adresář %s" % PROCESSED_DATA_PATH)
|
|
31 |
|
|
32 |
if not os.path.isdir(CRAWLER_LOGS_PATH):
|
|
33 | 33 |
try: |
34 | 34 |
os.mkdir(CRAWLER_LOGS_PATH) |
35 | 35 |
except os.error as e: |
36 | 36 |
print(e) |
37 |
print("Creation of the directory %s failed" % PROCESSED_DATA_PATH) |
|
38 |
|
|
37 |
print("Nelze vytvořit adresář %s" % CRAWLER_LOGS_PATH) |
|
39 | 38 |
|
40 | 39 |
files_in_dir = os.listdir(CONFIG_FILES_PATH) |
41 | 40 |
|
... | ... | |
44 | 43 |
prepare_structure(name[0]) |
45 | 44 |
|
46 | 45 |
|
47 |
def prepare_structure(dataset_name):
|
|
46 |
def prepare_structure(dataset_name: str) -> None:
|
|
48 | 47 |
""" |
49 | 48 |
Create folder for every dataset in newly created folder for processed and crawled data |
50 | 49 |
""" |
51 | 50 |
|
52 |
path = CRAWLED_DATA_PATH + dataset_name
|
|
53 |
if not os.path.isdir(path) :
|
|
51 |
path = CRAWLED_DATA_PATH + dataset_name |
|
52 |
if not os.path.isdir(path):
|
|
54 | 53 |
os.mkdir(path) |
55 | 54 |
|
56 |
path = PROCESSED_DATA_PATH + dataset_name
|
|
57 |
if not os.path.isdir(path):
|
|
55 |
path = PROCESSED_DATA_PATH + dataset_name |
|
56 |
if not os.path.isdir(path): |
|
58 | 57 |
os.mkdir(PROCESSED_DATA_PATH + dataset_name) |
59 | 58 |
|
60 | 59 |
|
61 |
print("Inicializuji počáteční strukturu pro stažená a zpracovaná data") |
|
62 |
prepare_strucure_for_all_datasets() |
|
60 |
def main() -> None: |
|
61 |
print("Inicializuji počáteční strukturu pro stažená a zpracovaná data") |
|
62 |
prepare_strucure_for_all_datasets() |
|
63 |
|
|
64 |
|
|
65 |
if __name__ == "__main__": |
|
66 |
main() |
modules/crawler/force_update_datasets.py | ||
---|---|---|
1 |
from Utilities import configure_functions |
|
1 | 2 |
import pipeline |
2 | 3 |
import os |
3 |
from Utilities import configure_functions
|
|
4 |
import sys
|
|
4 | 5 |
|
5 | 6 |
# Path to configuration files |
6 | 7 |
CONFIG_FILES_PATH = "DatasetConfigs/" |
8 |
WRONG_ARG_MSG = "Do argumentu funkce dejte jméno Datasetu, který chcete aktualizovat (pokud všechny zadejte 'ALL'):\n" |
|
9 |
DATASET_NOT_FOUND_MSG = "Tento dataset v architektuře neexistuje" |
|
7 | 10 |
|
8 | 11 |
|
9 |
def run_pipeline_for_one_datasets(dataset_name):
|
|
12 |
def run_pipeline_for_one_datasets(dataset_name: str) -> None:
|
|
10 | 13 |
print("Probíhá update datasetu " + dataset_name) |
11 | 14 |
pipeline.run_full_pipeline(dataset_name) |
12 | 15 |
|
13 | 16 |
|
14 |
def run_pipeline_for_all_datasets(): |
|
17 |
def run_pipeline_for_all_datasets() -> None:
|
|
15 | 18 |
""" |
16 | 19 |
Runs whole DataScript pipeline for every dataset that has existing configuration file |
17 | 20 |
""" |
... | ... | |
23 | 26 |
pipeline.run_full_pipeline(name) |
24 | 27 |
|
25 | 28 |
|
26 |
print("Zadejte jméno Datasetu který chcete updatovat (pokud všechny zadejte '-ALL'):\n") |
|
29 |
def main() -> None: |
|
30 |
if len(sys.argv) > 1: |
|
31 |
dataset_name = sys.argv[1].upper() |
|
32 |
if dataset_name == "ALL": |
|
33 |
run_pipeline_for_all_datasets() |
|
34 |
else: |
|
35 |
test = configure_functions.check_if_there_is_a_config_file( |
|
36 |
dataset_name) |
|
37 |
if test == True: |
|
38 |
run_pipeline_for_one_datasets(dataset_name) |
|
39 |
else: |
|
40 |
print(DATASET_NOT_FOUND_MSG) |
|
41 |
else: |
|
42 |
print(WRONG_ARG_MSG) |
|
27 | 43 |
|
28 |
dataset_name = input().upper() |
|
29 | 44 |
|
30 |
if dataset_name == '-ALL': |
|
31 |
run_pipeline_for_all_datasets() |
|
32 |
else: |
|
33 |
test = configure_functions.check_if_there_is_a_config_file(dataset_name) |
|
34 |
if test == True: |
|
35 |
run_pipeline_for_one_datasets(dataset_name) |
|
36 |
else: |
|
37 |
print("Tento dataset v architektuře neexistuje") |
|
45 |
if __name__ == "__main__": |
|
46 |
main() |
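force_update_datasets.py switches from an interactive input() prompt to a command-line argument and gains a proper entry-point guard. A condensed, self-contained version of the new flow; the body of run_pipeline_for_all_datasets() is elided in the diff, so its loop here is an assumption, and `if test == True:` is simplified to a plain truth test.

```python
import os
import sys

import pipeline
from Utilities import configure_functions

CONFIG_FILES_PATH = "DatasetConfigs/"
WRONG_ARG_MSG = ("Do argumentu funkce dejte jméno Datasetu, který chcete aktualizovat "
                 "(pokud všechny zadejte 'ALL'):\n")
DATASET_NOT_FOUND_MSG = "Tento dataset v architektuře neexistuje"


def run_pipeline_for_one_datasets(dataset_name: str) -> None:
    print("Probíhá update datasetu " + dataset_name)
    pipeline.run_full_pipeline(dataset_name)


def run_pipeline_for_all_datasets() -> None:
    # loop body is elided in the diff; assumed to walk DatasetConfigs/ as before
    for file in os.listdir(CONFIG_FILES_PATH):
        pipeline.run_full_pipeline(file.split('.')[0])


def main() -> None:
    if len(sys.argv) > 1:
        dataset_name = sys.argv[1].upper()
        if dataset_name == "ALL":
            run_pipeline_for_all_datasets()
        elif configure_functions.check_if_there_is_a_config_file(dataset_name):
            run_pipeline_for_one_datasets(dataset_name)
        else:
            print(DATASET_NOT_FOUND_MSG)
    else:
        print(WRONG_ARG_MSG)


if __name__ == "__main__":
    main()   # e.g. `python force_update_datasets.py JIS` or `python force_update_datasets.py ALL`
```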
modules/crawler/fully_clean_database.py | ||
---|---|---|
1 | 1 |
from Utilities.Database import database_loader |
2 | 2 |
|
3 | 3 |
|
4 |
#TODO: smazat vsechny pomocny soubory po cisteni databaze + prejmenovat |
|
5 |
def clean_database(): |
|
4 |
def clean_database() -> None: |
|
6 | 5 |
""" |
7 | 6 |
Drops every collection in database |
8 | 7 |
""" |
... | ... | |
18 | 17 |
mydb[name].drop() |
19 | 18 |
|
20 | 19 |
|
21 |
print('Data z databáze budou smazána:') |
|
22 |
clean_database() |
|
20 |
def main() -> None: |
|
21 |
print('Data z databáze budou smazána:') |
|
22 |
clean_database() |
|
23 |
|
|
24 |
|
|
25 |
if __name__ == "__main__": |
|
26 |
main() |
modules/crawler/pipeline.py | ||
---|---|---|
1 | 1 |
from Utilities import folder_processor, configure_functions |
2 | 2 |
from Utilities.Database import database_loader, database_record_logs |
3 | 3 |
from Utilities.CSV import csv_utils |
4 |
from shared_types import ConfigType |
|
4 | 5 |
import os |
5 | 6 |
import pymongo |
6 | 7 |
|
... | ... | |
20 | 21 |
|
21 | 22 |
#logger |
22 | 23 |
logging.basicConfig(filename=CRAWLER_LOGS_PATH + 'Applicationlog-' + |
23 |
date.today().strftime("%b-%Y") + '.log',
|
|
24 |
level=logging.INFO,
|
|
25 |
format='%(asctime)s %(message)s')
|
|
24 |
date.today().strftime("%b-%Y") + '.log', |
|
25 |
level=logging.INFO, |
|
26 |
format='%(asctime)s %(message)s') |
|
26 | 27 |
|
27 | 28 |
|
28 |
def check_last_update(config):
|
|
29 |
def check_last_update(config: ConfigType) -> bool:
|
|
29 | 30 |
""" |
30 | 31 |
Loads integer from updated.txt in CrawlerLogs/"dataset_name" |
31 | 32 |
representing number of days from last update if number equals |
... | ... | |
56 | 57 |
return False |
57 | 58 |
|
58 | 59 |
|
59 |
def crawl_data(config):
|
|
60 |
def crawl_data(config: ConfigType) -> None:
|
|
60 | 61 |
""" |
61 | 62 |
Imports dataset crawler in DatasetCrawler/"dataset_name"_crawler.py |
62 | 63 |
runs crawler. |
... | ... | |
73 | 74 |
dataset_name += '/' |
74 | 75 |
|
75 | 76 |
|
76 |
def process_data(config):
|
|
77 |
def process_data(config: ConfigType) -> None:
|
|
77 | 78 |
""" |
78 | 79 |
Goes trough every not processed file(list of processed files is saved in databse) |
79 | 80 |
Imports dataset processor in DatasetProcessing/"dataset_name"_processor.py |
... | ... | |
100 | 101 |
path = CRAWLED_DATA_PATH + dataset_path + not_processed_file |
101 | 102 |
date_dic = process_file_func(path) |
102 | 103 |
csv_utils.export_data_to_csv(path, date_dic) |
104 |
print("Vytvářím: " + not_processed_file) |
|
103 | 105 |
database_record_logs.update_ignore_set_processed( |
104 | 106 |
dataset_name, not_processed_file) |
105 | 107 |
|
... | ... | |
107 | 109 |
str(len(not_processed_files)) + " newly crawled files") |
108 | 110 |
|
109 | 111 |
|
110 |
def process_data_crone(config):
|
|
112 |
def process_data_crone(config: ConfigType) -> None:
|
|
111 | 113 |
""" |
112 | 114 |
Goes trough every not processed file(list of processed files is saved in database) |
113 | 115 |
Imports dataset processor in DatasetProcessing/"dataset_name"_processor.py |
... | ... | |
146 | 148 |
str(len(not_processed_files)) + " newly crawled files") |
147 | 149 |
|
148 | 150 |
|
149 |
def validate_process_data(config):
|
|
151 |
def validate_process_data(config: ConfigType) -> bool:
|
|
150 | 152 |
""" |
151 | 153 |
Function goes through newly processed data and checks theirs status |
152 | 154 |
|
... | ... | |
186 | 188 |
return True |
187 | 189 |
|
188 | 190 |
|
189 |
def load_data_to_database(config):
|
|
191 |
def load_data_to_database(config: ConfigType) -> None:
|
|
190 | 192 |
""" |
191 | 193 |
Goes trough every not loaded file(list of loaded files is saved in database) |
192 | 194 |
loads data appends coordination from configurations |
... | ... | |
207 | 209 |
changes_in_devices = database_loader.update_devices_collection(config) |
208 | 210 |
|
209 | 211 |
if changes_in_devices == True: |
210 |
logging.info(
|
|
211 |
dataset_name +
|
|
212 |
" contains changes in devices configuration. Deleting old data and preparing new"
|
|
213 |
) |
|
212 |
logg_string = dataset_name + " contains changes in devices configuration. Deleting old data and preparing new"
|
|
213 |
logg_string_cs = dataset_name + " obsahuje změny v konfiguračním souboru. Probíha odstraňování starých dat a připravení nových."
|
|
214 |
logging.info(logg_string)
|
|
215 |
print(logg_string_cs)
|
|
214 | 216 |
database_loader.reset_dataset_database(dataset_name) |
215 | 217 |
|
216 | 218 |
# get all unprocessed files from dataset |
... | ... | |
230 | 232 |
database_record_logs.update_ignore_set_loaded(dataset_name, |
231 | 233 |
not_loaded_file) |
232 | 234 |
|
233 |
logging.info(dataset_name + " has loaded to database " + |
|
234 |
str(len(not_loaded_files)) + " newly processed files.") |
|
235 |
logg_string = dataset_name + " has loaded to database " + str( |
|
236 |
len(not_loaded_files)) + " newly processed files." |
|
237 |
logg_string_cs = dataset_name + " načetl " + str( |
|
238 |
len(not_loaded_files)) + " nových zpracovaných souborů \n" |
|
239 |
|
|
240 |
logging.info(logg_string) |
|
241 |
print(logg_string_cs) |
|
235 | 242 |
|
236 | 243 |
client = pymongo.MongoClient() |
237 | 244 |
client.close() |
238 | 245 |
|
239 | 246 |
|
240 |
def load_data_to_database_crone(config):
|
|
247 |
def load_data_to_database_crone(config: ConfigType) -> None:
|
|
241 | 248 |
""" |
242 | 249 |
Goes trough every not loaded file(list of loaded files is saved in database) |
243 | 250 |
loads data appends coordination from configurations |
... | ... | |
280 | 287 |
client.close() |
281 | 288 |
|
282 | 289 |
|
283 |
def run_full_pipeline(dataset_name):
|
|
290 |
def run_full_pipeline(dataset_name: str) -> None:
|
|
284 | 291 |
""" |
285 | 292 |
Loads config file and starts full pipeline |
286 | 293 |
-crawl data |
... | ... | |
292 | 299 |
""" |
293 | 300 |
logging.info("Starting pipeline for dataset " + dataset_name) |
294 | 301 |
print("Zpracovávám dataset " + dataset_name + |
295 |
" průběh lze sledovat v logu umístěném v in CrawlerLogs folder")
|
|
302 |
", průběh lze sledovat v logu umístěném v adresáři CrawlerLogs")
|
|
296 | 303 |
|
297 | 304 |
config = configure_functions.load_configuration(dataset_name) |
298 | 305 |
crawl_data(config) |
... | ... | |
304 | 311 |
load_data_to_database(config) |
305 | 312 |
|
306 | 313 |
|
307 |
def run_full_pipeline_crone(dataset_name):
|
|
314 |
def run_full_pipeline_crone(dataset_name: str) -> None:
|
|
308 | 315 |
""" |
309 | 316 |
Loads config file and starts full pipeline |
310 | 317 |
-crawl data |
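pipeline.py's load step now logs in English while printing Czech progress messages for whoever runs the pipeline interactively. A minimal illustration of that split; the helper name report_loaded is only for this sketch, in the revision the two strings are built inline in load_data_to_database():

```python
import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')


def report_loaded(dataset_name: str, count: int) -> None:
    # English message goes to the log file, Czech counterpart to stdout
    logg_string = dataset_name + " has loaded to database " + str(
        count) + " newly processed files."
    logg_string_cs = dataset_name + " načetl " + str(
        count) + " nových zpracovaných souborů \n"

    logging.info(logg_string)
    print(logg_string_cs)
```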
modules/crawler/prepare_new_dataset.py | ||
---|---|---|
1 | 1 |
import os |
2 |
|
|
3 | 2 |
# Path to crawled data |
4 | 3 |
CRAWLED_DATA_PATH = "CrawledData/" |
5 | 4 |
# Path to processed data |
... | ... | |
14 | 13 |
DEFAULT_COLOR = "#000000" |
15 | 14 |
|
16 | 15 |
|
17 |
def create_default_config_file(dataset_name: str): |
|
16 |
def create_default_config_file(dataset_name: str) -> None:
|
|
18 | 17 |
""" |
19 | 18 |
Creates default config file |
20 | 19 |
|
... | ... | |
22 | 21 |
dataset_name: Name of newly created dataset |
23 | 22 |
""" |
24 | 23 |
with open(CONFIG_FILES_PATH + "/" + dataset_name + ".yaml", "w") as file: |
25 |
file.write("# jmeno datasetu, pod kterym bude zobrazen v aplikaci\n")
|
|
24 |
file.write("# Name of the dataset inside the application\n")
|
|
26 | 25 |
file.write("display-name: " + dataset_name + "\n") |
27 | 26 |
file.write( |
28 |
"# jednoslovný název datasetu, pod kterym bude reprezentovana v architektuře\n")
|
|
29 |
file.write("display-color: " + DEFAULT_COLOR + "\n")
|
|
27 |
"# Color for the dataset in a hex value (default value #000000)\n")
|
|
28 |
file.write(f'display-color: \'{DEFAULT_COLOR}\' \n')
|
|
30 | 29 |
file.write( |
31 |
"# barva pro tento dataset v hexadecimální hodnotě (#000000)\n") |
|
30 |
"# One word dataset name (structure of all modules will be affected by this)\n" |
|
31 |
) |
|
32 | 32 |
file.write("dataset-name: " + dataset_name + "\n") |
33 |
file.write("# root slozka, ktera obsahuje odkazy na dataset\n") |
|
34 |
file.write("url: ZDE VLOZTE URL\n") |
|
33 |
file.write("# Url for the source of this dataset\n") |
|
34 |
file.write("url: ENTER URL HERE\n") |
|
35 |
file.write( |
|
36 |
"# Optional parameter which specifies a pattern of the datasets name\n" |
|
37 |
) |
|
38 |
file.write( |
|
39 |
"# Example: DATASET_NAME_[0-9][0-9]_[0-9][0-9][0-9][0-9].zip\n") |
|
40 |
file.write( |
|
41 |
"# - DATASET_NAME_01_2020.zip where '01_2020' specifies date in this dataset\n" |
|
42 |
) |
|
43 |
file.write("regex: ENTER REGEX HERE\n") |
|
35 | 44 |
file.write( |
36 |
"# volitelny parameter, ktery specifikuje vzor jmrna datasetu, ktera se budou stahovat\n") |
|
37 |
file.write("regex: ZDE VLOZTE REGEX\n") |
|
38 |
file.write("# volitelny parametr, ktery udava jak casto se budou hledat nove datasety, pokud prazdne, " |
|
39 |
"tak defaultni hodnota (dny)\n") |
|
40 |
file.write("update-period: ZDE VLOZTE HODNOTU\n") |
|
41 |
file.write("# pozice jednotlivych zarizeni, ktera jsou v datasetu\n") |
|
45 |
"# Optional parameter which specifies the way of searching new datasets (if empty the period is set to every day)\n" |
|
46 |
) |
|
47 |
file.write("update-period: ENTER UPDATE PERIOD HERE\n") |
|
48 |
file.write("# Coordinates of every datasets device (entinty)\n") |
|
42 | 49 |
file.write("devices:\n") |
43 | 50 |
|
44 | 51 |
|
45 |
def create_default_processor(dataset_name):
|
|
52 |
def create_default_processor(dataset_name: str) -> None:
|
|
46 | 53 |
""" |
47 | 54 |
Creates default processor for dataset |
48 | 55 |
|
49 | 56 |
Args: |
50 | 57 |
dataset_name: Name of newly created dataset |
51 | 58 |
""" |
52 |
with open(PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "_processor.py", "w") as file: |
|
53 |
file.write("from Utilities.CSV import csv_data_line") |
|
59 |
with open(PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "_processor.py", |
|
60 |
"w") as file: |
|
61 |
file.write("from Utilities.CSV import csv_data_line\n") |
|
62 |
file.write("from shared_types import DateDict") |
|
54 | 63 |
file.write("\n") |
55 | 64 |
file.write("\n") |
56 |
file.write("def process_file(filename):\n")
|
|
65 |
file.write("def process_file(filename: str) -> DateDict:\n")
|
|
57 | 66 |
file.write(" \"\"\"\n") |
58 | 67 |
file.write( |
59 |
" Method that take path to crawled file and outputs date dictionary:\n") |
|
68 |
" Method that takes the path to crawled file and outputs date dictionary:\n" |
|
69 |
) |
|
60 | 70 |
file.write( |
61 |
" Date dictionary is a dictionary where keys are dates in format YYYY-mm-dd-hh (2018-04-08-15)\n") |
|
71 |
" Date dictionary is a dictionary where keys are dates in format YYYY-mm-dd-hh (2018-04-08-15)\n" |
|
72 |
) |
|
62 | 73 |
file.write( |
63 |
" and value is dictionary where keys are devices (specified in configuration file)\n") |
|
74 |
" and value is dictionary where keys are devices (specified in configuration file)\n" |
|
75 |
) |
|
64 | 76 |
file.write( |
65 |
" and value is CSVDataLine.csv_data_line with device,date and occurrence\n") |
|
77 |
" and value is CSVDataLine.csv_data_line with device,date and occurrence\n" |
|
78 |
) |
|
66 | 79 |
file.write("\n") |
67 | 80 |
file.write(" Args:\n") |
68 |
file.write(" filename: name of processed file\n") |
|
81 |
file.write(" filename: name of the processed file\n")
|
|
69 | 82 |
file.write("\n") |
70 | 83 |
file.write(" Returns:\n") |
71 | 84 |
file.write(" None if not implemented\n") |
72 | 85 |
file.write(" date_dict when implemented\n") |
73 | 86 |
file.write(" \"\"\"\n") |
74 |
file.write(" date_dict = dict()\n")
|
|
87 |
file.write(" date_dict: DateDict = {}\n")
|
|
75 | 88 |
file.write("\n") |
76 | 89 |
file.write(" #with open(filename, \"r\") as file:\n") |
77 | 90 |
file.write( |
78 |
" print(\"You must implements process_file method first!\")\n") |
|
79 |
file.write(" return None\n") |
|
91 |
" print(\"You must implement the process_file method first!\")\n" |
|
92 |
) |
|
93 |
file.write(" return date_dict\n") |
|
80 | 94 |
|
81 | 95 |
|
82 |
def create_default_crawler(dataset_name):
|
|
96 |
def create_default_crawler(dataset_name: str) -> None:
|
|
83 | 97 |
""" |
84 | 98 |
Creates default crawler for dataset |
85 | 99 |
|
... | ... | |
87 | 101 |
dataset_name: Name of newly created dataset |
88 | 102 |
""" |
89 | 103 |
|
90 |
with open(CRAWLER_PROGRAM_PATH + "/" + dataset_name + "_crawler.py", "w") as file: |
|
104 |
with open(CRAWLER_PROGRAM_PATH + "/" + dataset_name + "_crawler.py", |
|
105 |
"w") as file: |
|
106 |
file.write("from shared_types import ConfigType\n") |
|
91 | 107 |
file.write("# Path to crawled data\n") |
92 |
file.write("CRAWLED_DATA_PATH = \"CrawledData/\"\n")
|
|
108 |
file.write(f'CRAWLED_DATA_PATH = "{CRAWLED_DATA_PATH}" \n')
|
|
93 | 109 |
file.write("\n") |
94 | 110 |
file.write("\n") |
95 |
file.write("def crawl(config):\n") |
|
111 |
file.write("def crawl(config: ConfigType):\n")
|
|
96 | 112 |
file.write(" \"\"\"\n") |
97 | 113 |
file.write( |
98 |
" Implement crawl method that downloads new data to path_for_files\n") |
|
114 |
" Implementation the crawl method which downloads new data to the path_for_files\n" |
|
115 |
) |
|
99 | 116 |
file.write(" For keeping the project structure\n") |
100 | 117 |
file.write(" url , regex, and dataset_name from config\n") |
101 | 118 |
file.write( |
102 |
" You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py\n") |
|
119 |
" You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py\n" |
|
120 |
) |
|
103 | 121 |
file.write("\n") |
104 | 122 |
file.write(" Args:\n") |
105 | 123 |
file.write(" config: loaded configuration file of dataset\n") |
... | ... | |
109 | 127 |
file.write(" regex = config['regex']\n") |
110 | 128 |
file.write( |
111 | 129 |
" path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'\n") |
112 |
file.write(" print(\"You must implements Crawl method first!\")\n") |
|
130 |
file.write( |
|
131 |
" print(\"Není implementován crawler pro získávání dat!\")\n") |
|
113 | 132 |
|
114 | 133 |
|
115 |
def prepare_dataset_structure(dataset_name):
|
|
134 |
def prepare_dataset_structure(dataset_name: str) -> None:
|
|
116 | 135 |
""" |
117 | 136 |
Prepares folders for new dataset |
118 | 137 |
Args: |
... | ... | |
120 | 139 |
""" |
121 | 140 |
|
122 | 141 |
# create folder for crawled data |
123 |
path = CRAWLED_DATA_PATH+dataset_name
|
|
142 |
path = CRAWLED_DATA_PATH + dataset_name
|
|
124 | 143 |
try: |
125 | 144 |
os.mkdir(path) |
126 | 145 |
except os.error as e: |
... | ... | |
132 | 151 |
try: |
133 | 152 |
os.mkdir(path) |
134 | 153 |
except OSError: |
135 |
print("Creation of the directory %s failed" % path)
|
|
154 |
print("Nelze vytvořit adresář %s" % path)
|
|
136 | 155 |
|
137 | 156 |
create_default_crawler(dataset_name) |
138 | 157 |
create_default_processor(dataset_name) |
139 | 158 |
create_default_config_file(dataset_name) |
140 | 159 |
|
141 | 160 |
|
142 |
print("Zadejte jméno nového datasetu:\n") |
|
161 |
def main() -> None: |
|
162 |
print("Zadejte jméno nového datasetu:\n") |
|
163 |
dataset_name = input().upper() |
|
164 |
|
|
165 |
if dataset_name.isalpha(): |
|
166 |
prepare_dataset_structure(dataset_name) |
|
167 |
print("Architektura vytvořena \n") |
|
168 |
else: |
|
169 |
print("Jméno musí obsahovat pouze písmena z abecedy (bez mezer)\n") |
|
143 | 170 |
|
144 |
dataset_name = input().upper() |
|
145 | 171 |
|
146 |
if dataset_name.isalpha(): |
|
147 |
prepare_dataset_structure(dataset_name) |
|
148 |
print("Architektura vytvořena \n") |
|
149 |
else: |
|
150 |
print("Jméno musí obsahovat pouze písmena z abecedy (bez mezer)\n") |
|
172 |
if __name__ == "__main__": |
|
173 |
main() |
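prepare_new_dataset.py now generates the dataset skeleton with English comments and quotes the default display-color. For orientation, the YAML that create_default_config_file() writes for a hypothetical run with input KOLOBEZKY would look roughly like the template below (embedded as a Python string to stay in one language; the 'entinty' typo from the generated comment is corrected here):

```python
GENERATED_CONFIG_TEMPLATE = """\
# Name of the dataset inside the application
display-name: KOLOBEZKY
# Color for the dataset in a hex value (default value #000000)
display-color: '#000000'
# One word dataset name (structure of all modules will be affected by this)
dataset-name: KOLOBEZKY
# Url for the source of this dataset
url: ENTER URL HERE
# Optional parameter which specifies a pattern of the datasets name
# Example: DATASET_NAME_[0-9][0-9]_[0-9][0-9][0-9][0-9].zip
# - DATASET_NAME_01_2020.zip where '01_2020' specifies date in this dataset
regex: ENTER REGEX HERE
# Optional parameter which specifies the way of searching new datasets (if empty the period is set to every day)
update-period: ENTER UPDATE PERIOD HERE
# Coordinates of every datasets device (entity)
devices:
"""
```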
modules/crawler/python.code-workspace | ||
---|---|---|
12 | 12 |
"python.linting.pylintEnabled": true, |
13 | 13 |
"python.linting.enabled": true, |
14 | 14 |
"python.linting.pylintPath": "pylint", |
15 |
"python.pythonPath": "/usr/local/bin/python", |
|
16 | 15 |
"python.formatting.provider": "yapf", |
17 | 16 |
}, |
18 | 17 |
"extensions": { |
Re #8193 - refactoring crawler