
Revision af7609b5

Added by Tomáš Ballák more than 3 years ago

Re #8193 - refactoring crawler

Show differences:

modules/crawler/DatasetCrawler/JIS_crawler.py
1 1
from Utilities import folder_processor
2 2
from Utilities.Crawler import basic_crawler_functions
3

  
3
from shared_types import ConfigType
4 4
# Path to crawled data
5 5
CRAWLED_DATA_PATH = "CrawledData/"
6 6

  
7 7

  
8
def crawl(config):
8
def crawl(config: ConfigType):
9 9
    """
10 10
    Implement crawl method that downloads new data to path_for_files
11 11
    For keeping the project structure
......
21 21
    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'
22 22

  
23 23
    first_level_links = basic_crawler_functions.get_all_links(url)
24
    filtered_first_level_links = basic_crawler_functions.filter_links(first_level_links, "^OD_ZCU")
25
    absolute_first_level_links = basic_crawler_functions.create_absolute_links(filtered_first_level_links, url)
24
    filtered_first_level_links = basic_crawler_functions.filter_links(
25
        first_level_links, "^OD_ZCU")
26
    absolute_first_level_links = basic_crawler_functions.create_absolute_links(
27
        filtered_first_level_links, url)
26 28

  
27 29
    files = []
28 30

  
29 31
    for link in absolute_first_level_links:
30 32
        second_level_links = basic_crawler_functions.get_all_links(link)
31
        filtered_second_level_links = basic_crawler_functions.filter_links(second_level_links, regex)
32
        absolute_second_level_links = basic_crawler_functions.create_absolute_links(filtered_second_level_links, link)
33
        filtered_second_level_links = basic_crawler_functions.filter_links(
34
            second_level_links, regex)
35
        absolute_second_level_links = basic_crawler_functions.create_absolute_links(
36
            filtered_second_level_links, link)
33 37

  
34 38
        for file_link in absolute_second_level_links:
35 39
            files.append(file_link)
36 40

  
37
    files = basic_crawler_functions.remove_downloaded_links(files, dataset_name)
41
    files = basic_crawler_functions.remove_downloaded_links(
42
        files, dataset_name)
38 43

  
39 44
    for file in files:
40 45
        basic_crawler_functions.download_file_from_url(file, dataset_name)
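
Note (not part of the diff itself): the refactoring imports ConfigType, DateDict and StringSetType from a new shared_types module whose source does not appear in this truncated diff. A minimal sketch of what such a module could define, inferred from how the aliases are used in the annotated signatures below; the exact definitions are an assumption:

# shared_types.py -- hypothetical sketch, not shown in this revision's diff
from typing import Any, Dict, Set

# Parsed DatasetConfigs/<name>.yaml content passed to crawl() and the pipeline
ConfigType = Dict[str, Any]

# Keys are dates "YYYY-mm-dd-hh"; values map device names to CSVDataLine objects
DateDict = Dict[str, Dict[str, Any]]

# Set of unique strings (links, device names, file names)
StringSetType = Set[str]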
modules/crawler/DatasetCrawler/KOLOBEZKY_crawler.py
1 1
from Utilities import folder_processor
2 2
from Utilities.Crawler import basic_crawler_functions
3

  
3
from shared_types import ConfigType
4 4
# Path to crawled data
5 5
CRAWLED_DATA_PATH = "CrawledData/"
6 6

  
7 7

  
8
def crawl(config):
8
def crawl(config: ConfigType):
9 9
    """
10 10
    Implement crawl method that downloads new data to path_for_files
11 11
    For keeping the project structure
......
21 21
    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'
22 22

  
23 23
    first_level_links = basic_crawler_functions.get_all_links(url)
24
    filtered_first_level_links = basic_crawler_functions.filter_links(first_level_links, "^OD_ZCU")
25
    absolute_first_level_links = basic_crawler_functions.create_absolute_links(filtered_first_level_links, url)
24
    filtered_first_level_links = basic_crawler_functions.filter_links(
25
        first_level_links, "^OD_ZCU")
26
    absolute_first_level_links = basic_crawler_functions.create_absolute_links(
27
        filtered_first_level_links, url)
26 28

  
27 29
    files = []
28 30

  
29 31
    for link in absolute_first_level_links:
30 32
        second_level_links = basic_crawler_functions.get_all_links(link)
31
        filtered_second_level_links = basic_crawler_functions.filter_links(second_level_links, regex)
32
        absolute_second_level_links = basic_crawler_functions.create_absolute_links(filtered_second_level_links, link)
33
        filtered_second_level_links = basic_crawler_functions.filter_links(
34
            second_level_links, regex)
35
        absolute_second_level_links = basic_crawler_functions.create_absolute_links(
36
            filtered_second_level_links, link)
33 37

  
34 38
        for file_link in absolute_second_level_links:
35 39
            files.append(file_link)
36 40

  
37
    files = basic_crawler_functions.remove_downloaded_links(files, dataset_name)
41
    files = basic_crawler_functions.remove_downloaded_links(
42
        files, dataset_name)
38 43

  
39 44
    for file in files:
40 45
        basic_crawler_functions.download_file_from_url(file, dataset_name)
modules/crawler/DatasetCrawler/OBSAZENIMISTNOSTI_crawler.py
1 1
from Utilities import folder_processor
2 2
from Utilities.Crawler import basic_crawler_functions
3
from shared_types import ConfigType
3 4

  
4 5
# Path to crawled data
5 6
CRAWLED_DATA_PATH = "CrawledData/"
6 7

  
7 8

  
8
def crawl(config):
9
def crawl(config: ConfigType):
9 10
    """
10 11
    Implement crawl method that downloads new data to path_for_files
11 12
    For keeping the project structure
modules/crawler/DatasetCrawler/WIFI_crawler.py
1 1
from Utilities import folder_processor
2 2
from Utilities.Crawler import basic_crawler_functions
3
from shared_types import ConfigType
3 4

  
4 5
# Path to crawled data
5 6
CRAWLED_DATA_PATH = "CrawledData/"
6 7

  
7 8

  
8
def crawl(config):
9
def crawl(config: ConfigType):
9 10
    """
10 11
    Implement crawl method that downloads new data to path_for_files
11 12
    For keeping the project structure
......
21 22
    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'
22 23

  
23 24
    first_level_links = basic_crawler_functions.get_all_links(url)
24
    filtered_first_level_links = basic_crawler_functions.filter_links(first_level_links, "^OD_ZCU")
25
    absolute_first_level_links = basic_crawler_functions.create_absolute_links(filtered_first_level_links, url)
25
    filtered_first_level_links = basic_crawler_functions.filter_links(
26
        first_level_links, "^OD_ZCU")
27
    absolute_first_level_links = basic_crawler_functions.create_absolute_links(
28
        filtered_first_level_links, url)
26 29

  
27 30
    files = []
28 31

  
29 32
    for link in absolute_first_level_links:
30 33
        second_level_links = basic_crawler_functions.get_all_links(link)
31
        filtered_second_level_links = basic_crawler_functions.filter_links(second_level_links, regex)
32
        absolute_second_level_links = basic_crawler_functions.create_absolute_links(filtered_second_level_links, link)
34
        filtered_second_level_links = basic_crawler_functions.filter_links(
35
            second_level_links, regex)
36
        absolute_second_level_links = basic_crawler_functions.create_absolute_links(
37
            filtered_second_level_links, link)
33 38

  
34 39
        for file_link in absolute_second_level_links:
35 40
            files.append(file_link)
36 41

  
37
    files = basic_crawler_functions.remove_downloaded_links(files, dataset_name)
42
    files = basic_crawler_functions.remove_downloaded_links(
43
        files, dataset_name)
38 44

  
39 45
    for file in files:
40 46
        basic_crawler_functions.download_file_from_url(file, dataset_name)
modules/crawler/DatasetProcessing/JIS_processor.py
1 1
from Utilities.CSV import csv_data_line
2 2
from Utilities import date_formating
3 3

  
4
from shared_types import DateDict
4 5

  
5
def process_file(filename):
6

  
7
def process_file(filename: str) -> DateDict:
6 8
    """
7 9
    Method that take path to crawled file and outputs date dictionary:
8 10
    Date dictionary is a dictionary where keys are dates in format YYYY-mm-dd-hh (2018-04-08-15)
......
16 18
    None if not implemented
17 19
    date_dict when implemented
18 20
    """
19
    date_dict = dict()
21
    date_dict = {}
20 22

  
21 23
    with open(filename, "r", encoding="utf-8") as file:
22 24

  
......
29 31
            occurrence = array[2][:-1]
30 32

  
31 33
            if date not in date_dict:
32
                date_dict[date] = dict()
34
                date_dict[date] = {}
33 35

  
34 36
            if name in date_dict[date]:
35 37
                date_dict[date][name].occurrence += int(occurrence)
36 38
            else:
37
                date_dict[date][name] = csv_data_line.CSVDataLine(name, date, occurrence)
39
                date_dict[date][name] = csv_data_line.CSVDataLine(
40
                    name, date, occurrence)
38 41

  
39 42
    return date_dict
40

  
modules/crawler/DatasetProcessing/KOLOBEZKY_processor.py
1 1
from Utilities.CSV import csv_data_line
2 2
from Utilities import date_formating
3 3

  
4
from shared_types import DateDict
4 5

  
5
def process_file(filename):
6

  
7
def process_file(filename: str) -> DateDict:
6 8
    """
7 9
    Method that take path to crawled file and outputs date dictionary:
8 10
    Date dictionary is a dictionary where keys are dates in format YYYY-mm-dd-hh (2018-04-08-15)
......
16 18
    None if not implemented
17 19
    date_dict when implemented
18 20
    """
19
    date_dict = dict()
21
    date_dict = {}
20 22

  
21 23
    with open(filename, "r") as file:
22 24

  
......
28 30
            name = array[1][1:-1]
29 31

  
30 32
            if date not in date_dict:
31
                date_dict[date] = dict()
33
                date_dict[date] = {}
32 34

  
33 35
            if name in date_dict[date]:
34 36
                date_dict[date][name].occurrence += 1
35 37
            else:
36
                date_dict[date][name] = csv_data_line.CSVDataLine(name, date, 1)
38
                date_dict[date][name] = csv_data_line.CSVDataLine(
39
                    name, date, 1)
37 40

  
38 41
    return date_dict
39

  
modules/crawler/DatasetProcessing/OBSAZENIMISTNOSTI_processor.py
5 5
import time
6 6
import datetime
7 7

  
8
from shared_types import DateDict
9

  
8 10
logging.basicConfig(filename='../../CrawlerLogs' + 'Crawlerlog-' +
9 11
                    date.today().strftime("%b-%Y") + '.log',
10 12
                    level=logging.INFO,
11 13
                    format='%(asctime)s %(message)s')
12 14

  
13 15

  
14
def process_file(filename):
16
def process_file(filename: str) -> DateDict:
15 17
    """
16 18
    Method that take path to crawled file and outputs date dictionary:
17 19
    Date dictionary is a dictionary where keys are dates in format YYYY-mm-dd-hh (2018-04-08-15)
......
25 27
    None if not implemented
26 28
    date_dict when implemented
27 29
    """
28
    date_dict = dict()
30
    date_dict = {}
29 31

  
30 32
    with open(filename, "r") as file:
31 33

  
modules/crawler/DatasetProcessing/WIFI_processor.py
1 1
from Utilities.CSV import csv_data_line
2 2
from Utilities import date_formating
3
from shared_types import DateDict
3 4

  
4 5

  
5
def process_file(filename):
6
def process_file(filename: str) -> DateDict:
6 7
    """
7 8
    Method that take path to crawled file and outputs date dictionary:
8 9
    Date dictionary is a dictionary where keys are dates in format YYYY-mm-dd-hh (2018-04-08-15)
modules/crawler/Utilities/CSV/csv_data_line.py
3 3
    Class that specifies the look of data line in processed csv file
4 4
    prepared for database
5 5
    """
6

  
7
    def __init__(self, name, date, occurrence):
6
    def __init__(self, name: str, date: str, occurrence: int) -> None:
8 7
        try:
9 8
            test_val = int(occurrence)
10 9
        except ValueError:
11 10
            print("Occurence should be and integer value!")
12 11

  
13 12
        if len(date) != 13:
14
            raise ValueError("Invalid date format YYYY-dd-mm-hh expected!")    
13
            raise ValueError("Invalid date format YYYY-dd-mm-hh expected!")
15 14

  
16 15
        self.name = name
17 16
        self.date = date
18 17
        self.occurrence = test_val
19 18

  
20
    def to_csv(self):
19
    def to_csv(self) -> str:
21 20
        return self.name + ";" + str(self.occurrence) + ";" + self.date
22

  
modules/crawler/Utilities/CSV/csv_utils.py
1 1
import inspect
2
from shared_types import StringSetType
2 3
from Utilities.CSV import csv_data_line
3 4

  
4 5
# Path to processed data
5 6
PROCESSED_DATA_PATH = "ProcessedData/"
6 7

  
7 8

  
8
def get_unique_names_from_file(filename, column_number):
9
def get_unique_names_from_file(filename: str,
10
                               column_number: int) -> StringSetType:
9 11
    """
10 12
        Extract set of unique names from file
11 13
    Args:
......
29 31
    return name_set
30 32

  
31 33

  
32
def export_data_to_csv(filename, data_dict):
34
def export_data_to_csv(filename: str, data_dict) -> None:
33 35
    """
34 36
        Takes data_dict and export it into a csv file
35 37
    Args:
......
40 42

  
41 43
        for date in data_dict:
42 44
            if len(date) != 13:
43
                raise ValueError("Invalid date format for key value --> YYYY-mm-dd-hh expected!")   
45
                raise ValueError(
46
                    "Invalid date format for key value --> YYYY-mm-dd-hh expected!"
47
                )
44 48
            for data in data_dict[date]:
45 49
                csv_line = data_dict[date][data]
46
                if not isinstance(csv_line,csv_data_line.CSVDataLine):
47
                    raise ValueError("data_dict is expected to have CSVDataLine as values")
50
                if not isinstance(csv_line, csv_data_line.CSVDataLine):
51
                    raise ValueError(
52
                        "data_dict is expected to have CSVDataLine as values")
48 53
                file.write(csv_line.to_csv() + '\n')
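
For illustration, a CSVDataLine serialises as name;occurrence;date, which is exactly the line format export_data_to_csv writes out. A small usage sketch (the sample values are made up):

from Utilities.CSV import csv_data_line

# Illustrative values only
line = csv_data_line.CSVDataLine("US 005", "2018-04-08-15", 3)
print(line.to_csv())  # -> US 005;3;2018-04-08-15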
modules/crawler/Utilities/Crawler/basic_crawler_functions.py
3 3
from Utilities import folder_processor
4 4
from Utilities.Database import database_record_logs
5 5
from bs4 import BeautifulSoup
6
from typing import List
6 7

  
7 8
# Path to crawler logs
8 9
CRAWLER_LOGS_PATH = "CrawlerLogs/"
9 10
# Path to crawled data
10 11
CRAWLED_DATA_PATH = "CrawledData/"
12
LinksType = List[str]
11 13

  
12 14

  
13
def get_all_links(url):
15
def get_all_links(url: str) -> LinksType:
14 16
    """
15 17
    Sends http request to url, downloads all data,
16 18
    extract links
......
34 36
    return links
35 37

  
36 38

  
37
def filter_links(links, regex):
39
def filter_links(links: LinksType, regex: str) -> LinksType:
38 40
    """
39 41
    Filters list of links using regex
40 42

  
......
54 56
    return filtered_links
55 57

  
56 58

  
57
def create_absolute_links(links, archive):
59
def create_absolute_links(links: LinksType, archive: str) -> LinksType:
58 60
    """
59 61
        Appends archive path to every link in links
60 62
    Args:
......
72 74
    return absolute_links
73 75

  
74 76

  
75
def remove_downloaded_links(links, dataset_name):
77
def remove_downloaded_links(links: LinksType, dataset_name: str) -> LinksType:
76 78
    """
77 79
    Loads already downloaded links from CRAWLER_LOGS_PATH ignore.txt
78 80
    Args:
......
88 90
    return final_links
89 91

  
90 92

  
91
def download_file_from_url(url, dataset_name):
93
def download_file_from_url(url: str, dataset_name: str) -> None:
92 94
    """
93 95
    Downloads file on provided url and saves it to path
94 96
    Args:
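
The bodies of these helpers already exist in the repository and lie outside the visible hunks; purely as an illustration of the new LinksType alias, here is a sketch of what a filter_links implementation consistent with its docstring could look like (an assumption, not the actual code):

import re
from typing import List

LinksType = List[str]

def filter_links(links: LinksType, regex: str) -> LinksType:
    # Keep only the links whose text matches the given regular expression
    return [link for link in links if re.search(regex, link)]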
modules/crawler/Utilities/Database/database_data_line.py
1
from typing import Dict
2

  
3

  
1 4
class DatabaseDataLine:
2 5
    """
3 6
    Class that specifies the look of data line in database
4 7
    """
5
    def __init__(self, name, longitude, latitude, date, occurrence):
8
    def __init__(self, name: str, longitude: float, latitude: float, date: str,
9
                 occurrence: int):
6 10
        self.name = name
7 11
        self.latitude = latitude
8 12
        self.longitude = longitude
9 13
        self.date = date
10 14
        self.occurrence = occurrence
11 15

  
12
    def to_dictionary(self):
13
        return {"place": self.name, "x": self.longitude, "y": self.latitude, "number": self.occurrence,
14
                "date": self.date}
16
    def to_dictionary(self) -> Dict[str, any]:
17
        return {
18
            "place": self.name,
19
            "x": self.longitude,
20
            "y": self.latitude,
21
            "number": self.occurrence,
22
            "date": self.date
23
        }
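
A small aside on annotations such as Dict[str, any] used here and in database_loader.py: static type checkers expect typing.Any, since the builtin any is a function rather than a type. An equivalent, checker-friendly spelling (the alias name below is made up for illustration):

from typing import Any, Dict

DataLineDict = Dict[str, Any]  # hypothetical alias, same intent as Dict[str, any]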
modules/crawler/Utilities/Database/database_loader.py
1 1
from Utilities.Database import database_data_line, database_record_logs
2 2
from Utilities import configure_functions
3 3
from Utilities.helpers import should_skip, detect_change
4
from shared_types import ConfigType
5
from typing import Dict
4 6
import pymongo
5 7
import re
6 8

  
......
20 22
# Path to processed data
21 23
PROCESSED_DATA_PATH = "ProcessedData/"
22 24

  
25
DatabaseConnectionType = Dict[str, any]
23 26

  
24
def create_database_connection():
27

  
28
def create_database_connection() -> pymongo.database.Database:
25 29
    """
26 30
    Creates connection to mongoDB
27 31

  
......
38 42
    return database
39 43

  
40 44

  
41
def get_data_from_file(filename, config):
45
def get_data_from_file(filename: str, config: ConfigType) -> Dict[str, any]:
42 46
    """
43 47
        Opens processed file, reads it line by line
44 48
        name, ocurrence, date
......
59 63
    f = open(dataset_path + filename, "r")
60 64

  
61 65
    devices = config["devices"]
62
    date_dict = dict()
66
    date_dict = {}
63 67

  
64 68
    for line in f:
65 69
        line = line[:-1]
......
86 90
    return date_dict
87 91

  
88 92

  
89
def load_data_to_database(database_connection, dataset_name, data_dic,
90
                          file_name):
93
def load_data_to_database(database_connection: DatabaseConnectionType,
94
                          dataset_name: str, data_dic: Dict[str, any],
95
                          file_name: str) -> None:
91 96
    """
92 97
    Takes data_dic created in method get_data_from_file
93 98
    and loads into into database where collection name is dataset_name + data_dic key
......
107 112
        date_dataset.insert_many(data_dic[date])
108 113

  
109 114

  
110
def check_or_update_datasets_collection(database_connection, config):
115
def check_or_update_datasets_collection(
116
        database_connection: DatabaseConnectionType, config: ConfigType):
111 117
    """
112 118
    Checks if DATASETS collection contains dataset and if display name was not updated
113 119

  
......
116 122
        config: loaded configuration file of dataset
117 123
    """
118 124
    # collection where are specified aviable datasets
119
    compareKeys = ['display-name',
120
                   'display-color']
125
    compareKeys = ['display-name', 'display-color']
121 126
    collection_datasets = database_connection[MONGODB_DATASET_COLLECTION]
122 127

  
123 128
    query = {'key-name': config['dataset-name']}
......
139 144
        collection_datasets.update_one(query, {"$set": newVal})
140 145

  
141 146

  
142
def update_devices_collection(config):
147
def update_devices_collection(config: ConfigType):
143 148
    """
144 149
    Checks if there are any changes in devices specified in config file against 
145 150
    devices processed and loaded into the database
......
164 169

  
165 170
    devices_cursor = collection_devices.find()
166 171

  
167
    db_device_dict = dict()
172
    db_device_dict = {}
168 173

  
169 174
    for device in devices_cursor:
170 175
        name = device['name']
......
208 213
    return change_in_devices
209 214

  
210 215

  
211
def remove_dataset_database(dataset_name):
216
def remove_dataset_database(dataset_name: str):
212 217
    """
213 218
    Removes dataset entries from database
214 219
    Args:
......
221 226
    collection_datasets = mydb[MONGODB_DATASET_COLLECTION]
222 227

  
223 228
    collection_datasets.delete_one({"key-name": dataset_name})
224
    print("Removing record from DATASETS collection")
229
    print("Odstraňování záznamu z DATASETS kolekce")
225 230

  
226 231
    # Retrieve list of all collections
227 232
    collections = mydb.list_collection_names()
......
230 235
    for name in collections:
231 236
        if name.startswith(dataset_name):
232 237
            mydb[name].drop()
233
            print("Dropping: " + name)
238
            print("Odstraňuji: " + name)
234 239

  
235 240

  
236
def reset_dataset_database(dataset_name):
241
def reset_dataset_database(dataset_name: str):
237 242
    """
238 243
    Reset dataset in database 
239 244
     - delete everything from except crawled links and mention in DATASETS collection
......
252 257
    for name in collections:
253 258
        if pattern.match(name):
254 259
            mydb[name].drop()
255
            print("Dropping: " + name)
260
            print("Odstraňuji: " + name)
256 261

  
257 262
    database_record_logs.reset_ignore_set_loaded(dataset_name)
modules/crawler/Utilities/Database/database_record_logs.py
1 1
from Utilities.Database import database_loader
2

  
2
from shared_types import StringSetType
3 3
# mongodb collection with with already downloaded links
4 4
MONGODB_DATASET_LINK_COLLECTION = "LINKS"
5 5
# mongodb collection with with already processed files
......
10 10
MONGODB_DATASET_COLLECTION = "DATASETS"
11 11

  
12 12

  
13
def load_ignore_set_links(dataset_name):
13
def load_ignore_set_links(dataset_name: str) -> StringSetType:
14 14
    """
15 15
    Loades from database links of already downloaded files by crawler
16 16
    
......
32 32
    return ignore_set
33 33

  
34 34

  
35
def update_ignore_set_links(dataset_name,link):
35
def update_ignore_set_links(dataset_name: str, link: str) -> None:
36 36
    """
37 37
    Adds links of newly crawled files to the database
38 38
    
......
44 44

  
45 45
    my_col = connection[dataset_name + MONGODB_DATASET_LINK_COLLECTION]
46 46

  
47
    my_col.insert({ "name": link})
47
    my_col.insert({"name": link})
48 48

  
49 49

  
50
def reset_ignore_set_links(dataset_name):
50
def reset_ignore_set_links(dataset_name: str) -> None:
51 51
    """
52 52
    Drops collection of already downloaded links
53 53
    
......
62 62
    my_col.drop()
63 63

  
64 64

  
65

  
66
def load_ignore_set_processed(dataset_name):
65
def load_ignore_set_processed(dataset_name: str) -> StringSetType:
67 66
    """
68 67
    Loads from database set of already processed files
69 68
    
......
85 84
    return ignore_set
86 85

  
87 86

  
88
def update_ignore_set_processed(dataset_name,filename):
87
def update_ignore_set_processed(dataset_name: str, filename: str) -> None:
89 88
    """
90 89
    Adds files of newly processed files to the database
91 90
    
......
97 96

  
98 97
    my_col = connection[dataset_name + MONGODB_DATASET_PROCESSED_COLLECTION]
99 98

  
100
    my_col.insert({ "name": filename})
101

  
99
    my_col.insert({"name": filename})
102 100

  
103 101

  
104
def reset_ignore_set_processed(dataset_name):
102
def reset_ignore_set_processed(dataset_name: str) -> None:
105 103
    """
106 104
    Drops collection of already processed files
107 105
    
......
116 114
    my_col.drop()
117 115

  
118 116

  
119

  
120
def load_ignore_set_loaded(dataset_name):
117
def load_ignore_set_loaded(dataset_name: str) -> StringSetType:
121 118
    """
122 119
    Loads from database set of already loaded files in database
123 120
    
......
139 136
    return ignore_set
140 137

  
141 138

  
142

  
143
def update_ignore_set_loaded(dataset_name,filename):
139
def update_ignore_set_loaded(dataset_name: str, filename: str) -> None:
144 140
    """
145 141
    Adds files of newly loaded files to the database
146 142
    
......
152 148

  
153 149
    my_col = connection[dataset_name + MONGODB_DATASET_LOADED_COLLECTION]
154 150

  
155
    my_col.insert({ "name": filename})
151
    my_col.insert({"name": filename})
156 152

  
157 153

  
158
def reset_ignore_set_loaded(dataset_name):
154
def reset_ignore_set_loaded(dataset_name: str) -> None:
159 155
    """
160 156
    Drops collection of already loaded files
161 157
    
......
170 166
    my_col.drop()
171 167

  
172 168

  
173
def load_updated(dataset_name):
169
def load_updated(dataset_name: str) -> int:
174 170
    """
175 171
    Loads value of (days from last update) from db
176 172
    
......
184 180

  
185 181
    my_col = connection[MONGODB_DATASET_COLLECTION]
186 182

  
187
    data = my_col.find_one({'key-name': dataset_name},{'updated'})
183
    data = my_col.find_one({'key-name': dataset_name}, {'updated'})
188 184

  
189 185
    updated = int(data['updated'])
190 186

  
191 187
    return updated
192 188

  
193 189

  
194
def update_updated(dataset_name,value):
190
def update_updated(dataset_name: str, value: int):
195 191
    """
196 192
    Updates value of (days from last update) in db
197 193
    
......
203 199

  
204 200
    my_col = connection[MONGODB_DATASET_COLLECTION]
205 201

  
206
    myquery = { 'key-name': dataset_name }
207
    new_values = { "$set": { "updated": value } }
202
    myquery = {'key-name': dataset_name}
203
    new_values = {"$set": {"updated": value}}
208 204

  
209
    my_col.update_one(myquery,new_values)
205
    my_col.update_one(myquery, new_values)
modules/crawler/Utilities/configure_functions.py
1 1
import yaml
2 2
import os
3
from typing import Dict, Set
4
from shared_types import StringSetType
3 5
from Utilities.Database import database_record_logs
4 6
from Utilities.helpers import should_skip
5 7

  
......
9 11
CONFIG_FILE_TYPE = ".yaml"
10 12

  
11 13

  
12
def load_configuration(dataset_name):
14
def load_configuration(dataset_name: str) -> Dict[str, any]:
13 15
    """
14 16
    Loads yaml configuration file into memory
15 17

  
......
22 24
    with open(CONFIG_FILES_PATH + dataset_name + CONFIG_FILE_TYPE, "r") as f:
23 25
        data = yaml.load(f, Loader=yaml.FullLoader)
24 26

  
25
    devices_dic = dict()
27
    devices_dic = {}
26 28

  
27 29
    if data["devices"] is not None:
28 30
        for item in data["devices"]:
......
33 35
    return data
34 36

  
35 37

  
36
def update_configuration(dataset_name, new_devices):
38
def update_configuration(dataset_name: str,
39
                         new_devices: StringSetType) -> None:
37 40
    """
38 41
    Open dataset and appends new_devices to the end
39 42

  
......
53 56
            file.write("\n")
54 57

  
55 58

  
56
def check_if_there_is_a_config_file(dataset_name):
59
def check_if_there_is_a_config_file(dataset_name: str) -> bool:
57 60
    """
58 61
    Goes trough all config files (represeting valid dataset in database)
59 62
    and checks if dataset_name is there
......
75 78
    return False
76 79

  
77 80

  
78
def return_dictionary_of_valid_devices(devices):
81
def return_dictionary_of_valid_devices(
82
        devices: Dict[str, any]) -> Dict[str, Dict[str, any]]:
79 83
    """
80 84
    Iterates over all devices specified in config file
81 85

  
......
87 91
    Returns:   
88 92
        Dictonary containing only valid devices
89 93
    """
90
    valid_devices = dict()
94
    valid_devices = {}
91 95

  
92 96
    for device in devices.keys():
93 97
        if not should_skip(devices[device]):
modules/crawler/Utilities/date_formating.py
1
def date_formatter(string_date):
1
def date_formatter(string_date: str) -> str:
2 2
    """
3 3

  
4 4
    Args:
......
21 21

  
22 22
        string_date = srr
23 23

  
24
    return_date = string_date[6:10] + '-' + string_date[3:5] + '-' + string_date[:2]
24
    return_date = string_date[6:10] + '-' + string_date[
25
        3:5] + '-' + string_date[:2]
25 26

  
26 27
    return return_date
27 28

  
28 29

  
29
def date_time_formatter(string_date):
30
def date_time_formatter(string_date: str) -> str:
30 31
    """
31 32
    Converts one type of date format "dd.mm.yyyy hh.mm.ss" to date format YYYY-mm-dd-hh
32 33
    Args:
......
49 50

  
50 51
        string_date = srr
51 52

  
52
    return_date = string_date[6:10] + '-' + string_date[3:5] + '-' + string_date[:2] + '-' + string_date[11:13]
53
    return_date = string_date[6:10] + '-' + string_date[
54
        3:5] + '-' + string_date[:2] + '-' + string_date[11:13]
53 55

  
54 56
    return return_date
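
A quick illustration of the slicing above; the sample values are assumptions and presume an input already zero-padded in the dd.mm.yyyy form mentioned in the docstrings:

from Utilities import date_formating

print(date_formating.date_formatter("08.04.2018"))                 # expected: 2018-04-08
print(date_formating.date_time_formatter("08.04.2018 15.30.00"))   # expected: 2018-04-08-15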
modules/crawler/Utilities/folder_processor.py
1 1
import os
2 2
import zipfile
3
from shared_types import ConfigType, StringSetType
3 4
from Utilities.CSV import csv_utils
4 5
from Utilities.Database import database_record_logs
5 6

  
6 7

  
7
def list_of_all_new_files(ignore_set,path):
8
def list_of_all_new_files(ignore_set: StringSetType,
9
                          path: str) -> StringSetType:
8 10
    """
9 11
    Get all files from directory and all files written in ignore.txt
10 12
    and return the difference
......
16 18
    """
17 19
    files_in_dir = os.listdir(path)
18 20

  
19

  
20 21
    return set(files_in_dir).difference(ignore_set)
21 22

  
22 23

  
23

  
24
def get_devices_set(dataset_name,path):
24
def get_devices_set(dataset_name: str, path: str) -> StringSetType:
25 25
    """
26 26
     Goes trough every not loaded file(not contained in ProcessedData/ignore.txt)
27 27
     Extracts names from not loaded file which should be in first column
......
34 34
        set of unique names contained in not loaded files
35 35
    """
36 36
    ignore_set = database_record_logs.load_ignore_set_loaded(dataset_name)
37
    files_in_dir = list_of_all_new_files(ignore_set,path)
37
    files_in_dir = list_of_all_new_files(ignore_set, path)
38 38

  
39 39
    unique_names = set()
40 40

  
41 41
    for file_path in files_in_dir:
42
        unique_names.update(csv_utils.get_unique_names_from_file(path+file_path, 0))
42
        unique_names.update(
43
            csv_utils.get_unique_names_from_file(path + file_path, 0))
43 44

  
44 45
    return unique_names
45 46

  
46 47

  
47
def get_unknown_devices_set(config, devices):
48
def get_unknown_devices_set(config: ConfigType,
49
                            devices: StringSetType) -> StringSetType:
48 50
    """
49 51
    Compares config and devices a return difference
50 52

  
......
61 63
    return unknown_devices_set
62 64

  
63 65

  
64
def unzip_all_csv_zip_files_in_folder(path):
66
def unzip_all_csv_zip_files_in_folder(path: str) -> None:
65 67
    """
66 68
    Load all files from directory and unzip those which end by .zip
67 69
    After unziping deletes the zip file
......
83 85
        os.remove(zip_file)
84 86

  
85 87

  
86
def clean_folder(path):
88
def clean_folder(path: str) -> None:
87 89
    """
88 90
    Deletes all files in folder
89 91

  
......
93 95
    files = os.listdir(path)
94 96

  
95 97
    for file in files:
96
        os.remove(path+file)
98
        os.remove(path + file)
modules/crawler/Utilities/helpers.py
4 4
UNKNOWN = "UNKNOWN!"
5 5

  
6 6

  
7
def should_skip(device) -> bool:
7
def should_skip(device: Dict[str, str]) -> bool:
8 8
    return device['x'] == SKIP or device['y'] == SKIP or device[
9 9
        'x'] == UNKNOWN or device['y'] == UNKNOWN
10 10

  
11 11

  
12
def detect_change(first: Dict[str, str], second: Dict[str, str], compareKeys: [str]) -> bool:
12
def detect_change(first: Dict[str, str], second: Dict[str, str],
13
                  compareKeys: [str]) -> bool:
13 14
    """Detects change between two dictonaries
14 15

  
15 16
    Args:
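
The body of detect_change is outside the visible hunks; a minimal sketch consistent with the signature and with how database_loader.py calls it (compareKeys = ['display-name', 'display-color']), offered as an assumption rather than the actual implementation:

from typing import Dict, List

def detect_change(first: Dict[str, str], second: Dict[str, str],
                  compareKeys: List[str]) -> bool:
    # Report a change when any compared key differs between the two dictionaries
    return any(first.get(key) != second.get(key) for key in compareKeys)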
modules/crawler/crone_update_script.py
5 5
CONFIG_FILES_PATH = "DatasetConfigs/"
6 6

  
7 7

  
8
def run_pipeline_for_all_datasets():
8
def run_pipeline_for_all_datasets() -> None:
9 9
    """
10 10
    Runs whole DataScript pipeline for every dataset that has existing configuration file
11 11
    """
......
16 16
        pipeline.run_full_pipeline_crone(name)
17 17

  
18 18

  
19
run_pipeline_for_all_datasets()
19
def main() -> None:
20
    run_pipeline_for_all_datasets()
21

  
22

  
23
if __name__ == "__main__":
24
    main()
modules/crawler/docker_prepare_structure.py
10 10
CRAWLER_LOGS_PATH = "CrawlerLogs/"
11 11

  
12 12

  
13
def prepare_strucure_for_all_datasets():
13
def prepare_strucure_for_all_datasets() -> None:
14 14
    """
15 15
    Prepares folders that are necessery but does not contain code so they are excluded from gitlab by gitignore
16 16
    """
17 17

  
18
    if  not os.path.isdir(CRAWLED_DATA_PATH) :
18
    if not os.path.isdir(CRAWLED_DATA_PATH):
19 19
        try:
20 20
            os.mkdir(CRAWLED_DATA_PATH)
21 21
        except os.error as e:
22 22
            print(e)
23
            print("Creation of the directory %s failed" % CRAWLED_DATA_PATH)
23
            print("Nelze vytvořit adresář %s" % CRAWLED_DATA_PATH)
24 24

  
25
    if  not os.path.isdir(PROCESSED_DATA_PATH) :
25
    if not os.path.isdir(PROCESSED_DATA_PATH):
26 26
        try:
27 27
            os.mkdir(PROCESSED_DATA_PATH)
28 28
        except os.error as e:
29 29
            print(e)
30
            print("Creation of the directory %s failed" % PROCESSED_DATA_PATH)
31
    
32
    if  not os.path.isdir(CRAWLER_LOGS_PATH) :
30
            print("Nelze vytvořit adresář %s" % PROCESSED_DATA_PATH)
31

  
32
    if not os.path.isdir(CRAWLER_LOGS_PATH):
33 33
        try:
34 34
            os.mkdir(CRAWLER_LOGS_PATH)
35 35
        except os.error as e:
36 36
            print(e)
37
            print("Creation of the directory %s failed" % PROCESSED_DATA_PATH)
38

  
37
            print("Nelze vytvořit adresář %s" % CRAWLER_LOGS_PATH)
39 38

  
40 39
    files_in_dir = os.listdir(CONFIG_FILES_PATH)
41 40

  
......
44 43
        prepare_structure(name[0])
45 44

  
46 45

  
47
def prepare_structure(dataset_name):
46
def prepare_structure(dataset_name: str) -> None:
48 47
    """
49 48
    Create folder for every dataset in newly created folder for processed and crawled data
50 49
    """
51 50

  
52
    path =  CRAWLED_DATA_PATH + dataset_name
53
    if  not os.path.isdir(path) :
51
    path = CRAWLED_DATA_PATH + dataset_name
52
    if not os.path.isdir(path):
54 53
        os.mkdir(path)
55 54

  
56
    path =  PROCESSED_DATA_PATH + dataset_name
57
    if not  os.path.isdir(path):
55
    path = PROCESSED_DATA_PATH + dataset_name
56
    if not os.path.isdir(path):
58 57
        os.mkdir(PROCESSED_DATA_PATH + dataset_name)
59 58

  
60 59

  
61
print("Inicializuji počáteční strukturu pro stažená a zpracovaná data")
62
prepare_strucure_for_all_datasets()
60
def main() -> None:
61
    print("Inicializuji počáteční strukturu pro stažená a zpracovaná data")
62
    prepare_strucure_for_all_datasets()
63

  
64

  
65
if __name__ == "__main__":
66
    main()
modules/crawler/force_update_datasets.py
1
from Utilities import configure_functions
1 2
import pipeline
2 3
import os
3
from Utilities import configure_functions
4
import sys
4 5

  
5 6
# Path to configuration files
6 7
CONFIG_FILES_PATH = "DatasetConfigs/"
8
WRONG_ARG_MSG = "Do argumentu funkce dejte jméno Datasetu, který chcete aktualizovat (pokud všechny zadejte 'ALL'):\n"
9
DATASET_NOT_FOUND_MSG = "Tento dataset v architektuře neexistuje"
7 10

  
8 11

  
9
def run_pipeline_for_one_datasets(dataset_name):
12
def run_pipeline_for_one_datasets(dataset_name: str) -> None:
10 13
    print("Probíhá update datasetu " + dataset_name)
11 14
    pipeline.run_full_pipeline(dataset_name)
12 15

  
13 16

  
14
def run_pipeline_for_all_datasets():
17
def run_pipeline_for_all_datasets() -> None:
15 18
    """
16 19
    Runs whole DataScript pipeline for every dataset that has existing configuration file
17 20
    """
......
23 26
        pipeline.run_full_pipeline(name)
24 27

  
25 28

  
26
print("Zadejte jméno Datasetu který chcete updatovat (pokud všechny zadejte '-ALL'):\n")
29
def main() -> None:
30
    if len(sys.argv) > 1:
31
        dataset_name = sys.argv[1].upper()
32
        if dataset_name == "ALL":
33
            run_pipeline_for_all_datasets()
34
        else:
35
            test = configure_functions.check_if_there_is_a_config_file(
36
                dataset_name)
37
            if test == True:
38
                run_pipeline_for_one_datasets(dataset_name)
39
            else:
40
                print(DATASET_NOT_FOUND_MSG)
41
    else:
42
        print(WRONG_ARG_MSG)
27 43

  
28
dataset_name = input().upper()
29 44

  
30
if dataset_name == '-ALL':
31
    run_pipeline_for_all_datasets()
32
else:
33
    test = configure_functions.check_if_there_is_a_config_file(dataset_name)
34
    if test == True:
35
        run_pipeline_for_one_datasets(dataset_name)
36
    else:
37
        print("Tento dataset v architektuře neexistuje")
45
if __name__ == "__main__":
46
    main()
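
With the switch from interactive input() to sys.argv, force_update_datasets.py can now be run non-interactively, e.g. python force_update_datasets.py ALL to update every configured dataset, or python force_update_datasets.py <DATASET_NAME> for a single one, which makes it easier to call from scripts.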
modules/crawler/fully_clean_database.py
1 1
from Utilities.Database import database_loader
2 2

  
3 3

  
4
#TODO: smazat vsechny pomocny soubory po cisteni databaze + prejmenovat
5
def clean_database():
4
def clean_database() -> None:
6 5
    """
7 6
    Drops every collection in database
8 7
    """
......
18 17
        mydb[name].drop()
19 18

  
20 19

  
21
print('Data z databáze budou smazána:')
22
clean_database()
20
def main() -> None:
21
    print('Data z databáze budou smazána:')
22
    clean_database()
23

  
24

  
25
if __name__ == "__main__":
26
    main()
modules/crawler/pipeline.py
1 1
from Utilities import folder_processor, configure_functions
2 2
from Utilities.Database import database_loader, database_record_logs
3 3
from Utilities.CSV import csv_utils
4
from shared_types import ConfigType
4 5
import os
5 6
import pymongo
6 7

  
......
20 21

  
21 22
#logger
22 23
logging.basicConfig(filename=CRAWLER_LOGS_PATH + 'Applicationlog-' +
23
                               date.today().strftime("%b-%Y") + '.log',
24
                               level=logging.INFO,
25
                               format='%(asctime)s %(message)s')
24
                    date.today().strftime("%b-%Y") + '.log',
25
                    level=logging.INFO,
26
                    format='%(asctime)s %(message)s')
26 27

  
27 28

  
28
def check_last_update(config):
29
def check_last_update(config: ConfigType) -> bool:
29 30
    """
30 31
    Loads integer from updated.txt in CrawlerLogs/"dataset_name"
31 32
    representing number of days from last update if number equals
......
56 57
        return False
57 58

  
58 59

  
59
def crawl_data(config):
60
def crawl_data(config: ConfigType) -> None:
60 61
    """
61 62
      Imports dataset crawler in DatasetCrawler/"dataset_name"_crawler.py
62 63
      runs crawler.
......
73 74
    dataset_name += '/'
74 75

  
75 76

  
76
def process_data(config):
77
def process_data(config: ConfigType) -> None:
77 78
    """
78 79
    Goes trough every not processed file(list of processed files is saved in databse)
79 80
    Imports dataset processor in DatasetProcessing/"dataset_name"_processor.py
......
100 101
        path = CRAWLED_DATA_PATH + dataset_path + not_processed_file
101 102
        date_dic = process_file_func(path)
102 103
        csv_utils.export_data_to_csv(path, date_dic)
104
        print("Vytvářím: " + not_processed_file)
103 105
        database_record_logs.update_ignore_set_processed(
104 106
            dataset_name, not_processed_file)
105 107

  
......
107 109
                 str(len(not_processed_files)) + " newly crawled files")
108 110

  
109 111

  
110
def process_data_crone(config):
112
def process_data_crone(config: ConfigType) -> None:
111 113
    """
112 114
    Goes trough every not processed file(list of processed files is saved in database)
113 115
    Imports dataset processor in DatasetProcessing/"dataset_name"_processor.py
......
146 148
                 str(len(not_processed_files)) + " newly crawled files")
147 149

  
148 150

  
149
def validate_process_data(config):
151
def validate_process_data(config: ConfigType) -> bool:
150 152
    """
151 153
    Function goes through newly processed data and checks theirs status
152 154

  
......
186 188
    return True
187 189

  
188 190

  
189
def load_data_to_database(config):
191
def load_data_to_database(config: ConfigType) -> None:
190 192
    """
191 193
    Goes trough every not loaded file(list of loaded files is saved in database)
192 194
    loads data appends coordination from configurations
......
207 209
    changes_in_devices = database_loader.update_devices_collection(config)
208 210

  
209 211
    if changes_in_devices == True:
210
        logging.info(
211
            dataset_name +
212
            " contains changes in devices configuration. Deleting old data and preparing new"
213
        )
212
        logg_string = dataset_name + " contains changes in devices configuration. Deleting old data and preparing new"
213
        logg_string_cs = dataset_name + " obsahuje změny v konfiguračním souboru. Probíha odstraňování starých dat a připravení nových."
214
        logging.info(logg_string)
215
        print(logg_string_cs)
214 216
        database_loader.reset_dataset_database(dataset_name)
215 217

  
216 218
    # get all unprocessed files from dataset
......
230 232
        database_record_logs.update_ignore_set_loaded(dataset_name,
231 233
                                                      not_loaded_file)
232 234

  
233
    logging.info(dataset_name + " has loaded to database " +
234
                 str(len(not_loaded_files)) + " newly processed files.")
235
    logg_string = dataset_name + " has loaded to database " + str(
236
        len(not_loaded_files)) + " newly processed files."
237
    logg_string_cs = dataset_name + " načetl " + str(
238
        len(not_loaded_files)) + " nových zpracovaných souborů \n"
239

  
240
    logging.info(logg_string)
241
    print(logg_string_cs)
235 242

  
236 243
    client = pymongo.MongoClient()
237 244
    client.close()
238 245

  
239 246

  
240
def load_data_to_database_crone(config):
247
def load_data_to_database_crone(config: ConfigType) -> None:
241 248
    """
242 249
    Goes trough every not loaded file(list of loaded files is saved in database)
243 250
    loads data appends coordination from configurations
......
280 287
    client.close()
281 288

  
282 289

  
283
def run_full_pipeline(dataset_name):
290
def run_full_pipeline(dataset_name: str) -> None:
284 291
    """
285 292
    Loads config file and starts full pipeline
286 293
    -crawl data
......
292 299
    """
293 300
    logging.info("Starting pipeline for dataset " + dataset_name)
294 301
    print("Zpracovávám dataset " + dataset_name +
295
          " průběh lze sledovat v logu umístěném v in CrawlerLogs folder")
302
          ", průběh lze sledovat v logu umístěném v adresáři CrawlerLogs")
296 303

  
297 304
    config = configure_functions.load_configuration(dataset_name)
298 305
    crawl_data(config)
......
304 311
        load_data_to_database(config)
305 312

  
306 313

  
307
def run_full_pipeline_crone(dataset_name):
314
def run_full_pipeline_crone(dataset_name: str) -> None:
308 315
    """
309 316
    Loads config file and starts full pipeline
310 317
    -crawl data
modules/crawler/prepare_new_dataset.py
1 1
import os
2

  
3 2
# Path to crawled data
4 3
CRAWLED_DATA_PATH = "CrawledData/"
5 4
# Path to processed data
......
14 13
DEFAULT_COLOR = "#000000"
15 14

  
16 15

  
17
def create_default_config_file(dataset_name: str):
16
def create_default_config_file(dataset_name: str) -> None:
18 17
    """
19 18
    Creates default config file
20 19

  
......
22 21
        dataset_name: Name of newly created dataset
23 22
    """
24 23
    with open(CONFIG_FILES_PATH + "/" + dataset_name + ".yaml", "w") as file:
25
        file.write("# jmeno datasetu, pod kterym bude zobrazen v aplikaci\n")
24
        file.write("# Name of the dataset inside the application\n")
26 25
        file.write("display-name: " + dataset_name + "\n")
27 26
        file.write(
28
            "# jednoslovný název datasetu, pod kterym bude reprezentovana v architektuře\n")
29
        file.write("display-color: " + DEFAULT_COLOR + "\n")
27
            "# Color for the dataset in a hex value (default value #000000)\n")
28
        file.write(f'display-color: \'{DEFAULT_COLOR}\' \n')
30 29
        file.write(
31
            "# barva pro tento dataset v hexadecimální hodnotě (#000000)\n")
30
            "# One word dataset name (structure of all modules will be affected by this)\n"
31
        )
32 32
        file.write("dataset-name: " + dataset_name + "\n")
33
        file.write("# root slozka, ktera obsahuje odkazy na dataset\n")
34
        file.write("url: ZDE VLOZTE URL\n")
33
        file.write("# Url for the source of this dataset\n")
34
        file.write("url: ENTER URL HERE\n")
35
        file.write(
36
            "# Optional parameter which specifies a pattern of the datasets name\n"
37
        )
38
        file.write(
39
            "# Example: DATASET_NAME_[0-9][0-9]_[0-9][0-9][0-9][0-9].zip\n")
40
        file.write(
41
            "# - DATASET_NAME_01_2020.zip where '01_2020' specifies date in this dataset\n"
42
        )
43
        file.write("regex: ENTER REGEX HERE\n")
35 44
        file.write(
36
            "# volitelny parameter, ktery specifikuje vzor jmrna datasetu, ktera se budou stahovat\n")
37
        file.write("regex: ZDE VLOZTE REGEX\n")
38
        file.write("# volitelny parametr, ktery udava jak casto se budou hledat nove datasety, pokud prazdne, "
39
                   "tak defaultni hodnota (dny)\n")
40
        file.write("update-period: ZDE VLOZTE HODNOTU\n")
41
        file.write("# pozice jednotlivych zarizeni, ktera jsou v datasetu\n")
45
            "# Optional parameter which specifies the way of searching new datasets (if empty the period is set to every day)\n"
46
        )
47
        file.write("update-period: ENTER UPDATE PERIOD HERE\n")
48
        file.write("# Coordinates of every datasets device (entinty)\n")
42 49
        file.write("devices:\n")
43 50

  
44 51

  
45
def create_default_processor(dataset_name):
52
def create_default_processor(dataset_name: str) -> None:
46 53
    """
47 54
    Creates default processor for dataset
48 55

  
49 56
    Args:
50 57
        dataset_name: Name of newly created dataset
51 58
    """
52
    with open(PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "_processor.py", "w") as file:
53
        file.write("from Utilities.CSV import csv_data_line")
59
    with open(PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "_processor.py",
60
              "w") as file:
61
        file.write("from Utilities.CSV import csv_data_line\n")
62
        file.write("from shared_types import DateDict")
54 63
        file.write("\n")
55 64
        file.write("\n")
56
        file.write("def process_file(filename):\n")
65
        file.write("def process_file(filename: str) -> DateDict:\n")
57 66
        file.write("    \"\"\"\n")
58 67
        file.write(
59
            "    Method that take path to crawled file and outputs date dictionary:\n")
68
            "    Method that takes the path to crawled file and outputs date dictionary:\n"
69
        )
60 70
        file.write(
61
            "    Date dictionary is a dictionary where keys are dates in format YYYY-mm-dd-hh (2018-04-08-15)\n")
71
            "    Date dictionary is a dictionary where keys are dates in format YYYY-mm-dd-hh (2018-04-08-15)\n"
72
        )
62 73
        file.write(
63
            "    and value is dictionary where keys are devices (specified in configuration file)\n")
74
            "    and value is dictionary where keys are devices (specified in configuration file)\n"
75
        )
64 76
        file.write(
65
            "    and value is CSVDataLine.csv_data_line with device,date and occurrence\n")
77
            "    and value is CSVDataLine.csv_data_line with device,date and occurrence\n"
78
        )
66 79
        file.write("\n")
67 80
        file.write("    Args:\n")
68
        file.write("    filename: name of processed file\n")
81
        file.write("    filename: name of the processed file\n")
69 82
        file.write("\n")
70 83
        file.write("    Returns:\n")
71 84
        file.write("    None if not implemented\n")
72 85
        file.write("    date_dict when implemented\n")
73 86
        file.write("    \"\"\"\n")
74
        file.write("    date_dict = dict()\n")
87
        file.write("    date_dict: DateDict = {}\n")
75 88
        file.write("\n")
76 89
        file.write("    #with open(filename, \"r\") as file:\n")
77 90
        file.write(
78
            "    print(\"You must implements process_file method first!\")\n")
79
        file.write("    return None\n")
91
            "    print(\"You must implement the process_file method first!\")\n"
92
        )
93
        file.write("    return date_dict\n")
80 94

  
81 95

  
82
def create_default_crawler(dataset_name):
96
def create_default_crawler(dataset_name: str) -> None:
83 97
    """
84 98
    Creates default crawler for dataset
85 99

  
......
87 101
        dataset_name: Name of newly created dataset
88 102
    """
89 103

  
90
    with open(CRAWLER_PROGRAM_PATH + "/" + dataset_name + "_crawler.py", "w") as file:
104
    with open(CRAWLER_PROGRAM_PATH + "/" + dataset_name + "_crawler.py",
105
              "w") as file:
106
        file.write("from shared_types import ConfigType\n")
91 107
        file.write("# Path to crawled data\n")
92
        file.write("CRAWLED_DATA_PATH = \"CrawledData/\"\n")
108
        file.write(f'CRAWLED_DATA_PATH = "{CRAWLED_DATA_PATH}" \n')
93 109
        file.write("\n")
94 110
        file.write("\n")
95
        file.write("def crawl(config):\n")
111
        file.write("def crawl(config: ConfigType):\n")
96 112
        file.write("    \"\"\"\n")
97 113
        file.write(
98
            "    Implement crawl method that downloads new data to path_for_files\n")
114
            "    Implementation the crawl method which downloads new data to the path_for_files\n"
115
        )
99 116
        file.write("    For keeping the project structure\n")
100 117
        file.write("    url , regex, and dataset_name from config\n")
101 118
        file.write(
102
            "    You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py\n")
119
            "    You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py\n"
120
        )
103 121
        file.write("\n")
104 122
        file.write("    Args:\n")
105 123
        file.write("        config: loaded configuration file of dataset\n")
......
109 127
        file.write("    regex = config['regex']\n")
110 128
        file.write(
111 129
            "    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'\n")
112
        file.write("    print(\"You must implements Crawl method first!\")\n")
130
        file.write(
131
            "    print(\"Není implementován crawler pro získávání dat!\")\n")
113 132

  
114 133

  
115
def prepare_dataset_structure(dataset_name):
134
def prepare_dataset_structure(dataset_name: str) -> None:
116 135
    """
117 136
    Prepares folders for new dataset
118 137
    Args:
......
120 139
    """
121 140

  
122 141
    # create folder for crawled data
123
    path = CRAWLED_DATA_PATH+dataset_name
142
    path = CRAWLED_DATA_PATH + dataset_name
124 143
    try:
125 144
        os.mkdir(path)
126 145
    except os.error as e:
......
132 151
    try:
133 152
        os.mkdir(path)
134 153
    except OSError:
135
        print("Creation of the directory %s failed" % path)
154
        print("Nelze vytvořit adresář %s" % path)
136 155

  
137 156
    create_default_crawler(dataset_name)
138 157
    create_default_processor(dataset_name)
139 158
    create_default_config_file(dataset_name)
140 159

  
141 160

  
142
print("Zadejte jméno nového datasetu:\n")
161
def main() -> None:
162
    print("Zadejte jméno nového datasetu:\n")
163
    dataset_name = input().upper()
164

  
165
    if dataset_name.isalpha():
166
        prepare_dataset_structure(dataset_name)
167
        print("Architektura vytvořena \n")
168
    else:
169
        print("Jméno musí obsahovat pouze písmena z abecedy (bez mezer)\n")
143 170

  
144
dataset_name = input().upper()
145 171

  
146
if dataset_name.isalpha():
147
    prepare_dataset_structure(dataset_name)
148
    print("Architektura vytvořena \n")
149
else:
150
    print("Jméno musí obsahovat pouze písmena z abecedy (bez mezer)\n")
172
if __name__ == "__main__":
173
    main()
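
For reference, the default configuration file written by create_default_config_file after this change should look roughly as follows (the dataset name KOLOBEZKY is only an example):

# Name of the dataset inside the application
display-name: KOLOBEZKY
# Color for the dataset in a hex value (default value #000000)
display-color: '#000000'
# One word dataset name (structure of all modules will be affected by this)
dataset-name: KOLOBEZKY
# Url for the source of this dataset
url: ENTER URL HERE
# Optional parameter which specifies a pattern of the datasets name
# Example: DATASET_NAME_[0-9][0-9]_[0-9][0-9][0-9][0-9].zip
# - DATASET_NAME_01_2020.zip where '01_2020' specifies date in this dataset
regex: ENTER REGEX HERE
# Optional parameter which specifies the way of searching new datasets (if empty the period is set to every day)
update-period: ENTER UPDATE PERIOD HERE
# Coordinates of every datasets device (entinty)
devices: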
modules/crawler/python.code-workspace
12 12
		"python.linting.pylintEnabled": true,
13 13
		"python.linting.enabled": true,
14 14
		"python.linting.pylintPath": "pylint",
15
		"python.pythonPath": "/usr/local/bin/python",
16 15
		"python.formatting.provider": "yapf",
17 16
	},
18 17
	"extensions": {
... The diff has been truncated because its length exceeds the maximum limit.

Also available: Unified diff