Revision af7609b5
Added by Tomáš Ballák about 4 years ago
modules/crawler/DatasetCrawler/JIS_crawler.py | ||
---|---|---|
1 | 1 |
from Utilities import folder_processor |
2 | 2 |
from Utilities.Crawler import basic_crawler_functions |
3 |
|
|
3 |
from shared_types import ConfigType |
|
4 | 4 |
# Path to crawled data |
5 | 5 |
CRAWLED_DATA_PATH = "CrawledData/" |
6 | 6 |
|
7 | 7 |
|
8 |
def crawl(config): |
|
8 |
def crawl(config: ConfigType):
|
|
9 | 9 |
""" |
10 | 10 |
Implement crawl method that downloads new data to path_for_files |
11 | 11 |
For keeping the project structure |
... | ... | |
21 | 21 |
path_for_files = CRAWLED_DATA_PATH + dataset_name + '/' |
22 | 22 |
|
23 | 23 |
first_level_links = basic_crawler_functions.get_all_links(url) |
24 |
filtered_first_level_links = basic_crawler_functions.filter_links(first_level_links, "^OD_ZCU") |
|
25 |
absolute_first_level_links = basic_crawler_functions.create_absolute_links(filtered_first_level_links, url) |
|
24 |
filtered_first_level_links = basic_crawler_functions.filter_links( |
|
25 |
first_level_links, "^OD_ZCU") |
|
26 |
absolute_first_level_links = basic_crawler_functions.create_absolute_links( |
|
27 |
filtered_first_level_links, url) |
|
26 | 28 |
|
27 | 29 |
files = [] |
28 | 30 |
|
29 | 31 |
for link in absolute_first_level_links: |
30 | 32 |
second_level_links = basic_crawler_functions.get_all_links(link) |
31 |
filtered_second_level_links = basic_crawler_functions.filter_links(second_level_links, regex) |
|
32 |
absolute_second_level_links = basic_crawler_functions.create_absolute_links(filtered_second_level_links, link) |
|
33 |
filtered_second_level_links = basic_crawler_functions.filter_links( |
|
34 |
second_level_links, regex) |
|
35 |
absolute_second_level_links = basic_crawler_functions.create_absolute_links( |
|
36 |
filtered_second_level_links, link) |
|
33 | 37 |
|
34 | 38 |
for file_link in absolute_second_level_links: |
35 | 39 |
files.append(file_link) |
36 | 40 |
|
37 |
files = basic_crawler_functions.remove_downloaded_links(files, dataset_name) |
|
41 |
files = basic_crawler_functions.remove_downloaded_links( |
|
42 |
files, dataset_name) |
|
38 | 43 |
|
39 | 44 |
for file in files: |
40 | 45 |
basic_crawler_functions.download_file_from_url(file, dataset_name) |
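Note: the crawlers now import ConfigType from a new shared_types module that is not part of this revision. A minimal sketch of what that module plausibly contains, inferred from how the aliases are used across the diff (an assumption, the real file may differ):

```python
# shared_types.py -- hypothetical sketch; the actual module is not shown in this revision.
# Inferred from usage: a loaded YAML config is a plain dict, a processed "date dictionary"
# maps date strings to per-device CSV lines, and the ignore sets are sets of strings.
from typing import Any, Dict, Set

ConfigType = Dict[str, Any]            # parsed DatasetConfigs/<name>.yaml
DateDict = Dict[str, Dict[str, Any]]   # "YYYY-mm-dd-hh" -> device name -> CSVDataLine
StringSetType = Set[str]               # e.g. already downloaded/processed file names
```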
modules/crawler/DatasetCrawler/KOLOBEZKY_crawler.py | ||
---|---|---|
1 | 1 |
from Utilities import folder_processor |
2 | 2 |
from Utilities.Crawler import basic_crawler_functions |
3 |
|
|
3 |
from shared_types import ConfigType |
|
4 | 4 |
# Path to crawled data |
5 | 5 |
CRAWLED_DATA_PATH = "CrawledData/" |
6 | 6 |
|
7 | 7 |
|
8 |
def crawl(config): |
|
8 |
def crawl(config: ConfigType):
|
|
9 | 9 |
""" |
10 | 10 |
Implement crawl method that downloads new data to path_for_files |
11 | 11 |
For keeping the project structure |
... | ... | |
21 | 21 |
path_for_files = CRAWLED_DATA_PATH + dataset_name + '/' |
22 | 22 |
|
23 | 23 |
first_level_links = basic_crawler_functions.get_all_links(url) |
24 |
filtered_first_level_links = basic_crawler_functions.filter_links(first_level_links, "^OD_ZCU") |
|
25 |
absolute_first_level_links = basic_crawler_functions.create_absolute_links(filtered_first_level_links, url) |
|
24 |
filtered_first_level_links = basic_crawler_functions.filter_links( |
|
25 |
first_level_links, "^OD_ZCU") |
|
26 |
absolute_first_level_links = basic_crawler_functions.create_absolute_links( |
|
27 |
filtered_first_level_links, url) |
|
26 | 28 |
|
27 | 29 |
files = [] |
28 | 30 |
|
29 | 31 |
for link in absolute_first_level_links: |
30 | 32 |
second_level_links = basic_crawler_functions.get_all_links(link) |
31 |
filtered_second_level_links = basic_crawler_functions.filter_links(second_level_links, regex) |
|
32 |
absolute_second_level_links = basic_crawler_functions.create_absolute_links(filtered_second_level_links, link) |
|
33 |
filtered_second_level_links = basic_crawler_functions.filter_links( |
|
34 |
second_level_links, regex) |
|
35 |
absolute_second_level_links = basic_crawler_functions.create_absolute_links( |
|
36 |
filtered_second_level_links, link) |
|
33 | 37 |
|
34 | 38 |
for file_link in absolute_second_level_links: |
35 | 39 |
files.append(file_link) |
36 | 40 |
|
37 |
files = basic_crawler_functions.remove_downloaded_links(files, dataset_name) |
|
41 |
files = basic_crawler_functions.remove_downloaded_links( |
|
42 |
files, dataset_name) |
|
38 | 43 |
|
39 | 44 |
for file in files: |
40 | 45 |
basic_crawler_functions.download_file_from_url(file, dataset_name) |
modules/crawler/DatasetCrawler/OBSAZENIMISTNOSTI_crawler.py | ||
---|---|---|
1 | 1 |
from Utilities import folder_processor |
2 | 2 |
from Utilities.Crawler import basic_crawler_functions |
3 |
from shared_types import ConfigType |
|
3 | 4 |
|
4 | 5 |
# Path to crawled data |
5 | 6 |
CRAWLED_DATA_PATH = "CrawledData/" |
6 | 7 |
|
7 | 8 |
|
8 |
def crawl(config): |
|
9 |
def crawl(config: ConfigType):
|
|
9 | 10 |
""" |
10 | 11 |
Implement crawl method that downloads new data to path_for_files |
11 | 12 |
For keeping the project structure |
modules/crawler/DatasetCrawler/WIFI_crawler.py | ||
---|---|---|
1 | 1 |
from Utilities import folder_processor |
2 | 2 |
from Utilities.Crawler import basic_crawler_functions |
3 |
from shared_types import ConfigType |
|
3 | 4 |
|
4 | 5 |
# Path to crawled data |
5 | 6 |
CRAWLED_DATA_PATH = "CrawledData/" |
6 | 7 |
|
7 | 8 |
|
8 |
def crawl(config): |
|
9 |
def crawl(config: ConfigType):
|
|
9 | 10 |
""" |
10 | 11 |
Implement crawl method that downloads new data to path_for_files |
11 | 12 |
For keeping the project structure |
... | ... | |
21 | 22 |
path_for_files = CRAWLED_DATA_PATH + dataset_name + '/' |
22 | 23 |
|
23 | 24 |
first_level_links = basic_crawler_functions.get_all_links(url) |
24 |
filtered_first_level_links = basic_crawler_functions.filter_links(first_level_links, "^OD_ZCU") |
|
25 |
absolute_first_level_links = basic_crawler_functions.create_absolute_links(filtered_first_level_links, url) |
|
25 |
filtered_first_level_links = basic_crawler_functions.filter_links( |
|
26 |
first_level_links, "^OD_ZCU") |
|
27 |
absolute_first_level_links = basic_crawler_functions.create_absolute_links( |
|
28 |
filtered_first_level_links, url) |
|
26 | 29 |
|
27 | 30 |
files = [] |
28 | 31 |
|
29 | 32 |
for link in absolute_first_level_links: |
30 | 33 |
second_level_links = basic_crawler_functions.get_all_links(link) |
31 |
filtered_second_level_links = basic_crawler_functions.filter_links(second_level_links, regex) |
|
32 |
absolute_second_level_links = basic_crawler_functions.create_absolute_links(filtered_second_level_links, link) |
|
34 |
filtered_second_level_links = basic_crawler_functions.filter_links( |
|
35 |
second_level_links, regex) |
|
36 |
absolute_second_level_links = basic_crawler_functions.create_absolute_links( |
|
37 |
filtered_second_level_links, link) |
|
33 | 38 |
|
34 | 39 |
for file_link in absolute_second_level_links: |
35 | 40 |
files.append(file_link) |
36 | 41 |
|
37 |
files = basic_crawler_functions.remove_downloaded_links(files, dataset_name) |
|
42 |
files = basic_crawler_functions.remove_downloaded_links( |
|
43 |
files, dataset_name) |
|
38 | 44 |
|
39 | 45 |
for file in files: |
40 | 46 |
basic_crawler_functions.download_file_from_url(file, dataset_name) |
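All three dataset crawlers in this revision (JIS, KOLOBEZKY, WIFI) get the same treatment: a ConfigType hint and the long helper calls wrapped over two lines. For reference, a condensed sketch of the shared two-level crawl flow; the config keys (dataset-name, url, regex) follow the generated crawler template shown later in this revision, and the helper bodies are assumed to behave as their names and docstrings describe.

```python
from Utilities.Crawler import basic_crawler_functions
from shared_types import ConfigType

CRAWLED_DATA_PATH = "CrawledData/"


def crawl(config: ConfigType) -> None:
    dataset_name = config["dataset-name"]
    url = config["url"]
    regex = config["regex"]

    # first level: archive folders whose names match ^OD_ZCU
    first_level = basic_crawler_functions.get_all_links(url)
    first_level = basic_crawler_functions.filter_links(first_level, "^OD_ZCU")
    first_level = basic_crawler_functions.create_absolute_links(first_level, url)

    # second level: files inside each folder matching the dataset regex
    files = []
    for link in first_level:
        second_level = basic_crawler_functions.get_all_links(link)
        second_level = basic_crawler_functions.filter_links(second_level, regex)
        second_level = basic_crawler_functions.create_absolute_links(second_level, link)
        files.extend(second_level)

    # skip links already recorded in the database, then download the rest
    files = basic_crawler_functions.remove_downloaded_links(files, dataset_name)
    for file in files:
        basic_crawler_functions.download_file_from_url(file, dataset_name)
```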
modules/crawler/DatasetProcessing/JIS_processor.py | ||
---|---|---|
1 | 1 |
from Utilities.CSV import csv_data_line |
2 | 2 |
from Utilities import date_formating |
3 | 3 |
|
4 |
from shared_types import DateDict |
|
4 | 5 |
|
5 |
def process_file(filename): |
|
6 |
|
|
7 |
def process_file(filename: str) -> DateDict: |
|
6 | 8 |
""" |
7 | 9 |
Method that take path to crawled file and outputs date dictionary: |
8 | 10 |
Date dictionary is a dictionary where keys are dates in format YYYY-mm-dd-hh (2018-04-08-15) |
... | ... | |
16 | 18 |
None if not implemented |
17 | 19 |
date_dict when implemented |
18 | 20 |
""" |
19 |
date_dict = dict()
|
|
21 |
date_dict = {}
|
|
20 | 22 |
|
21 | 23 |
with open(filename, "r", encoding="utf-8") as file: |
22 | 24 |
|
... | ... | |
29 | 31 |
occurrence = array[2][:-1] |
30 | 32 |
|
31 | 33 |
if date not in date_dict: |
32 |
date_dict[date] = dict()
|
|
34 |
date_dict[date] = {}
|
|
33 | 35 |
|
34 | 36 |
if name in date_dict[date]: |
35 | 37 |
date_dict[date][name].occurrence += int(occurrence) |
36 | 38 |
else: |
37 |
date_dict[date][name] = csv_data_line.CSVDataLine(name, date, occurrence) |
|
39 |
date_dict[date][name] = csv_data_line.CSVDataLine( |
|
40 |
name, date, occurrence) |
|
38 | 41 |
|
39 | 42 |
return date_dict |
40 |
|
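The processors (JIS, KOLOBEZKY, OBSAZENIMISTNOSTI, WIFI) share the same aggregation into the returned DateDict. A tiny self-contained illustration of that shape with a made-up device name; the real processors fill it while parsing the crawled file:

```python
from Utilities.CSV import csv_data_line
from shared_types import DateDict


def add_record(date_dict: DateDict, date: str, name: str, occurrence: int) -> None:
    # group by date key "YYYY-mm-dd-hh", then by device name
    if date not in date_dict:
        date_dict[date] = {}
    if name in date_dict[date]:
        date_dict[date][name].occurrence += occurrence
    else:
        date_dict[date][name] = csv_data_line.CSVDataLine(name, date, occurrence)


date_dict: DateDict = {}
add_record(date_dict, "2018-04-08-15", "JIS-turnstile-1", 3)  # hypothetical device
add_record(date_dict, "2018-04-08-15", "JIS-turnstile-1", 2)  # merges into occurrence 5
```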
modules/crawler/DatasetProcessing/KOLOBEZKY_processor.py | ||
---|---|---|
1 | 1 |
from Utilities.CSV import csv_data_line |
2 | 2 |
from Utilities import date_formating |
3 | 3 |
|
4 |
from shared_types import DateDict |
|
4 | 5 |
|
5 |
def process_file(filename): |
|
6 |
|
|
7 |
def process_file(filename: str) -> DateDict: |
|
6 | 8 |
""" |
7 | 9 |
Method that take path to crawled file and outputs date dictionary: |
8 | 10 |
Date dictionary is a dictionary where keys are dates in format YYYY-mm-dd-hh (2018-04-08-15) |
... | ... | |
16 | 18 |
None if not implemented |
17 | 19 |
date_dict when implemented |
18 | 20 |
""" |
19 |
date_dict = dict()
|
|
21 |
date_dict = {}
|
|
20 | 22 |
|
21 | 23 |
with open(filename, "r") as file: |
22 | 24 |
|
... | ... | |
28 | 30 |
name = array[1][1:-1] |
29 | 31 |
|
30 | 32 |
if date not in date_dict: |
31 |
date_dict[date] = dict()
|
|
33 |
date_dict[date] = {}
|
|
32 | 34 |
|
33 | 35 |
if name in date_dict[date]: |
34 | 36 |
date_dict[date][name].occurrence += 1 |
35 | 37 |
else: |
36 |
date_dict[date][name] = csv_data_line.CSVDataLine(name, date, 1) |
|
38 |
date_dict[date][name] = csv_data_line.CSVDataLine( |
|
39 |
name, date, 1) |
|
37 | 40 |
|
38 | 41 |
return date_dict |
39 |
|
modules/crawler/DatasetProcessing/OBSAZENIMISTNOSTI_processor.py | ||
---|---|---|
5 | 5 |
import time |
6 | 6 |
import datetime |
7 | 7 |
|
8 |
from shared_types import DateDict |
|
9 |
|
|
8 | 10 |
logging.basicConfig(filename='../../CrawlerLogs' + 'Crawlerlog-' + |
9 | 11 |
date.today().strftime("%b-%Y") + '.log', |
10 | 12 |
level=logging.INFO, |
11 | 13 |
format='%(asctime)s %(message)s') |
12 | 14 |
|
13 | 15 |
|
14 |
def process_file(filename):
|
|
16 |
def process_file(filename: str) -> DateDict:
|
|
15 | 17 |
""" |
16 | 18 |
Method that take path to crawled file and outputs date dictionary: |
17 | 19 |
Date dictionary is a dictionary where keys are dates in format YYYY-mm-dd-hh (2018-04-08-15) |
... | ... | |
25 | 27 |
None if not implemented |
26 | 28 |
date_dict when implemented |
27 | 29 |
""" |
28 |
date_dict = dict()
|
|
30 |
date_dict = {}
|
|
29 | 31 |
|
30 | 32 |
with open(filename, "r") as file: |
31 | 33 |
|
modules/crawler/DatasetProcessing/WIFI_processor.py | ||
---|---|---|
1 | 1 |
from Utilities.CSV import csv_data_line |
2 | 2 |
from Utilities import date_formating |
3 |
from shared_types import DateDict |
|
3 | 4 |
|
4 | 5 |
|
5 |
def process_file(filename):
|
|
6 |
def process_file(filename: str) -> DateDict:
|
|
6 | 7 |
""" |
7 | 8 |
Method that take path to crawled file and outputs date dictionary: |
8 | 9 |
Date dictionary is a dictionary where keys are dates in format YYYY-mm-dd-hh (2018-04-08-15) |
modules/crawler/Utilities/CSV/csv_data_line.py | ||
---|---|---|
3 | 3 |
Class that specifies the look of data line in processed csv file |
4 | 4 |
prepared for database |
5 | 5 |
""" |
6 |
|
|
7 |
def __init__(self, name, date, occurrence): |
|
6 |
def __init__(self, name: str, date: str, occurrence: int) -> None: |
|
8 | 7 |
try: |
9 | 8 |
test_val = int(occurrence) |
10 | 9 |
except ValueError: |
11 | 10 |
print("Occurence should be and integer value!") |
12 | 11 |
|
13 | 12 |
if len(date) != 13: |
14 |
raise ValueError("Invalid date format YYYY-dd-mm-hh expected!")
|
|
13 |
raise ValueError("Invalid date format YYYY-dd-mm-hh expected!") |
|
15 | 14 |
|
16 | 15 |
self.name = name |
17 | 16 |
self.date = date |
18 | 17 |
self.occurrence = test_val |
19 | 18 |
|
20 |
def to_csv(self): |
|
19 |
def to_csv(self) -> str:
|
|
21 | 20 |
return self.name + ";" + str(self.occurrence) + ";" + self.date |
22 |
|
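csv_data_line.py now types the constructor and to_csv(). Two things worth flagging: on a non-integer occurrence the current code only prints and then reads an unbound test_val, and the error message says YYYY-dd-mm-hh while the processors document YYYY-mm-dd-hh. A sketch that keeps the new signatures but raises on bad input:

```python
class CSVDataLine:
    """One line of the processed CSV prepared for the database."""

    def __init__(self, name: str, date: str, occurrence: int) -> None:
        try:
            self.occurrence = int(occurrence)
        except ValueError:
            raise ValueError("Occurrence should be an integer value!")

        # date keys are 13 characters long: YYYY-mm-dd-hh
        if len(date) != 13:
            raise ValueError("Invalid date format, YYYY-mm-dd-hh expected!")

        self.name = name
        self.date = date

    def to_csv(self) -> str:
        return self.name + ";" + str(self.occurrence) + ";" + self.date
```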
modules/crawler/Utilities/CSV/csv_utils.py | ||
---|---|---|
1 | 1 |
import inspect |
2 |
from shared_types import StringSetType |
|
2 | 3 |
from Utilities.CSV import csv_data_line |
3 | 4 |
|
4 | 5 |
# Path to processed data |
5 | 6 |
PROCESSED_DATA_PATH = "ProcessedData/" |
6 | 7 |
|
7 | 8 |
|
8 |
def get_unique_names_from_file(filename, column_number): |
|
9 |
def get_unique_names_from_file(filename: str, |
|
10 |
column_number: int) -> StringSetType: |
|
9 | 11 |
""" |
10 | 12 |
Extract set of unique names from file |
11 | 13 |
Args: |
... | ... | |
29 | 31 |
return name_set |
30 | 32 |
|
31 | 33 |
|
32 |
def export_data_to_csv(filename, data_dict):
|
|
34 |
def export_data_to_csv(filename: str, data_dict) -> None:
|
|
33 | 35 |
""" |
34 | 36 |
Takes data_dict and export it into a csv file |
35 | 37 |
Args: |
... | ... | |
40 | 42 |
|
41 | 43 |
for date in data_dict: |
42 | 44 |
if len(date) != 13: |
43 |
raise ValueError("Invalid date format for key value --> YYYY-mm-dd-hh expected!") |
|
45 |
raise ValueError( |
|
46 |
"Invalid date format for key value --> YYYY-mm-dd-hh expected!" |
|
47 |
) |
|
44 | 48 |
for data in data_dict[date]: |
45 | 49 |
csv_line = data_dict[date][data] |
46 |
if not isinstance(csv_line,csv_data_line.CSVDataLine): |
|
47 |
raise ValueError("data_dict is expected to have CSVDataLine as values") |
|
50 |
if not isinstance(csv_line, csv_data_line.CSVDataLine): |
|
51 |
raise ValueError( |
|
52 |
"data_dict is expected to have CSVDataLine as values") |
|
48 | 53 |
file.write(csv_line.to_csv() + '\n') |
modules/crawler/Utilities/Crawler/basic_crawler_functions.py | ||
---|---|---|
3 | 3 |
from Utilities import folder_processor |
4 | 4 |
from Utilities.Database import database_record_logs |
5 | 5 |
from bs4 import BeautifulSoup |
6 |
from typing import List |
|
6 | 7 |
|
7 | 8 |
# Path to crawler logs |
8 | 9 |
CRAWLER_LOGS_PATH = "CrawlerLogs/" |
9 | 10 |
# Path to crawled data |
10 | 11 |
CRAWLED_DATA_PATH = "CrawledData/" |
12 |
LinksType = List[str] |
|
11 | 13 |
|
12 | 14 |
|
13 |
def get_all_links(url):
|
|
15 |
def get_all_links(url: str) -> LinksType:
|
|
14 | 16 |
""" |
15 | 17 |
Sends http request to url, downloads all data, |
16 | 18 |
extract links |
... | ... | |
34 | 36 |
return links |
35 | 37 |
|
36 | 38 |
|
37 |
def filter_links(links, regex):
|
|
39 |
def filter_links(links: LinksType, regex: str) -> LinksType:
|
|
38 | 40 |
""" |
39 | 41 |
Filters list of links using regex |
40 | 42 |
|
... | ... | |
54 | 56 |
return filtered_links |
55 | 57 |
|
56 | 58 |
|
57 |
def create_absolute_links(links, archive):
|
|
59 |
def create_absolute_links(links: LinksType, archive: str) -> LinksType:
|
|
58 | 60 |
""" |
59 | 61 |
Appends archive path to every link in links |
60 | 62 |
Args: |
... | ... | |
72 | 74 |
return absolute_links |
73 | 75 |
|
74 | 76 |
|
75 |
def remove_downloaded_links(links, dataset_name):
|
|
77 |
def remove_downloaded_links(links: LinksType, dataset_name: str) -> LinksType:
|
|
76 | 78 |
""" |
77 | 79 |
Loads already downloaded links from CRAWLER_LOGS_PATH ignore.txt |
78 | 80 |
Args: |
... | ... | |
88 | 90 |
return final_links |
89 | 91 |
|
90 | 92 |
|
91 |
def download_file_from_url(url, dataset_name):
|
|
93 |
def download_file_from_url(url: str, dataset_name: str) -> None:
|
|
92 | 94 |
""" |
93 | 95 |
Downloads file on provided url and saves it to path |
94 | 96 |
Args: |
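basic_crawler_functions.py introduces the LinksType = List[str] alias and types every helper; the bodies themselves are unchanged and mostly outside this diff. A minimal sketch of what filter_links() and create_absolute_links() plausibly do under those signatures (an assumption based on their docstrings, not the project code):

```python
import re
from typing import List

LinksType = List[str]


def filter_links(links: LinksType, regex: str) -> LinksType:
    # keep only links whose text matches the given pattern, e.g. "^OD_ZCU"
    return [link for link in links if re.search(regex, link)]


def create_absolute_links(links: LinksType, archive: str) -> LinksType:
    # prepend the archive URL so relative hrefs become downloadable URLs
    return [archive + link for link in links]
```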
modules/crawler/Utilities/Database/database_data_line.py | ||
---|---|---|
1 |
from typing import Dict |
|
2 |
|
|
3 |
|
|
1 | 4 |
class DatabaseDataLine: |
2 | 5 |
""" |
3 | 6 |
Class that specifies the look of data line in database |
4 | 7 |
""" |
5 |
def __init__(self, name, longitude, latitude, date, occurrence): |
|
8 |
def __init__(self, name: str, longitude: float, latitude: float, date: str, |
|
9 |
occurrence: int): |
|
6 | 10 |
self.name = name |
7 | 11 |
self.latitude = latitude |
8 | 12 |
self.longitude = longitude |
9 | 13 |
self.date = date |
10 | 14 |
self.occurrence = occurrence |
11 | 15 |
|
12 |
def to_dictionary(self): |
|
13 |
return {"place": self.name, "x": self.longitude, "y": self.latitude, "number": self.occurrence, |
|
14 |
"date": self.date} |
|
16 |
def to_dictionary(self) -> Dict[str, any]: |
|
17 |
return { |
|
18 |
"place": self.name, |
|
19 |
"x": self.longitude, |
|
20 |
"y": self.latitude, |
|
21 |
"number": self.occurrence, |
|
22 |
"date": self.date |
|
23 |
} |
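database_data_line.py reflows to_dictionary() into one entry per line. One note: Dict[str, any] annotates with the built-in any() function rather than typing.Any, which is what pylint and mypy expect; the sketch below uses Any.

```python
from typing import Any, Dict


class DatabaseDataLine:
    """Shape of one database record for a device measurement."""

    def __init__(self, name: str, longitude: float, latitude: float, date: str,
                 occurrence: int) -> None:
        self.name = name
        self.latitude = latitude
        self.longitude = longitude
        self.date = date
        self.occurrence = occurrence

    def to_dictionary(self) -> Dict[str, Any]:
        return {
            "place": self.name,
            "x": self.longitude,
            "y": self.latitude,
            "number": self.occurrence,
            "date": self.date,
        }
```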
modules/crawler/Utilities/Database/database_loader.py | ||
---|---|---|
1 | 1 |
from Utilities.Database import database_data_line, database_record_logs |
2 | 2 |
from Utilities import configure_functions |
3 | 3 |
from Utilities.helpers import should_skip, detect_change |
4 |
from shared_types import ConfigType |
|
5 |
from typing import Dict |
|
4 | 6 |
import pymongo |
5 | 7 |
import re |
6 | 8 |
|
... | ... | |
20 | 22 |
# Path to processed data |
21 | 23 |
PROCESSED_DATA_PATH = "ProcessedData/" |
22 | 24 |
|
25 |
DatabaseConnectionType = Dict[str, any] |
|
23 | 26 |
|
24 |
def create_database_connection(): |
|
27 |
|
|
28 |
def create_database_connection() -> pymongo.database.Database: |
|
25 | 29 |
""" |
26 | 30 |
Creates connection to mongoDB |
27 | 31 |
|
... | ... | |
38 | 42 |
return database |
39 | 43 |
|
40 | 44 |
|
41 |
def get_data_from_file(filename, config):
|
|
45 |
def get_data_from_file(filename: str, config: ConfigType) -> Dict[str, any]:
|
|
42 | 46 |
""" |
43 | 47 |
Opens processed file, reads it line by line |
44 | 48 |
name, ocurrence, date |
... | ... | |
59 | 63 |
f = open(dataset_path + filename, "r") |
60 | 64 |
|
61 | 65 |
devices = config["devices"] |
62 |
date_dict = dict()
|
|
66 |
date_dict = {}
|
|
63 | 67 |
|
64 | 68 |
for line in f: |
65 | 69 |
line = line[:-1] |
... | ... | |
86 | 90 |
return date_dict |
87 | 91 |
|
88 | 92 |
|
89 |
def load_data_to_database(database_connection, dataset_name, data_dic, |
|
90 |
file_name): |
|
93 |
def load_data_to_database(database_connection: DatabaseConnectionType, |
|
94 |
dataset_name: str, data_dic: Dict[str, any], |
|
95 |
file_name: str) -> None: |
|
91 | 96 |
""" |
92 | 97 |
Takes data_dic created in method get_data_from_file |
93 | 98 |
and loads into into database where collection name is dataset_name + data_dic key |
... | ... | |
107 | 112 |
date_dataset.insert_many(data_dic[date]) |
108 | 113 |
|
109 | 114 |
|
110 |
def check_or_update_datasets_collection(database_connection, config): |
|
115 |
def check_or_update_datasets_collection( |
|
116 |
database_connection: DatabaseConnectionType, config: ConfigType): |
|
111 | 117 |
""" |
112 | 118 |
Checks if DATASETS collection contains dataset and if display name was not updated |
113 | 119 |
|
... | ... | |
116 | 122 |
config: loaded configuration file of dataset |
117 | 123 |
""" |
118 | 124 |
# collection where are specified aviable datasets |
119 |
compareKeys = ['display-name', |
|
120 |
'display-color'] |
|
125 |
compareKeys = ['display-name', 'display-color'] |
|
121 | 126 |
collection_datasets = database_connection[MONGODB_DATASET_COLLECTION] |
122 | 127 |
|
123 | 128 |
query = {'key-name': config['dataset-name']} |
... | ... | |
139 | 144 |
collection_datasets.update_one(query, {"$set": newVal}) |
140 | 145 |
|
141 | 146 |
|
142 |
def update_devices_collection(config): |
|
147 |
def update_devices_collection(config: ConfigType):
|
|
143 | 148 |
""" |
144 | 149 |
Checks if there are any changes in devices specified in config file against |
145 | 150 |
devices processed and loaded into the database |
... | ... | |
164 | 169 |
|
165 | 170 |
devices_cursor = collection_devices.find() |
166 | 171 |
|
167 |
db_device_dict = dict()
|
|
172 |
db_device_dict = {}
|
|
168 | 173 |
|
169 | 174 |
for device in devices_cursor: |
170 | 175 |
name = device['name'] |
... | ... | |
208 | 213 |
return change_in_devices |
209 | 214 |
|
210 | 215 |
|
211 |
def remove_dataset_database(dataset_name): |
|
216 |
def remove_dataset_database(dataset_name: str):
|
|
212 | 217 |
""" |
213 | 218 |
Removes dataset entries from database |
214 | 219 |
Args: |
... | ... | |
221 | 226 |
collection_datasets = mydb[MONGODB_DATASET_COLLECTION] |
222 | 227 |
|
223 | 228 |
collection_datasets.delete_one({"key-name": dataset_name}) |
224 |
print("Removing record from DATASETS collection")
|
|
229 |
print("Odstraňování záznamu z DATASETS kolekce")
|
|
225 | 230 |
|
226 | 231 |
# Retrieve list of all collections |
227 | 232 |
collections = mydb.list_collection_names() |
... | ... | |
230 | 235 |
for name in collections: |
231 | 236 |
if name.startswith(dataset_name): |
232 | 237 |
mydb[name].drop() |
233 |
print("Dropping: " + name)
|
|
238 |
print("Odstraňuji: " + name)
|
|
234 | 239 |
|
235 | 240 |
|
236 |
def reset_dataset_database(dataset_name): |
|
241 |
def reset_dataset_database(dataset_name: str):
|
|
237 | 242 |
""" |
238 | 243 |
Reset dataset in database |
239 | 244 |
- delete everything from except crawled links and mention in DATASETS collection |
... | ... | |
252 | 257 |
for name in collections: |
253 | 258 |
if pattern.match(name): |
254 | 259 |
mydb[name].drop() |
255 |
print("Dropping: " + name)
|
|
260 |
print("Odstraňuji: " + name)
|
|
256 | 261 |
|
257 | 262 |
database_record_logs.reset_ignore_set_loaded(dataset_name) |
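database_loader.py adds a DatabaseConnectionType = Dict[str, any] alias while annotating create_database_connection() as returning pymongo.database.Database; the alias is only a loose stand-in for the same object. A minimal sketch of the connection helper under the new annotation; the URI and database name are assumptions, they are not part of this diff:

```python
import pymongo
from pymongo.database import Database

MONGODB_URI = "mongodb://localhost:27017/"   # assumed default, not in this diff
MONGODB_DATABASE = "open-data-db"            # hypothetical database name


def create_database_connection() -> Database:
    """Creates a connection to mongoDB and returns a handle to the database."""
    client = pymongo.MongoClient(MONGODB_URI)
    return client[MONGODB_DATABASE]
```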
modules/crawler/Utilities/Database/database_record_logs.py | ||
---|---|---|
1 | 1 |
from Utilities.Database import database_loader |
2 |
|
|
2 |
from shared_types import StringSetType |
|
3 | 3 |
# mongodb collection with with already downloaded links |
4 | 4 |
MONGODB_DATASET_LINK_COLLECTION = "LINKS" |
5 | 5 |
# mongodb collection with with already processed files |
... | ... | |
10 | 10 |
MONGODB_DATASET_COLLECTION = "DATASETS" |
11 | 11 |
|
12 | 12 |
|
13 |
def load_ignore_set_links(dataset_name):
|
|
13 |
def load_ignore_set_links(dataset_name: str) -> StringSetType:
|
|
14 | 14 |
""" |
15 | 15 |
Loades from database links of already downloaded files by crawler |
16 | 16 |
|
... | ... | |
32 | 32 |
return ignore_set |
33 | 33 |
|
34 | 34 |
|
35 |
def update_ignore_set_links(dataset_name,link):
|
|
35 |
def update_ignore_set_links(dataset_name: str, link: str) -> None:
|
|
36 | 36 |
""" |
37 | 37 |
Adds links of newly crawled files to the database |
38 | 38 |
|
... | ... | |
44 | 44 |
|
45 | 45 |
my_col = connection[dataset_name + MONGODB_DATASET_LINK_COLLECTION] |
46 | 46 |
|
47 |
my_col.insert({ "name": link})
|
|
47 |
my_col.insert({"name": link}) |
|
48 | 48 |
|
49 | 49 |
|
50 |
def reset_ignore_set_links(dataset_name):
|
|
50 |
def reset_ignore_set_links(dataset_name: str) -> None:
|
|
51 | 51 |
""" |
52 | 52 |
Drops collection of already downloaded links |
53 | 53 |
|
... | ... | |
62 | 62 |
my_col.drop() |
63 | 63 |
|
64 | 64 |
|
65 |
|
|
66 |
def load_ignore_set_processed(dataset_name): |
|
65 |
def load_ignore_set_processed(dataset_name: str) -> StringSetType: |
|
67 | 66 |
""" |
68 | 67 |
Loads from database set of already processed files |
69 | 68 |
|
... | ... | |
85 | 84 |
return ignore_set |
86 | 85 |
|
87 | 86 |
|
88 |
def update_ignore_set_processed(dataset_name,filename):
|
|
87 |
def update_ignore_set_processed(dataset_name: str, filename: str) -> None:
|
|
89 | 88 |
""" |
90 | 89 |
Adds files of newly processed files to the database |
91 | 90 |
|
... | ... | |
97 | 96 |
|
98 | 97 |
my_col = connection[dataset_name + MONGODB_DATASET_PROCESSED_COLLECTION] |
99 | 98 |
|
100 |
my_col.insert({ "name": filename}) |
|
101 |
|
|
99 |
my_col.insert({"name": filename}) |
|
102 | 100 |
|
103 | 101 |
|
104 |
def reset_ignore_set_processed(dataset_name):
|
|
102 |
def reset_ignore_set_processed(dataset_name: str) -> None:
|
|
105 | 103 |
""" |
106 | 104 |
Drops collection of already processed files |
107 | 105 |
|
... | ... | |
116 | 114 |
my_col.drop() |
117 | 115 |
|
118 | 116 |
|
119 |
|
|
120 |
def load_ignore_set_loaded(dataset_name): |
|
117 |
def load_ignore_set_loaded(dataset_name: str) -> StringSetType: |
|
121 | 118 |
""" |
122 | 119 |
Loads from database set of already loaded files in database |
123 | 120 |
|
... | ... | |
139 | 136 |
return ignore_set |
140 | 137 |
|
141 | 138 |
|
142 |
|
|
143 |
def update_ignore_set_loaded(dataset_name,filename): |
|
139 |
def update_ignore_set_loaded(dataset_name: str, filename: str) -> None: |
|
144 | 140 |
""" |
145 | 141 |
Adds files of newly loaded files to the database |
146 | 142 |
|
... | ... | |
152 | 148 |
|
153 | 149 |
my_col = connection[dataset_name + MONGODB_DATASET_LOADED_COLLECTION] |
154 | 150 |
|
155 |
my_col.insert({ "name": filename})
|
|
151 |
my_col.insert({"name": filename}) |
|
156 | 152 |
|
157 | 153 |
|
158 |
def reset_ignore_set_loaded(dataset_name):
|
|
154 |
def reset_ignore_set_loaded(dataset_name: str) -> None:
|
|
159 | 155 |
""" |
160 | 156 |
Drops collection of already loaded files |
161 | 157 |
|
... | ... | |
170 | 166 |
my_col.drop() |
171 | 167 |
|
172 | 168 |
|
173 |
def load_updated(dataset_name):
|
|
169 |
def load_updated(dataset_name: str) -> int:
|
|
174 | 170 |
""" |
175 | 171 |
Loads value of (days from last update) from db |
176 | 172 |
|
... | ... | |
184 | 180 |
|
185 | 181 |
my_col = connection[MONGODB_DATASET_COLLECTION] |
186 | 182 |
|
187 |
data = my_col.find_one({'key-name': dataset_name},{'updated'}) |
|
183 |
data = my_col.find_one({'key-name': dataset_name}, {'updated'})
|
|
188 | 184 |
|
189 | 185 |
updated = int(data['updated']) |
190 | 186 |
|
191 | 187 |
return updated |
192 | 188 |
|
193 | 189 |
|
194 |
def update_updated(dataset_name,value):
|
|
190 |
def update_updated(dataset_name: str, value: int):
|
|
195 | 191 |
""" |
196 | 192 |
Updates value of (days from last update) in db |
197 | 193 |
|
... | ... | |
203 | 199 |
|
204 | 200 |
my_col = connection[MONGODB_DATASET_COLLECTION] |
205 | 201 |
|
206 |
myquery = { 'key-name': dataset_name }
|
|
207 |
new_values = { "$set": { "updated": value } }
|
|
202 |
myquery = {'key-name': dataset_name}
|
|
203 |
new_values = {"$set": {"updated": value}}
|
|
208 | 204 |
|
209 |
my_col.update_one(myquery,new_values) |
|
205 |
my_col.update_one(myquery, new_values) |
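database_record_logs.py mostly gains type hints and spacing fixes. The query/update pattern used by update_updated() after the reformat is shown below; note that the ignore-set writers still call my_col.insert(...), which PyMongo deprecated in favour of insert_one() (and removed in PyMongo 4), so the sketch uses the newer call. The connection setup and database name here are assumptions:

```python
import pymongo

MONGODB_DATASET_COLLECTION = "DATASETS"
MONGODB_DATASET_LINK_COLLECTION = "LINKS"

# assumed connection; in the project it comes from database_loader.create_database_connection()
connection = pymongo.MongoClient()["open-data-db"]


def update_updated(dataset_name: str, value: int) -> None:
    """Updates the days-since-last-update counter for one dataset."""
    my_col = connection[MONGODB_DATASET_COLLECTION]
    myquery = {'key-name': dataset_name}
    new_values = {"$set": {"updated": value}}
    my_col.update_one(myquery, new_values)


def update_ignore_set_links(dataset_name: str, link: str) -> None:
    """Remembers a newly crawled link; insert_one() replaces the deprecated insert()."""
    my_col = connection[dataset_name + MONGODB_DATASET_LINK_COLLECTION]
    my_col.insert_one({"name": link})
```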
modules/crawler/Utilities/configure_functions.py | ||
---|---|---|
1 | 1 |
import yaml |
2 | 2 |
import os |
3 |
from typing import Dict, Set |
|
4 |
from shared_types import StringSetType |
|
3 | 5 |
from Utilities.Database import database_record_logs |
4 | 6 |
from Utilities.helpers import should_skip |
5 | 7 |
|
... | ... | |
9 | 11 |
CONFIG_FILE_TYPE = ".yaml" |
10 | 12 |
|
11 | 13 |
|
12 |
def load_configuration(dataset_name):
|
|
14 |
def load_configuration(dataset_name: str) -> Dict[str, any]:
|
|
13 | 15 |
""" |
14 | 16 |
Loads yaml configuration file into memory |
15 | 17 |
|
... | ... | |
22 | 24 |
with open(CONFIG_FILES_PATH + dataset_name + CONFIG_FILE_TYPE, "r") as f: |
23 | 25 |
data = yaml.load(f, Loader=yaml.FullLoader) |
24 | 26 |
|
25 |
devices_dic = dict()
|
|
27 |
devices_dic = {}
|
|
26 | 28 |
|
27 | 29 |
if data["devices"] is not None: |
28 | 30 |
for item in data["devices"]: |
... | ... | |
33 | 35 |
return data |
34 | 36 |
|
35 | 37 |
|
36 |
def update_configuration(dataset_name, new_devices): |
|
38 |
def update_configuration(dataset_name: str, |
|
39 |
new_devices: StringSetType) -> None: |
|
37 | 40 |
""" |
38 | 41 |
Open dataset and appends new_devices to the end |
39 | 42 |
|
... | ... | |
53 | 56 |
file.write("\n") |
54 | 57 |
|
55 | 58 |
|
56 |
def check_if_there_is_a_config_file(dataset_name):
|
|
59 |
def check_if_there_is_a_config_file(dataset_name: str) -> bool:
|
|
57 | 60 |
""" |
58 | 61 |
Goes trough all config files (represeting valid dataset in database) |
59 | 62 |
and checks if dataset_name is there |
... | ... | |
75 | 78 |
return False |
76 | 79 |
|
77 | 80 |
|
78 |
def return_dictionary_of_valid_devices(devices): |
|
81 |
def return_dictionary_of_valid_devices( |
|
82 |
devices: Dict[str, any]) -> Dict[str, Dict[str, any]]: |
|
79 | 83 |
""" |
80 | 84 |
Iterates over all devices specified in config file |
81 | 85 |
|
... | ... | |
87 | 91 |
Returns: |
88 | 92 |
Dictonary containing only valid devices |
89 | 93 |
""" |
90 |
valid_devices = dict()
|
|
94 |
valid_devices = {}
|
|
91 | 95 |
|
92 | 96 |
for device in devices.keys(): |
93 | 97 |
if not should_skip(devices[device]): |
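configure_functions.py types load_configuration() and swaps dict() for {}. The loop over data["devices"] is elided in this diff, so the normalisation into devices_dic below is an assumption; the rest mirrors the visible lines:

```python
from typing import Any, Dict
import yaml

CONFIG_FILES_PATH = "DatasetConfigs/"
CONFIG_FILE_TYPE = ".yaml"


def load_configuration(dataset_name: str) -> Dict[str, Any]:
    """Loads the dataset's yaml configuration file into memory."""
    with open(CONFIG_FILES_PATH + dataset_name + CONFIG_FILE_TYPE, "r") as f:
        data = yaml.load(f, Loader=yaml.FullLoader)

    devices_dic = {}
    if data["devices"] is not None:
        for item in data["devices"]:
            devices_dic.update(item)   # assumed: each list item is a one-key mapping
    data["devices"] = devices_dic      # assumed normalisation; loop body elided in diff

    return data
```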
modules/crawler/Utilities/date_formating.py | ||
---|---|---|
1 |
def date_formatter(string_date):
|
|
1 |
def date_formatter(string_date: str) -> str:
|
|
2 | 2 |
""" |
3 | 3 |
|
4 | 4 |
Args: |
... | ... | |
21 | 21 |
|
22 | 22 |
string_date = srr |
23 | 23 |
|
24 |
return_date = string_date[6:10] + '-' + string_date[3:5] + '-' + string_date[:2] |
|
24 |
return_date = string_date[6:10] + '-' + string_date[ |
|
25 |
3:5] + '-' + string_date[:2] |
|
25 | 26 |
|
26 | 27 |
return return_date |
27 | 28 |
|
28 | 29 |
|
29 |
def date_time_formatter(string_date):
|
|
30 |
def date_time_formatter(string_date: str) -> str:
|
|
30 | 31 |
""" |
31 | 32 |
Converts one type of date format "dd.mm.yyyy hh.mm.ss" to date format YYYY-mm-dd-hh |
32 | 33 |
Args: |
... | ... | |
49 | 50 |
|
50 | 51 |
string_date = srr |
51 | 52 |
|
52 |
return_date = string_date[6:10] + '-' + string_date[3:5] + '-' + string_date[:2] + '-' + string_date[11:13] |
|
53 |
return_date = string_date[6:10] + '-' + string_date[ |
|
54 |
3:5] + '-' + string_date[:2] + '-' + string_date[11:13] |
|
53 | 55 |
|
54 | 56 |
return return_date |
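date_formating.py only rewraps the long slicing expressions. What the date_time_formatter() slices actually do, shown on a literal (the padding branch via srr that precedes them is skipped here):

```python
string_date = "08.04.2018 15.30.00"          # input format dd.mm.yyyy hh.mm.ss
return_date = string_date[6:10] + '-' + string_date[
    3:5] + '-' + string_date[:2] + '-' + string_date[11:13]
assert return_date == "2018-04-08-15"        # matches the YYYY-mm-dd-hh key format
```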
modules/crawler/Utilities/folder_processor.py | ||
---|---|---|
1 | 1 |
import os |
2 | 2 |
import zipfile |
3 |
from shared_types import ConfigType, StringSetType |
|
3 | 4 |
from Utilities.CSV import csv_utils |
4 | 5 |
from Utilities.Database import database_record_logs |
5 | 6 |
|
6 | 7 |
|
7 |
def list_of_all_new_files(ignore_set,path): |
|
8 |
def list_of_all_new_files(ignore_set: StringSetType, |
|
9 |
path: str) -> StringSetType: |
|
8 | 10 |
""" |
9 | 11 |
Get all files from directory and all files written in ignore.txt |
10 | 12 |
and return the difference |
... | ... | |
16 | 18 |
""" |
17 | 19 |
files_in_dir = os.listdir(path) |
18 | 20 |
|
19 |
|
|
20 | 21 |
return set(files_in_dir).difference(ignore_set) |
21 | 22 |
|
22 | 23 |
|
23 |
|
|
24 |
def get_devices_set(dataset_name,path): |
|
24 |
def get_devices_set(dataset_name: str, path: str) -> StringSetType: |
|
25 | 25 |
""" |
26 | 26 |
Goes trough every not loaded file(not contained in ProcessedData/ignore.txt) |
27 | 27 |
Extracts names from not loaded file which should be in first column |
... | ... | |
34 | 34 |
set of unique names contained in not loaded files |
35 | 35 |
""" |
36 | 36 |
ignore_set = database_record_logs.load_ignore_set_loaded(dataset_name) |
37 |
files_in_dir = list_of_all_new_files(ignore_set,path) |
|
37 |
files_in_dir = list_of_all_new_files(ignore_set, path)
|
|
38 | 38 |
|
39 | 39 |
unique_names = set() |
40 | 40 |
|
41 | 41 |
for file_path in files_in_dir: |
42 |
unique_names.update(csv_utils.get_unique_names_from_file(path+file_path, 0)) |
|
42 |
unique_names.update( |
|
43 |
csv_utils.get_unique_names_from_file(path + file_path, 0)) |
|
43 | 44 |
|
44 | 45 |
return unique_names |
45 | 46 |
|
46 | 47 |
|
47 |
def get_unknown_devices_set(config, devices): |
|
48 |
def get_unknown_devices_set(config: ConfigType, |
|
49 |
devices: StringSetType) -> StringSetType: |
|
48 | 50 |
""" |
49 | 51 |
Compares config and devices a return difference |
50 | 52 |
|
... | ... | |
61 | 63 |
return unknown_devices_set |
62 | 64 |
|
63 | 65 |
|
64 |
def unzip_all_csv_zip_files_in_folder(path):
|
|
66 |
def unzip_all_csv_zip_files_in_folder(path: str) -> None:
|
|
65 | 67 |
""" |
66 | 68 |
Load all files from directory and unzip those which end by .zip |
67 | 69 |
After unziping deletes the zip file |
... | ... | |
83 | 85 |
os.remove(zip_file) |
84 | 86 |
|
85 | 87 |
|
86 |
def clean_folder(path):
|
|
88 |
def clean_folder(path: str) -> None:
|
|
87 | 89 |
""" |
88 | 90 |
Deletes all files in folder |
89 | 91 |
|
... | ... | |
93 | 95 |
files = os.listdir(path) |
94 | 96 |
|
95 | 97 |
for file in files: |
96 |
os.remove(path+file) |
|
98 |
os.remove(path + file) |
modules/crawler/Utilities/helpers.py | ||
---|---|---|
4 | 4 |
UNKNOWN = "UNKNOWN!" |
5 | 5 |
|
6 | 6 |
|
7 |
def should_skip(device) -> bool: |
|
7 |
def should_skip(device: Dict[str, str]) -> bool:
|
|
8 | 8 |
return device['x'] == SKIP or device['y'] == SKIP or device[ |
9 | 9 |
'x'] == UNKNOWN or device['y'] == UNKNOWN |
10 | 10 |
|
11 | 11 |
|
12 |
def detect_change(first: Dict[str, str], second: Dict[str, str], compareKeys: [str]) -> bool: |
|
12 |
def detect_change(first: Dict[str, str], second: Dict[str, str], |
|
13 |
compareKeys: [str]) -> bool: |
|
13 | 14 |
"""Detects change between two dictonaries |
14 | 15 |
|
15 | 16 |
Args: |
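helpers.py gets a typed should_skip() and a wrapped detect_change() signature. should_skip() is fully visible in the diff; detect_change()'s body is not, so the comparison below is an assumption, and the SKIP sentinel is likewise assumed (only UNKNOWN appears in this hunk). Note also that compareKeys: [str] is a literal list annotation rather than typing.List[str]:

```python
from typing import Dict, List

SKIP = "SKIP"          # assumed sentinel; only UNKNOWN is visible in this hunk
UNKNOWN = "UNKNOWN!"


def should_skip(device: Dict[str, str]) -> bool:
    # skip devices without usable coordinates
    return device['x'] == SKIP or device['y'] == SKIP or device[
        'x'] == UNKNOWN or device['y'] == UNKNOWN


def detect_change(first: Dict[str, str], second: Dict[str, str],
                  compareKeys: List[str]) -> bool:
    """Detects a change between two dictionaries on the given keys (assumed body)."""
    return any(first.get(key) != second.get(key) for key in compareKeys)
```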
modules/crawler/crone_update_script.py | ||
---|---|---|
5 | 5 |
CONFIG_FILES_PATH = "DatasetConfigs/" |
6 | 6 |
|
7 | 7 |
|
8 |
def run_pipeline_for_all_datasets(): |
|
8 |
def run_pipeline_for_all_datasets() -> None:
|
|
9 | 9 |
""" |
10 | 10 |
Runs whole DataScript pipeline for every dataset that has existing configuration file |
11 | 11 |
""" |
... | ... | |
16 | 16 |
pipeline.run_full_pipeline_crone(name) |
17 | 17 |
|
18 | 18 |
|
19 |
run_pipeline_for_all_datasets() |
|
19 |
def main() -> None: |
|
20 |
run_pipeline_for_all_datasets() |
|
21 |
|
|
22 |
|
|
23 |
if __name__ == "__main__": |
|
24 |
main() |
modules/crawler/docker_prepare_structure.py | ||
---|---|---|
10 | 10 |
CRAWLER_LOGS_PATH = "CrawlerLogs/" |
11 | 11 |
|
12 | 12 |
|
13 |
def prepare_strucure_for_all_datasets(): |
|
13 |
def prepare_strucure_for_all_datasets() -> None:
|
|
14 | 14 |
""" |
15 | 15 |
Prepares folders that are necessery but does not contain code so they are excluded from gitlab by gitignore |
16 | 16 |
""" |
17 | 17 |
|
18 |
if not os.path.isdir(CRAWLED_DATA_PATH) :
|
|
18 |
if not os.path.isdir(CRAWLED_DATA_PATH):
|
|
19 | 19 |
try: |
20 | 20 |
os.mkdir(CRAWLED_DATA_PATH) |
21 | 21 |
except os.error as e: |
22 | 22 |
print(e) |
23 |
print("Creation of the directory %s failed" % CRAWLED_DATA_PATH)
|
|
23 |
print("Nelze vytvořit adresář %s" % CRAWLED_DATA_PATH)
|
|
24 | 24 |
|
25 |
if not os.path.isdir(PROCESSED_DATA_PATH) :
|
|
25 |
if not os.path.isdir(PROCESSED_DATA_PATH):
|
|
26 | 26 |
try: |
27 | 27 |
os.mkdir(PROCESSED_DATA_PATH) |
28 | 28 |
except os.error as e: |
29 | 29 |
print(e) |
30 |
print("Creation of the directory %s failed" % PROCESSED_DATA_PATH)
|
|
31 |
|
|
32 |
if not os.path.isdir(CRAWLER_LOGS_PATH) :
|
|
30 |
print("Nelze vytvořit adresář %s" % PROCESSED_DATA_PATH)
|
|
31 |
|
|
32 |
if not os.path.isdir(CRAWLER_LOGS_PATH):
|
|
33 | 33 |
try: |
34 | 34 |
os.mkdir(CRAWLER_LOGS_PATH) |
35 | 35 |
except os.error as e: |
36 | 36 |
print(e) |
37 |
print("Creation of the directory %s failed" % PROCESSED_DATA_PATH) |
|
38 |
|
|
37 |
print("Nelze vytvořit adresář %s" % CRAWLER_LOGS_PATH) |
|
39 | 38 |
|
40 | 39 |
files_in_dir = os.listdir(CONFIG_FILES_PATH) |
41 | 40 |
|
... | ... | |
44 | 43 |
prepare_structure(name[0]) |
45 | 44 |
|
46 | 45 |
|
47 |
def prepare_structure(dataset_name):
|
|
46 |
def prepare_structure(dataset_name: str) -> None:
|
|
48 | 47 |
""" |
49 | 48 |
Create folder for every dataset in newly created folder for processed and crawled data |
50 | 49 |
""" |
51 | 50 |
|
52 |
path = CRAWLED_DATA_PATH + dataset_name
|
|
53 |
if not os.path.isdir(path) :
|
|
51 |
path = CRAWLED_DATA_PATH + dataset_name |
|
52 |
if not os.path.isdir(path):
|
|
54 | 53 |
os.mkdir(path) |
55 | 54 |
|
56 |
path = PROCESSED_DATA_PATH + dataset_name
|
|
57 |
if not os.path.isdir(path):
|
|
55 |
path = PROCESSED_DATA_PATH + dataset_name |
|
56 |
if not os.path.isdir(path): |
|
58 | 57 |
os.mkdir(PROCESSED_DATA_PATH + dataset_name) |
59 | 58 |
|
60 | 59 |
|
61 |
print("Inicializuji počáteční strukturu pro stažená a zpracovaná data") |
|
62 |
prepare_strucure_for_all_datasets() |
|
60 |
def main() -> None: |
|
61 |
print("Inicializuji počáteční strukturu pro stažená a zpracovaná data") |
|
62 |
prepare_strucure_for_all_datasets() |
|
63 |
|
|
64 |
|
|
65 |
if __name__ == "__main__": |
|
66 |
main() |
modules/crawler/force_update_datasets.py | ||
---|---|---|
1 |
from Utilities import configure_functions |
|
1 | 2 |
import pipeline |
2 | 3 |
import os |
3 |
from Utilities import configure_functions
|
|
4 |
import sys
|
|
4 | 5 |
|
5 | 6 |
# Path to configuration files |
6 | 7 |
CONFIG_FILES_PATH = "DatasetConfigs/" |
8 |
WRONG_ARG_MSG = "Do argumentu funkce dejte jméno Datasetu, který chcete aktualizovat (pokud všechny zadejte 'ALL'):\n" |
|
9 |
DATASET_NOT_FOUND_MSG = "Tento dataset v architektuře neexistuje" |
|
7 | 10 |
|
8 | 11 |
|
9 |
def run_pipeline_for_one_datasets(dataset_name):
|
|
12 |
def run_pipeline_for_one_datasets(dataset_name: str) -> None:
|
|
10 | 13 |
print("Probíhá update datasetu " + dataset_name) |
11 | 14 |
pipeline.run_full_pipeline(dataset_name) |
12 | 15 |
|
13 | 16 |
|
14 |
def run_pipeline_for_all_datasets(): |
|
17 |
def run_pipeline_for_all_datasets() -> None:
|
|
15 | 18 |
""" |
16 | 19 |
Runs whole DataScript pipeline for every dataset that has existing configuration file |
17 | 20 |
""" |
... | ... | |
23 | 26 |
pipeline.run_full_pipeline(name) |
24 | 27 |
|
25 | 28 |
|
26 |
print("Zadejte jméno Datasetu který chcete updatovat (pokud všechny zadejte '-ALL'):\n") |
|
29 |
def main() -> None: |
|
30 |
if len(sys.argv) > 1: |
|
31 |
dataset_name = sys.argv[1].upper() |
|
32 |
if dataset_name == "ALL": |
|
33 |
run_pipeline_for_all_datasets() |
|
34 |
else: |
|
35 |
test = configure_functions.check_if_there_is_a_config_file( |
|
36 |
dataset_name) |
|
37 |
if test == True: |
|
38 |
run_pipeline_for_one_datasets(dataset_name) |
|
39 |
else: |
|
40 |
print(DATASET_NOT_FOUND_MSG) |
|
41 |
else: |
|
42 |
print(WRONG_ARG_MSG) |
|
27 | 43 |
|
28 |
dataset_name = input().upper() |
|
29 | 44 |
|
30 |
if dataset_name == '-ALL': |
|
31 |
run_pipeline_for_all_datasets() |
|
32 |
else: |
|
33 |
test = configure_functions.check_if_there_is_a_config_file(dataset_name) |
|
34 |
if test == True: |
|
35 |
run_pipeline_for_one_datasets(dataset_name) |
|
36 |
else: |
|
37 |
print("Tento dataset v architektuře neexistuje") |
|
45 |
if __name__ == "__main__": |
|
46 |
main() |
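force_update_datasets.py switches from an interactive input() prompt to a command-line argument and gains a proper entry-point guard. A condensed, self-contained version of the new flow; the body of run_pipeline_for_all_datasets() is elided in the diff, so its loop here is an assumption, and `if test == True:` is simplified to a plain truth test.

```python
import os
import sys

import pipeline
from Utilities import configure_functions

CONFIG_FILES_PATH = "DatasetConfigs/"
WRONG_ARG_MSG = ("Do argumentu funkce dejte jméno Datasetu, který chcete aktualizovat "
                 "(pokud všechny zadejte 'ALL'):\n")
DATASET_NOT_FOUND_MSG = "Tento dataset v architektuře neexistuje"


def run_pipeline_for_one_datasets(dataset_name: str) -> None:
    print("Probíhá update datasetu " + dataset_name)
    pipeline.run_full_pipeline(dataset_name)


def run_pipeline_for_all_datasets() -> None:
    # loop body is elided in the diff; assumed to walk DatasetConfigs/ as before
    for file in os.listdir(CONFIG_FILES_PATH):
        pipeline.run_full_pipeline(file.split('.')[0])


def main() -> None:
    if len(sys.argv) > 1:
        dataset_name = sys.argv[1].upper()
        if dataset_name == "ALL":
            run_pipeline_for_all_datasets()
        elif configure_functions.check_if_there_is_a_config_file(dataset_name):
            run_pipeline_for_one_datasets(dataset_name)
        else:
            print(DATASET_NOT_FOUND_MSG)
    else:
        print(WRONG_ARG_MSG)


if __name__ == "__main__":
    main()   # e.g. `python force_update_datasets.py JIS` or `python force_update_datasets.py ALL`
```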
modules/crawler/fully_clean_database.py | ||
---|---|---|
1 | 1 |
from Utilities.Database import database_loader |
2 | 2 |
|
3 | 3 |
|
4 |
#TODO: smazat vsechny pomocny soubory po cisteni databaze + prejmenovat |
|
5 |
def clean_database(): |
|
4 |
def clean_database() -> None: |
|
6 | 5 |
""" |
7 | 6 |
Drops every collection in database |
8 | 7 |
""" |
... | ... | |
18 | 17 |
mydb[name].drop() |
19 | 18 |
|
20 | 19 |
|
21 |
print('Data z databáze budou smazána:') |
|
22 |
clean_database() |
|
20 |
def main() -> None: |
|
21 |
print('Data z databáze budou smazána:') |
|
22 |
clean_database() |
|
23 |
|
|
24 |
|
|
25 |
if __name__ == "__main__": |
|
26 |
main() |
modules/crawler/pipeline.py | ||
---|---|---|
1 | 1 |
from Utilities import folder_processor, configure_functions |
2 | 2 |
from Utilities.Database import database_loader, database_record_logs |
3 | 3 |
from Utilities.CSV import csv_utils |
4 |
from shared_types import ConfigType |
|
4 | 5 |
import os |
5 | 6 |
import pymongo |
6 | 7 |
|
... | ... | |
20 | 21 |
|
21 | 22 |
#logger |
22 | 23 |
logging.basicConfig(filename=CRAWLER_LOGS_PATH + 'Applicationlog-' + |
23 |
date.today().strftime("%b-%Y") + '.log',
|
|
24 |
level=logging.INFO,
|
|
25 |
format='%(asctime)s %(message)s')
|
|
24 |
date.today().strftime("%b-%Y") + '.log', |
|
25 |
level=logging.INFO, |
|
26 |
format='%(asctime)s %(message)s') |
|
26 | 27 |
|
27 | 28 |
|
28 |
def check_last_update(config):
|
|
29 |
def check_last_update(config: ConfigType) -> bool:
|
|
29 | 30 |
""" |
30 | 31 |
Loads integer from updated.txt in CrawlerLogs/"dataset_name" |
31 | 32 |
representing number of days from last update if number equals |
... | ... | |
56 | 57 |
return False |
57 | 58 |
|
58 | 59 |
|
59 |
def crawl_data(config):
|
|
60 |
def crawl_data(config: ConfigType) -> None:
|
|
60 | 61 |
""" |
61 | 62 |
Imports dataset crawler in DatasetCrawler/"dataset_name"_crawler.py |
62 | 63 |
runs crawler. |
... | ... | |
73 | 74 |
dataset_name += '/' |
74 | 75 |
|
75 | 76 |
|
76 |
def process_data(config):
|
|
77 |
def process_data(config: ConfigType) -> None:
|
|
77 | 78 |
""" |
78 | 79 |
Goes trough every not processed file(list of processed files is saved in databse) |
79 | 80 |
Imports dataset processor in DatasetProcessing/"dataset_name"_processor.py |
... | ... | |
100 | 101 |
path = CRAWLED_DATA_PATH + dataset_path + not_processed_file |
101 | 102 |
date_dic = process_file_func(path) |
102 | 103 |
csv_utils.export_data_to_csv(path, date_dic) |
104 |
print("Vytvářím: " + not_processed_file) |
|
103 | 105 |
database_record_logs.update_ignore_set_processed( |
104 | 106 |
dataset_name, not_processed_file) |
105 | 107 |
|
... | ... | |
107 | 109 |
str(len(not_processed_files)) + " newly crawled files") |
108 | 110 |
|
109 | 111 |
|
110 |
def process_data_crone(config):
|
|
112 |
def process_data_crone(config: ConfigType) -> None:
|
|
111 | 113 |
""" |
112 | 114 |
Goes trough every not processed file(list of processed files is saved in database) |
113 | 115 |
Imports dataset processor in DatasetProcessing/"dataset_name"_processor.py |
... | ... | |
146 | 148 |
str(len(not_processed_files)) + " newly crawled files") |
147 | 149 |
|
148 | 150 |
|
149 |
def validate_process_data(config):
|
|
151 |
def validate_process_data(config: ConfigType) -> bool:
|
|
150 | 152 |
""" |
151 | 153 |
Function goes through newly processed data and checks theirs status |
152 | 154 |
|
... | ... | |
186 | 188 |
return True |
187 | 189 |
|
188 | 190 |
|
189 |
def load_data_to_database(config):
|
|
191 |
def load_data_to_database(config: ConfigType) -> None:
|
|
190 | 192 |
""" |
191 | 193 |
Goes trough every not loaded file(list of loaded files is saved in database) |
192 | 194 |
loads data appends coordination from configurations |
... | ... | |
207 | 209 |
changes_in_devices = database_loader.update_devices_collection(config) |
208 | 210 |
|
209 | 211 |
if changes_in_devices == True: |
210 |
logging.info(
|
|
211 |
dataset_name +
|
|
212 |
" contains changes in devices configuration. Deleting old data and preparing new"
|
|
213 |
) |
|
212 |
logg_string = dataset_name + " contains changes in devices configuration. Deleting old data and preparing new"
|
|
213 |
logg_string_cs = dataset_name + " obsahuje změny v konfiguračním souboru. Probíha odstraňování starých dat a připravení nových."
|
|
214 |
logging.info(logg_string)
|
|
215 |
print(logg_string_cs)
|
|
214 | 216 |
database_loader.reset_dataset_database(dataset_name) |
215 | 217 |
|
216 | 218 |
# get all unprocessed files from dataset |
... | ... | |
230 | 232 |
database_record_logs.update_ignore_set_loaded(dataset_name, |
231 | 233 |
not_loaded_file) |
232 | 234 |
|
233 |
logging.info(dataset_name + " has loaded to database " + |
|
234 |
str(len(not_loaded_files)) + " newly processed files.") |
|
235 |
logg_string = dataset_name + " has loaded to database " + str( |
|
236 |
len(not_loaded_files)) + " newly processed files." |
|
237 |
logg_string_cs = dataset_name + " načetl " + str( |
|
238 |
len(not_loaded_files)) + " nových zpracovaných souborů \n" |
|
239 |
|
|
240 |
logging.info(logg_string) |
|
241 |
print(logg_string_cs) |
|
235 | 242 |
|
236 | 243 |
client = pymongo.MongoClient() |
237 | 244 |
client.close() |
238 | 245 |
|
239 | 246 |
|
240 |
def load_data_to_database_crone(config):
|
|
247 |
def load_data_to_database_crone(config: ConfigType) -> None:
|
|
241 | 248 |
""" |
242 | 249 |
Goes trough every not loaded file(list of loaded files is saved in database) |
243 | 250 |
loads data appends coordination from configurations |
... | ... | |
280 | 287 |
client.close() |
281 | 288 |
|
282 | 289 |
|
283 |
def run_full_pipeline(dataset_name):
|
|
290 |
def run_full_pipeline(dataset_name: str) -> None:
|
|
284 | 291 |
""" |
285 | 292 |
Loads config file and starts full pipeline |
286 | 293 |
-crawl data |
... | ... | |
292 | 299 |
""" |
293 | 300 |
logging.info("Starting pipeline for dataset " + dataset_name) |
294 | 301 |
print("Zpracovávám dataset " + dataset_name + |
295 |
" průběh lze sledovat v logu umístěném v in CrawlerLogs folder")
|
|
302 |
", průběh lze sledovat v logu umístěném v adresáři CrawlerLogs")
|
|
296 | 303 |
|
297 | 304 |
config = configure_functions.load_configuration(dataset_name) |
298 | 305 |
crawl_data(config) |
... | ... | |
304 | 311 |
load_data_to_database(config) |
305 | 312 |
|
306 | 313 |
|
307 |
def run_full_pipeline_crone(dataset_name):
|
|
314 |
def run_full_pipeline_crone(dataset_name: str) -> None:
|
|
308 | 315 |
""" |
309 | 316 |
Loads config file and starts full pipeline |
310 | 317 |
-crawl data |
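pipeline.py's load step now logs in English while printing Czech progress messages for whoever runs the pipeline interactively. A minimal illustration of that split; the helper name report_loaded is only for this sketch, in the revision the two strings are built inline in load_data_to_database():

```python
import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')


def report_loaded(dataset_name: str, count: int) -> None:
    # English message goes to the log file, Czech counterpart to stdout
    logg_string = dataset_name + " has loaded to database " + str(
        count) + " newly processed files."
    logg_string_cs = dataset_name + " načetl " + str(
        count) + " nových zpracovaných souborů \n"

    logging.info(logg_string)
    print(logg_string_cs)
```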
modules/crawler/prepare_new_dataset.py | ||
---|---|---|
1 | 1 |
import os |
2 |
|
|
3 | 2 |
# Path to crawled data |
4 | 3 |
CRAWLED_DATA_PATH = "CrawledData/" |
5 | 4 |
# Path to processed data |
... | ... | |
14 | 13 |
DEFAULT_COLOR = "#000000" |
15 | 14 |
|
16 | 15 |
|
17 |
def create_default_config_file(dataset_name: str): |
|
16 |
def create_default_config_file(dataset_name: str) -> None:
|
|
18 | 17 |
""" |
19 | 18 |
Creates default config file |
20 | 19 |
|
... | ... | |
22 | 21 |
dataset_name: Name of newly created dataset |
23 | 22 |
""" |
24 | 23 |
with open(CONFIG_FILES_PATH + "/" + dataset_name + ".yaml", "w") as file: |
25 |
file.write("# jmeno datasetu, pod kterym bude zobrazen v aplikaci\n")
|
|
24 |
file.write("# Name of the dataset inside the application\n")
|
|
26 | 25 |
file.write("display-name: " + dataset_name + "\n") |
27 | 26 |
file.write( |
28 |
"# jednoslovný název datasetu, pod kterym bude reprezentovana v architektuře\n")
|
|
29 |
file.write("display-color: " + DEFAULT_COLOR + "\n")
|
|
27 |
"# Color for the dataset in a hex value (default value #000000)\n")
|
|
28 |
file.write(f'display-color: \'{DEFAULT_COLOR}\' \n')
|
|
30 | 29 |
file.write( |
31 |
"# barva pro tento dataset v hexadecimální hodnotě (#000000)\n") |
|
30 |
"# One word dataset name (structure of all modules will be affected by this)\n" |
|
31 |
) |
|
32 | 32 |
file.write("dataset-name: " + dataset_name + "\n") |
33 |
file.write("# root slozka, ktera obsahuje odkazy na dataset\n") |
|
34 |
file.write("url: ZDE VLOZTE URL\n") |
|
33 |
file.write("# Url for the source of this dataset\n") |
|
34 |
file.write("url: ENTER URL HERE\n") |
|
35 |
file.write( |
|
36 |
"# Optional parameter which specifies a pattern of the datasets name\n" |
|
37 |
) |
|
38 |
file.write( |
|
39 |
"# Example: DATASET_NAME_[0-9][0-9]_[0-9][0-9][0-9][0-9].zip\n") |
|
40 |
file.write( |
|
41 |
"# - DATASET_NAME_01_2020.zip where '01_2020' specifies date in this dataset\n" |
|
42 |
) |
|
43 |
file.write("regex: ENTER REGEX HERE\n") |
|
35 | 44 |
file.write( |
36 |
"# volitelny parameter, ktery specifikuje vzor jmrna datasetu, ktera se budou stahovat\n") |
|
37 |
file.write("regex: ZDE VLOZTE REGEX\n") |
|
38 |
file.write("# volitelny parametr, ktery udava jak casto se budou hledat nove datasety, pokud prazdne, " |
|
39 |
"tak defaultni hodnota (dny)\n") |
|
40 |
file.write("update-period: ZDE VLOZTE HODNOTU\n") |
|
41 |
file.write("# pozice jednotlivych zarizeni, ktera jsou v datasetu\n") |
|
45 |
"# Optional parameter which specifies the way of searching new datasets (if empty the period is set to every day)\n" |
|
46 |
) |
|
47 |
file.write("update-period: ENTER UPDATE PERIOD HERE\n") |
|
48 |
file.write("# Coordinates of every datasets device (entinty)\n") |
|
42 | 49 |
file.write("devices:\n") |
43 | 50 |
|
44 | 51 |
|
45 |
def create_default_processor(dataset_name):
|
|
52 |
def create_default_processor(dataset_name: str) -> None:
|
|
46 | 53 |
""" |
47 | 54 |
Creates default processor for dataset |
48 | 55 |
|
49 | 56 |
Args: |
50 | 57 |
dataset_name: Name of newly created dataset |
51 | 58 |
""" |
52 |
with open(PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "_processor.py", "w") as file: |
|
53 |
file.write("from Utilities.CSV import csv_data_line") |
|
59 |
with open(PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "_processor.py", |
|
60 |
"w") as file: |
|
61 |
file.write("from Utilities.CSV import csv_data_line\n") |
|
62 |
file.write("from shared_types import DateDict") |
|
54 | 63 |
file.write("\n") |
55 | 64 |
file.write("\n") |
56 |
file.write("def process_file(filename):\n")
|
|
65 |
file.write("def process_file(filename: str) -> DateDict:\n")
|
|
57 | 66 |
file.write(" \"\"\"\n") |
58 | 67 |
file.write( |
59 |
" Method that take path to crawled file and outputs date dictionary:\n") |
|
68 |
" Method that takes the path to crawled file and outputs date dictionary:\n" |
|
69 |
) |
|
60 | 70 |
file.write( |
61 |
" Date dictionary is a dictionary where keys are dates in format YYYY-mm-dd-hh (2018-04-08-15)\n") |
|
71 |
" Date dictionary is a dictionary where keys are dates in format YYYY-mm-dd-hh (2018-04-08-15)\n" |
|
72 |
) |
|
62 | 73 |
file.write( |
63 |
" and value is dictionary where keys are devices (specified in configuration file)\n") |
|
74 |
" and value is dictionary where keys are devices (specified in configuration file)\n" |
|
75 |
) |
|
64 | 76 |
file.write( |
65 |
" and value is CSVDataLine.csv_data_line with device,date and occurrence\n") |
|
77 |
" and value is CSVDataLine.csv_data_line with device,date and occurrence\n" |
|
78 |
) |
|
66 | 79 |
file.write("\n") |
67 | 80 |
file.write(" Args:\n") |
68 |
file.write(" filename: name of processed file\n") |
|
81 |
file.write(" filename: name of the processed file\n")
|
|
69 | 82 |
file.write("\n") |
70 | 83 |
file.write(" Returns:\n") |
71 | 84 |
file.write(" None if not implemented\n") |
72 | 85 |
file.write(" date_dict when implemented\n") |
73 | 86 |
file.write(" \"\"\"\n") |
74 |
file.write(" date_dict = dict()\n")
|
|
87 |
file.write(" date_dict: DateDict = {}\n")
|
|
75 | 88 |
file.write("\n") |
76 | 89 |
file.write(" #with open(filename, \"r\") as file:\n") |
77 | 90 |
file.write( |
78 |
" print(\"You must implements process_file method first!\")\n") |
|
79 |
file.write(" return None\n") |
|
91 |
" print(\"You must implement the process_file method first!\")\n" |
|
92 |
) |
|
93 |
file.write(" return date_dict\n") |
|
80 | 94 |
|
81 | 95 |
|
82 |
def create_default_crawler(dataset_name):
|
|
96 |
def create_default_crawler(dataset_name: str) -> None:
|
|
83 | 97 |
""" |
84 | 98 |
Creates default crawler for dataset |
85 | 99 |
|
... | ... | |
87 | 101 |
dataset_name: Name of newly created dataset |
88 | 102 |
""" |
89 | 103 |
|
90 |
with open(CRAWLER_PROGRAM_PATH + "/" + dataset_name + "_crawler.py", "w") as file: |
|
104 |
with open(CRAWLER_PROGRAM_PATH + "/" + dataset_name + "_crawler.py", |
|
105 |
"w") as file: |
|
106 |
file.write("from shared_types import ConfigType\n") |
|
91 | 107 |
file.write("# Path to crawled data\n") |
92 |
file.write("CRAWLED_DATA_PATH = \"CrawledData/\"\n")
|
|
108 |
file.write(f'CRAWLED_DATA_PATH = "{CRAWLED_DATA_PATH}" \n')
|
|
93 | 109 |
file.write("\n") |
94 | 110 |
file.write("\n") |
95 |
file.write("def crawl(config):\n") |
|
111 |
file.write("def crawl(config: ConfigType):\n")
|
|
96 | 112 |
file.write(" \"\"\"\n") |
97 | 113 |
file.write( |
98 |
" Implement crawl method that downloads new data to path_for_files\n") |
|
114 |
" Implementation the crawl method which downloads new data to the path_for_files\n" |
|
115 |
) |
|
99 | 116 |
file.write(" For keeping the project structure\n") |
100 | 117 |
file.write(" url , regex, and dataset_name from config\n") |
101 | 118 |
file.write( |
102 |
" You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py\n") |
|
119 |
" You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py\n" |
|
120 |
) |
|
103 | 121 |
file.write("\n") |
104 | 122 |
file.write(" Args:\n") |
105 | 123 |
file.write(" config: loaded configuration file of dataset\n") |
... | ... | |
109 | 127 |
file.write(" regex = config['regex']\n") |
110 | 128 |
file.write( |
111 | 129 |
" path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'\n") |
112 |
file.write(" print(\"You must implements Crawl method first!\")\n") |
|
130 |
file.write( |
|
131 |
" print(\"Není implementován crawler pro získávání dat!\")\n") |
|
113 | 132 |
|
114 | 133 |
|
115 |
def prepare_dataset_structure(dataset_name):
|
|
134 |
def prepare_dataset_structure(dataset_name: str) -> None:
|
|
116 | 135 |
""" |
117 | 136 |
Prepares folders for new dataset |
118 | 137 |
Args: |
... | ... | |
120 | 139 |
""" |
121 | 140 |
|
122 | 141 |
# create folder for crawled data |
123 |
path = CRAWLED_DATA_PATH+dataset_name
|
|
142 |
path = CRAWLED_DATA_PATH + dataset_name
|
|
124 | 143 |
try: |
125 | 144 |
os.mkdir(path) |
126 | 145 |
except os.error as e: |
... | ... | |
132 | 151 |
try: |
133 | 152 |
os.mkdir(path) |
134 | 153 |
except OSError: |
135 |
print("Creation of the directory %s failed" % path)
|
|
154 |
print("Nelze vytvořit adresář %s" % path)
|
|
136 | 155 |
|
137 | 156 |
create_default_crawler(dataset_name) |
138 | 157 |
create_default_processor(dataset_name) |
139 | 158 |
create_default_config_file(dataset_name) |
140 | 159 |
|
141 | 160 |
|
142 |
print("Zadejte jméno nového datasetu:\n") |
|
161 |
def main() -> None: |
|
162 |
print("Zadejte jméno nového datasetu:\n") |
|
163 |
dataset_name = input().upper() |
|
164 |
|
|
165 |
if dataset_name.isalpha(): |
|
166 |
prepare_dataset_structure(dataset_name) |
|
167 |
print("Architektura vytvořena \n") |
|
168 |
else: |
|
169 |
print("Jméno musí obsahovat pouze písmena z abecedy (bez mezer)\n") |
|
143 | 170 |
|
144 |
dataset_name = input().upper() |
|
145 | 171 |
|
146 |
if dataset_name.isalpha(): |
|
147 |
prepare_dataset_structure(dataset_name) |
|
148 |
print("Architektura vytvořena \n") |
|
149 |
else: |
|
150 |
print("Jméno musí obsahovat pouze písmena z abecedy (bez mezer)\n") |
|
172 |
if __name__ == "__main__": |
|
173 |
main() |
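prepare_new_dataset.py now generates the dataset skeleton with English comments and quotes the default display-color. For orientation, the YAML that create_default_config_file() writes for a hypothetical run with input KOLOBEZKY would look roughly like the template below (embedded as a Python string to stay in one language; the 'entinty' typo from the generated comment is corrected here):

```python
GENERATED_CONFIG_TEMPLATE = """\
# Name of the dataset inside the application
display-name: KOLOBEZKY
# Color for the dataset in a hex value (default value #000000)
display-color: '#000000'
# One word dataset name (structure of all modules will be affected by this)
dataset-name: KOLOBEZKY
# Url for the source of this dataset
url: ENTER URL HERE
# Optional parameter which specifies a pattern of the datasets name
# Example: DATASET_NAME_[0-9][0-9]_[0-9][0-9][0-9][0-9].zip
# - DATASET_NAME_01_2020.zip where '01_2020' specifies date in this dataset
regex: ENTER REGEX HERE
# Optional parameter which specifies the way of searching new datasets (if empty the period is set to every day)
update-period: ENTER UPDATE PERIOD HERE
# Coordinates of every datasets device (entity)
devices:
"""
```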
modules/crawler/python.code-workspace | ||
---|---|---|
12 | 12 |
"python.linting.pylintEnabled": true, |
13 | 13 |
"python.linting.enabled": true, |
14 | 14 |
"python.linting.pylintPath": "pylint", |
15 |
"python.pythonPath": "/usr/local/bin/python", |
|
16 | 15 |
"python.formatting.provider": "yapf", |
17 | 16 |
}, |
18 | 17 |
"extensions": { |
Re #8193 - refactoring crawler