Revize af7609b5
Přidáno uživatelem Tomáš Ballák před více než 3 roky(ů)
modules/crawler/Utilities/Database/database_loader.py | ||
---|---|---|
1 | 1 |
from Utilities.Database import database_data_line, database_record_logs |
2 | 2 |
from Utilities import configure_functions |
3 | 3 |
from Utilities.helpers import should_skip, detect_change |
4 |
from shared_types import ConfigType |
|
5 |
from typing import Dict |
|
4 | 6 |
import pymongo |
5 | 7 |
import re |
6 | 8 |
|
... | ... | |
20 | 22 |
# Path to processed data |
21 | 23 |
PROCESSED_DATA_PATH = "ProcessedData/" |
22 | 24 |
|
25 |
DatabaseConnectionType = Dict[str, any] |
|
23 | 26 |
|
24 |
def create_database_connection(): |
|
27 |
|
|
28 |
def create_database_connection() -> pymongo.database.Database: |
|
25 | 29 |
""" |
26 | 30 |
Creates connection to mongoDB |
27 | 31 |
|
... | ... | |
38 | 42 |
return database |
39 | 43 |
|
40 | 44 |
|
41 |
def get_data_from_file(filename, config):
|
|
45 |
def get_data_from_file(filename: str, config: ConfigType) -> Dict[str, any]:
|
|
42 | 46 |
""" |
43 | 47 |
Opens processed file, reads it line by line |
44 | 48 |
name, occurrence, date |
... | ... | |
59 | 63 |
f = open(dataset_path + filename, "r") |
60 | 64 |
|
61 | 65 |
devices = config["devices"] |
62 |
date_dict = dict()
|
|
66 |
date_dict = {}
|
|
63 | 67 |
|
64 | 68 |
for line in f: |
65 | 69 |
line = line[:-1] |
... | ... | |
86 | 90 |
return date_dict |
87 | 91 |
|
88 | 92 |
|
89 |
def load_data_to_database(database_connection, dataset_name, data_dic, |
|
90 |
file_name): |
|
93 |
def load_data_to_database(database_connection: DatabaseConnectionType, |
|
94 |
dataset_name: str, data_dic: Dict[str, any], |
|
95 |
file_name: str) -> None: |
|
91 | 96 |
""" |
92 | 97 |
Takes data_dic created in method get_data_from_file |
93 | 98 |
and loads it into the database, where the collection name is dataset_name + data_dic key |
... | ... | |
107 | 112 |
date_dataset.insert_many(data_dic[date]) |
108 | 113 |
|
109 | 114 |
|
110 |
def check_or_update_datasets_collection(database_connection, config): |
|
115 |
def check_or_update_datasets_collection( |
|
116 |
database_connection: DatabaseConnectionType, config: ConfigType): |
|
111 | 117 |
""" |
112 | 118 |
Checks if DATASETS collection contains dataset and if display name was not updated |
113 | 119 |
|
... | ... | |
116 | 122 |
config: loaded configuration file of dataset |
117 | 123 |
""" |
118 | 124 |
# collection in which the available datasets are specified |
119 |
compareKeys = ['display-name', |
|
120 |
'display-color'] |
|
125 |
compareKeys = ['display-name', 'display-color'] |
|
121 | 126 |
collection_datasets = database_connection[MONGODB_DATASET_COLLECTION] |
122 | 127 |
|
123 | 128 |
query = {'key-name': config['dataset-name']} |
... | ... | |
139 | 144 |
collection_datasets.update_one(query, {"$set": newVal}) |
140 | 145 |
|
141 | 146 |
|
142 |
def update_devices_collection(config): |
|
147 |
def update_devices_collection(config: ConfigType):
|
|
143 | 148 |
""" |
144 | 149 |
Checks if there are any changes in devices specified in config file against |
145 | 150 |
devices processed and loaded into the database |
... | ... | |
164 | 169 |
|
165 | 170 |
devices_cursor = collection_devices.find() |
166 | 171 |
|
167 |
db_device_dict = dict()
|
|
172 |
db_device_dict = {}
|
|
168 | 173 |
|
169 | 174 |
for device in devices_cursor: |
170 | 175 |
name = device['name'] |
... | ... | |
208 | 213 |
return change_in_devices |
209 | 214 |
|
210 | 215 |
|
211 |
def remove_dataset_database(dataset_name): |
|
216 |
def remove_dataset_database(dataset_name: str):
|
|
212 | 217 |
""" |
213 | 218 |
Removes dataset entries from database |
214 | 219 |
Args: |
... | ... | |
221 | 226 |
collection_datasets = mydb[MONGODB_DATASET_COLLECTION] |
222 | 227 |
|
223 | 228 |
collection_datasets.delete_one({"key-name": dataset_name}) |
224 |
print("Removing record from DATASETS collection")
|
|
229 |
print("Odstraňování záznamu z DATASETS kolekce")
|
|
225 | 230 |
|
226 | 231 |
# Retrieve list of all collections |
227 | 232 |
collections = mydb.list_collection_names() |
... | ... | |
230 | 235 |
for name in collections: |
231 | 236 |
if name.startswith(dataset_name): |
232 | 237 |
mydb[name].drop() |
233 |
print("Dropping: " + name)
|
|
238 |
print("Odstraňuji: " + name)
|
|
234 | 239 |
|
235 | 240 |
|
236 |
def reset_dataset_database(dataset_name): |
|
241 |
def reset_dataset_database(dataset_name: str):
|
|
237 | 242 |
""" |
238 | 243 |
Reset dataset in database |
239 | 244 |
- delete everything except crawled links and the mention in the DATASETS collection |
... | ... | |
252 | 257 |
for name in collections: |
253 | 258 |
if pattern.match(name): |
254 | 259 |
mydb[name].drop() |
255 |
print("Dropping: " + name)
|
|
260 |
print("Odstraňuji: " + name)
|
|
256 | 261 |
|
257 | 262 |
database_record_logs.reset_ignore_set_loaded(dataset_name) |
Také k dispozici: Unified diff
Re #8193 - refactoring crawler