Revize 753d424e
Přidáno uživatelem Petr Hlaváč před téměř 5 lety
modules/crawler/DatasetConfigs/KOLOBEZKY.yaml | ||
---|---|---|
33 | 33 |
- stojan-borska: |
34 | 34 |
x: 49.734518 |
35 | 35 |
y: 13.359475 |
36 |
|
modules/crawler/Utilities/Database/database_loader.py | ||
---|---|---|
241 | 241 |
for name in collections: |
242 | 242 |
if pattern.match(name): |
243 | 243 |
mydb[name].drop() |
244 |
print("Dropping: " + name) |
|
244 | 245 |
|
245 |
database_record_logs.reset_ignore_set_processed(dataset_name) |
|
246 | 246 |
database_record_logs.reset_ignore_set_loaded(dataset_name) |
modules/crawler/fully_clean_database.py | ||
---|---|---|
1 |
from Utilities.Database import database_loader |
|
2 |
|
|
3 |
def clean_database(): |
|
4 |
""" |
|
5 |
Drops every collection in database |
|
6 |
""" |
|
7 |
# Creating connection |
|
8 |
mydb = database_loader.create_database_connection() |
|
9 |
|
|
10 |
# Retrieve list of all collections |
|
11 |
collections = mydb.list_collection_names() |
|
12 |
|
|
13 |
# Drop of all collections |
|
14 |
for name in collections: |
|
15 |
print(name) |
|
16 |
mydb[name].drop() |
|
17 |
|
|
18 |
|
|
19 |
print('Data z databáze budou smazána:') |
|
20 |
clean_database() |
modules/crawler/pipeline.py | ||
---|---|---|
86 | 86 |
dataset_name = config["dataset-name"] |
87 | 87 |
dataset_path = dataset_name + '/' |
88 | 88 |
|
89 |
changes_in_devices = database_loader.update_devices_collection(config) |
|
90 |
|
|
91 |
if changes_in_devices == True: |
|
92 |
logging.info(dataset_name + " contains changes in devices configuration. Deleteing old data and preparing new") |
|
93 |
database_loader.reset_dataset_database(dataset_name) |
|
94 |
folder_processor.clean_folder(PROCESSED_DATA_PATH + dataset_path) |
|
95 |
|
|
96 |
|
|
97 | 89 |
process_file_func = __import__(PROCESSOR_LIB_PATH + dataset_name + "_processor", globals(), locals(), |
98 | 90 |
['process_file']).process_file |
99 | 91 |
|
... | ... | |
194 | 186 |
|
195 | 187 |
database_loader.check_or_update_datasets_collection(database_connection,config) |
196 | 188 |
|
189 |
changes_in_devices = database_loader.update_devices_collection(config) |
|
190 |
|
|
191 |
if changes_in_devices == True: |
|
192 |
logging.info(dataset_name + " contains changes in devices configuration. Deleting old data and preparing new") |
|
193 |
database_loader.reset_dataset_database(dataset_name) |
|
194 |
|
|
197 | 195 |
# get all unprocessed files from dataset |
198 | 196 |
ignore_set = database_record_logs.load_ignore_set_loaded(dataset_name) |
199 | 197 |
not_loaded_files = folder_processor.list_of_all_new_files(ignore_set,PROCESSED_DATA_PATH + dataset_path) |
... | ... | |
261 | 259 |
dataset_name: name of dataset that has existing configuration file |
262 | 260 |
""" |
263 | 261 |
logging.info("Starting pipeline for dataset " + dataset_name) |
264 |
print("Processing dataset " + dataset_name + " you can watch the progress in log contained in CrawlerLogs folder")
|
|
262 |
print("Zpracovávám dataset " + dataset_name + " průběh lze sledovat v logu umístěném v in CrawlerLogs folder")
|
|
265 | 263 |
|
266 | 264 |
config = configure_functions.load_configuration(dataset_name) |
267 | 265 |
crawl_data(config) |
modules/crawler/reset_datasets.py | ||
---|---|---|
13 | 13 |
CONFIG_FILES_PATH = "DatasetConfigs" |
14 | 14 |
|
15 | 15 |
|
16 |
def reset_dataset(dataset_name): |
|
16 |
def hard_reset_dataset(dataset_name):
|
|
17 | 17 |
""" |
18 | 18 |
Resets all saved data in dataset except config and implementation |
19 | 19 |
Args: |
... | ... | |
29 | 29 |
database_loader.remove_dataset_database(dataset_name) |
30 | 30 |
|
31 | 31 |
|
32 |
def reset_all_datasets(): |
|
32 |
def soft_reset_dataset(dataset_name): |
|
33 |
""" |
|
34 |
Resets all saved data in dataset except config and implementation |
|
35 |
Args: |
|
36 |
dataset_name: name of dataset that has existing configuration file |
|
37 |
""" |
|
38 |
path = PROCESSED_DATA_PATH + dataset_name + "/" |
|
39 |
folder_processor.clean_folder(path) |
|
40 |
|
|
41 |
database_loader.remove_dataset_database(dataset_name) |
|
42 |
|
|
43 |
|
|
44 |
|
|
45 |
def soft_reset_all_datasets(): |
|
33 | 46 |
""" |
34 | 47 |
Resets all saved data in all datasets with config file except configs and implementation |
35 | 48 |
""" |
36 | 49 |
datasets = os.listdir(CONFIG_FILES_PATH) |
37 | 50 |
|
38 | 51 |
for dataset in datasets: |
39 |
reset_dataset(dataset.split('.')[0]) |
|
52 |
soft_reset_dataset(dataset.split('.')[0])
|
|
40 | 53 |
|
41 | 54 |
|
55 |
def hard_reset_all_datasets(): |
|
56 |
""" |
|
57 |
Resets all saved data in all datasets with config file except configs and implementation |
|
58 |
""" |
|
59 |
datasets = os.listdir(CONFIG_FILES_PATH) |
|
60 |
|
|
61 |
for dataset in datasets: |
|
62 |
hard_reset_dataset(dataset.split('.')[0]) |
|
63 |
|
|
42 | 64 |
|
43 | 65 |
print("Zadejte jméno Datasetu který chcete resetovat (pokud všechny zadejte '-ALL'):\n") |
44 | 66 |
|
45 | 67 |
dataset_name = input().upper() |
46 | 68 |
|
69 |
print("Chcete smazat i stažené stažená data ? (ANO/NE) \n") |
|
70 |
|
|
71 |
vstup = input().upper() |
|
72 |
|
|
47 | 73 |
if dataset_name == '-ALL': |
48 |
reset_all_datasets() |
|
74 |
if vstup == 'ANO': |
|
75 |
hard_reset_all_datasets() |
|
76 |
elif vstup == 'NE': |
|
77 |
soft_reset_all_datasets() |
|
78 |
else: |
|
79 |
print('Neplatný vstup (ANO/NE)') |
|
49 | 80 |
else: |
50 | 81 |
test = configure_functions.check_if_there_is_a_config_file(dataset_name) |
51 | 82 |
if test == True: |
52 |
reset_dataset(dataset_name) |
|
83 |
if vstup == 'ANO': |
|
84 |
hard_reset_dataset(dataset_name) |
|
85 |
elif vstup == 'NE': |
|
86 |
soft_reset_dataset(dataset_name) |
|
87 |
else: |
|
88 |
print('Neplatný vstup (ANO/NE)') |
|
53 | 89 |
else: |
54 | 90 |
print("Tento dataset v architektuře neexistuje") |
91 |
|
Také k dispozici: Unified diff
Předělány komentáře, upravena kontrola datasetu