Projekt

Obecné

Profil

« Předchozí | Další » 

Revize 753d424e

Přidáno uživatelem Petr Hlaváč před asi 4 roky

Předělány komentáře, upravena kontrola datasetu

Zobrazit rozdíly:

modules/crawler/DatasetConfigs/KOLOBEZKY.yaml
33 33
  - stojan-borska:
34 34
      x: 49.734518
35 35
      y: 13.359475
36

  
modules/crawler/Utilities/Database/database_loader.py
241 241
    for name in collections:
242 242
        if pattern.match(name):
243 243
            mydb[name].drop()
244
            print("Dropping: " + name)
244 245

  
245
    database_record_logs.reset_ignore_set_processed(dataset_name)
246 246
    database_record_logs.reset_ignore_set_loaded(dataset_name)
modules/crawler/fully_clean_database.py
1
from Utilities.Database import database_loader
2

  
3
def clean_database():
4
    """
5
    Drops every collection in database
6
    """
7
    # Creating connection
8
    mydb = database_loader.create_database_connection()
9

  
10
    # Retrieve list of all collections
11
    collections = mydb.list_collection_names()
12

  
13
    # Drop of all collections
14
    for name in collections:
15
        print(name)
16
        mydb[name].drop()
17

  
18

  
19
print('Data z databáze budou smazána:')
20
clean_database()
modules/crawler/pipeline.py
86 86
    dataset_name = config["dataset-name"]
87 87
    dataset_path = dataset_name + '/'
88 88

  
89
    changes_in_devices = database_loader.update_devices_collection(config)
90

  
91
    if changes_in_devices == True:
92
        logging.info(dataset_name + " contains changes in devices configuration. Deleteing old data and preparing new")
93
        database_loader.reset_dataset_database(dataset_name)
94
        folder_processor.clean_folder(PROCESSED_DATA_PATH + dataset_path)
95

  
96

  
97 89
    process_file_func = __import__(PROCESSOR_LIB_PATH + dataset_name + "_processor", globals(), locals(),
98 90
                                   ['process_file']).process_file
99 91

  
......
194 186

  
195 187
    database_loader.check_or_update_datasets_collection(database_connection,config)
196 188

  
189
    changes_in_devices = database_loader.update_devices_collection(config)
190

  
191
    if changes_in_devices == True:
192
        logging.info(dataset_name + " contains changes in devices configuration. Deleting old data and preparing new")
193
        database_loader.reset_dataset_database(dataset_name)
194

  
197 195
    # get all unprocessed files from dataset
198 196
    ignore_set = database_record_logs.load_ignore_set_loaded(dataset_name)
199 197
    not_loaded_files = folder_processor.list_of_all_new_files(ignore_set,PROCESSED_DATA_PATH + dataset_path)
......
261 259
        dataset_name: name of dataset that has existing configuration file
262 260
    """
263 261
    logging.info("Starting pipeline for dataset " + dataset_name)
264
    print("Processing dataset " + dataset_name + " you can watch the progress in log contained in CrawlerLogs folder")
262
    print("Zpracovávám dataset " + dataset_name + " průběh lze sledovat v logu umístěném v in CrawlerLogs folder")
265 263
    
266 264
    config = configure_functions.load_configuration(dataset_name)
267 265
    crawl_data(config)
modules/crawler/reset_datasets.py
13 13
CONFIG_FILES_PATH = "DatasetConfigs"
14 14

  
15 15

  
16
def reset_dataset(dataset_name):
16
def hard_reset_dataset(dataset_name):
17 17
    """
18 18
    Resets all saved data in dataset except config and implementation
19 19
    Args:
......
29 29
    database_loader.remove_dataset_database(dataset_name)
30 30

  
31 31

  
32
def reset_all_datasets():
32
def soft_reset_dataset(dataset_name):
33
    """
34
    Resets all saved data in dataset except config and implementation
35
    Args:
36
        dataset_name: name of dataset that has existing configuration file
37
    """
38
    path = PROCESSED_DATA_PATH + dataset_name + "/"
39
    folder_processor.clean_folder(path)
40

  
41
    database_loader.remove_dataset_database(dataset_name)
42

  
43

  
44

  
45
def soft_reset_all_datasets():
33 46
    """
34 47
    Resets all saved data in all datasets with config file except configs and implementation
35 48
    """
36 49
    datasets = os.listdir(CONFIG_FILES_PATH)
37 50

  
38 51
    for dataset in datasets:
39
        reset_dataset(dataset.split('.')[0])
52
        soft_reset_dataset(dataset.split('.')[0])
40 53

  
41 54

  
55
def hard_reset_all_datasets():
56
    """
57
    Resets all saved data in all datasets with config file except configs and implementation
58
    """
59
    datasets = os.listdir(CONFIG_FILES_PATH)
60

  
61
    for dataset in datasets:
62
        hard_reset_dataset(dataset.split('.')[0])
63

  
42 64

  
43 65
print("Zadejte jméno Datasetu který chcete resetovat (pokud všechny zadejte '-ALL'):\n")
44 66

  
45 67
dataset_name = input().upper()
46 68

  
69
print("Chcete smazat i stažené stažená data ? (ANO/NE) \n")
70

  
71
vstup = input().upper()
72

  
47 73
if dataset_name == '-ALL':
48
    reset_all_datasets()
74
    if vstup == 'ANO':
75
        hard_reset_all_datasets()
76
    elif vstup == 'NE':
77
        soft_reset_all_datasets()
78
    else:
79
        print('Neplatný vstup (ANO/NE)')
49 80
else:
50 81
    test = configure_functions.check_if_there_is_a_config_file(dataset_name)
51 82
    if test == True:
52
        reset_dataset(dataset_name)
83
        if vstup == 'ANO':
84
            hard_reset_dataset(dataset_name)
85
        elif vstup == 'NE':
86
            soft_reset_dataset(dataset_name)
87
        else:
88
            print('Neplatný vstup (ANO/NE)')
53 89
    else:
54 90
        print("Tento dataset v architektuře neexistuje")
91

  

Také k dispozici: Unified diff