Projekt

Obecné

Profil

Stáhnout (4.4 KB) Statistiky
| Větev: | Revize:
1 70e660a8 petrh
from Utilities import FolderProcessor, ConfigureFunctions
2 c8f3051b petrh
from Utilities.Database import DatabaseLoader
3
# Path to crawled data
CRAWLED_DATA_PATH = "CrawledData/"
# Path to processed data
PROCESSED_DATA_PATH = "ProcessedData/"
# Dotted import-path prefix for dataset crawler implementations
CRAWLER_LIB_PATH = "DatasetCrawler."
# Dotted import-path prefix for dataset processor implementations
PROCESSOR_LIB_PATH = "DatasetProcessing."
14
def crawl_data(config):
    """
    Imports the dataset crawler from DatasetCrawler/<dataset-name>Crawler.py
    and runs its crawl() function.

    Args:
        config: loaded configuration file of dataset
    """
    dataset_name = config["dataset-name"]

    # Dynamically import the dataset-specific crawler module and grab its
    # crawl() entry point; the fromlist argument makes __import__ return the
    # leaf module rather than the top-level package.
    crawl = __import__(CRAWLER_LIB_PATH + dataset_name + "Crawler",
                       globals(), locals(), ['crawl']).crawl
    crawl(config)
    # NOTE: removed dead statement `dataset_name += '/'` — it mutated a local
    # immediately before the function returned, with no observable effect.
30
def process_data(dataset_name):
    """
    Goes through every not-yet-processed file (those absent from
    CrawledData/<dataset_name>/ignore.txt), imports the dataset processor
    from DatasetProcessing/<dataset_name>Processor.py, runs it on each file,
    and records each success in ignore.txt.

    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    dataset_path = dataset_name + '/'
    source_dir = CRAWLED_DATA_PATH + dataset_path

    # Dynamically import the dataset-specific processor and bind its
    # process_file() entry point.
    processor_module = __import__(PROCESSOR_LIB_PATH + dataset_name + "Processor",
                                  globals(), locals(), ['process_file'])
    process_file = processor_module.process_file

    # Process each pending file, then mark it done so it is skipped next run.
    for file_name in FolderProcessor.list_of_all_files(source_dir):
        process_file(source_dir + file_name)
        FolderProcessor.update_ignore_set(source_dir, file_name)
51
52 70e660a8 petrh
def validate_process_data(config):
    """
    Goes through newly processed data and checks its status.

    Args:
        config: loaded configuration file of dataset

    Returns:
        bool: True when data was processed correctly; False when there are
        new unknown devices (the config file is updated with them first) or
        when a configured device still has placeholder "UNKNOWN!" coordinates.
    """
    dataset_name = config["dataset-name"]
    processed_devices_set = FolderProcessor.get_devices_set(PROCESSED_DATA_PATH + dataset_name + '/')
    unknown_devices_set = FolderProcessor.get_unknown_devices_set(config, processed_devices_set)
    unknown_devices_size = len(unknown_devices_set)

    if unknown_devices_size != 0:
        # Fixed grammar: "There is N devices" -> "There are N devices".
        print("There are " + str(unknown_devices_size) + " unknown devices\n")
        print("Adding devices to " + dataset_name + " config file\n")
        ConfigureFunctions.update_configuration(dataset_name, unknown_devices_set)
        return False

    # Iterate device configs directly instead of reusing the key variable
    # (the original shadowed `device` with the looked-up value).
    for device_conf in config["devices"].values():
        if device_conf["x"] == "UNKNOWN!" or device_conf["y"] == "UNKNOWN!":
            # Fixed message typo "UNKOWN!" -> "UNKNOWN!" to match the actual
            # placeholder value being checked above.
            print(dataset_name + " Config file contains devices with UNKNOWN! values please update them\n")
            return False

    return True
82 70e660a8 petrh
83 c8f3051b petrh
def load_data_to_database(config):
    """
    Goes through every file not yet loaded (absent from ProcessedData/ignore.txt),
    loads its data, appends coordinates from the configuration, and exports it
    into the database. Updates ignore.txt after each successful export.

    Args:
        config: loaded configuration file of dataset
    """
    dataset_name = config["dataset-name"]
    source_dir = PROCESSED_DATA_PATH + dataset_name + '/'

    # Export every pending file, marking each as done once it is in the DB.
    for file_name in FolderProcessor.list_of_all_files(source_dir):
        # Read the processed records, enriched with config coordinates.
        processed_data = DatabaseLoader.get_data_from_file(file_name, config)
        # Push the records into the database under this dataset's name.
        DatabaseLoader.load_data_to_database(dataset_name, processed_data)
        FolderProcessor.update_ignore_set(source_dir, file_name)
107
108
def run_full_pipeline(dataset_name):
    """
    Loads the dataset's config file and runs the full pipeline:
    crawl data, process data, validate, then load into the database.

    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    config = ConfigureFunctions.load_configuration(dataset_name)

    crawl_data(config)
    process_data(config["dataset-name"])

    # Only load into the database when the processed data validated cleanly.
    if validate_process_data(config):
        load_data_to_database(config)