from Utilities import FolderProcessor, ConfigureFunctions
from Utilities.Database import DatabaseLoader

# Path to crawled data
CRAWLED_DATA_PATH = "CrawledData/"
# Path to processed data
PROCESSED_DATA_PATH = "ProcessedData/"
# Path to crawler logs
CRAWLER_LOGS_PATH = "CrawlerLogs/"
# Module prefix for dataset crawler implementations
CRAWLER_LIB_PATH = "DatasetCrawler."
# Module prefix for dataset processor implementations
PROCESSOR_LIB_PATH = "DatasetProcessing."
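
# The dataset configuration used throughout this module is assumed (based on
# the keys read below) to provide at least the following entries; the exact
# file format is defined by ConfigureFunctions and is not shown here:
#
#   dataset-name: ExampleDataset     # hypothetical name
#   update-period: 7                 # days between updates (illustrative value)
#   devices:
#       some-device-id:              # hypothetical device id
#           x: 50.0                  # coordinates, or the placeholder UNKNOWN!
#           y: 14.0
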
def check_last_update(config):
    """
    Loads an integer from updated.txt in CrawlerLogs/"dataset_name"
    representing the number of days since the last update. If that number
    has reached the update period given in the config, the counter is reset
    to zero and the dataset is updated; otherwise the counter is incremented.

    Args:
        config: loaded configuration file of dataset

    Returns:
        True if the dataset should be updated now
        False if only the days-since-last-update counter was incremented
    """
    dataset_name = config["dataset-name"]

    with open(CRAWLER_LOGS_PATH + dataset_name + "/" + "updated.txt", "r+") as file:
        last_update = int(file.read())
        file.seek(0)

        config_update_period = int(config["update-period"])

        if config_update_period <= last_update:
            print("Dataset " + dataset_name + " is being updated")
            file.write("0")
            file.truncate()
            return True
        else:
            last_update_days = last_update + 1
            print("Dataset " + dataset_name + " will be updated in " +
                  str(config_update_period - last_update_days) + " days")
            file.write(str(last_update_days))
            file.truncate()
            return False
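
# check_last_update() assumes CrawlerLogs/"dataset_name"/updated.txt already
# exists and contains an integer. A minimal initialisation sketch for a new
# dataset (hypothetical name), run once before the first scheduled update:
#
#   import os
#   os.makedirs(CRAWLER_LOGS_PATH + "ExampleDataset", exist_ok=True)
#   with open(CRAWLER_LOGS_PATH + "ExampleDataset/updated.txt", "w") as file:
#       file.write("0")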


def crawl_data(config):
    """
    Imports the dataset crawler from DatasetCrawler/"dataset_name"Crawler.py
    and runs it.

    Args:
        config: loaded configuration file of dataset
    """
    dataset_name = config["dataset-name"]

    crawl_func = __import__(CRAWLER_LIB_PATH + dataset_name + "Crawler", globals(), locals(), ['crawl']).crawl
    crawl_func(config)
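
# Each crawler module is expected to expose a module-level crawl(config)
# function (that is what the __import__ above pulls in). A minimal sketch of
# DatasetCrawler/ExampleDatasetCrawler.py, with a hypothetical dataset name
# and assuming crawlers drop their raw output into CrawledData/"dataset_name"/:
#
#   def crawl(config):
#       dataset_name = config["dataset-name"]
#       # download the raw files into CRAWLED_DATA_PATH + dataset_name + "/"
#       ...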


def process_data(dataset_name):
    """
    Goes through every file that has not been processed yet (i.e. is not
    listed in CrawledData/"dataset_name"/ignore.txt).
    Imports the dataset processor from DatasetProcessing/"dataset_name"Processor.py
    and runs it on every such file.
    After successful processing updates ignore.txt.

    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    dataset_path = dataset_name + '/'

    process_file_func = __import__(PROCESSOR_LIB_PATH + dataset_name + "Processor", globals(), locals(),
                                   ['process_file']).process_file

    not_processed_files = FolderProcessor.list_of_all_files(CRAWLED_DATA_PATH + dataset_path)

    for not_processed_file in not_processed_files:
        process_file_func(CRAWLED_DATA_PATH + dataset_path + not_processed_file)
        FolderProcessor.update_ignore_set(CRAWLED_DATA_PATH + dataset_path, not_processed_file)
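
# Each processor module is expected to expose a module-level process_file()
# function taking the path of one crawled file (that is what the __import__
# above pulls in). A minimal sketch of DatasetProcessing/ExampleDatasetProcessor.py,
# with a hypothetical dataset name and assuming processors write their results
# into ProcessedData/"dataset_name"/:
#
#   def process_file(file_path):
#       # parse the crawled file and write the processed result
#       ...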


def validate_process_data(config):
    """
    Goes through the newly processed data and checks their status.

    Args:
        config: loaded configuration file of dataset

    Returns:
        True if the data were processed correctly
        False on wrong format or new unknown devices
    """
    processed_devices_set = FolderProcessor.get_devices_set(PROCESSED_DATA_PATH + config["dataset-name"] + '/')
    unknown_devices_set = FolderProcessor.get_unknown_devices_set(config, processed_devices_set)
    unknown_devices_size = len(unknown_devices_set)

    if unknown_devices_size != 0:
        print("There are " + str(unknown_devices_size) + " unknown devices\n")
        print("Adding devices to " + config["dataset-name"] + " config file\n")
        ConfigureFunctions.update_configuration(config["dataset-name"], unknown_devices_set)
        return False

    for device in config["devices"].values():
        if device["x"] == "UNKNOWN!" or device["y"] == "UNKNOWN!":
            print(config["dataset-name"] + " config file contains devices with UNKNOWN! values, please update them\n")
            return False

    return True
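
# After a failed validation the dataset config is expected to contain the
# newly discovered devices with placeholder coordinates for a human to fill in
# (a hypothetical excerpt; the exact format is defined by ConfigureFunctions):
#
#   devices:
#       new-device-id:
#           x: UNKNOWN!
#           y: UNKNOWN!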


def load_data_to_database(config):
    """
    Goes through every file that has not been loaded yet (i.e. is not
    listed in ProcessedData/"dataset_name"/ignore.txt),
    loads the data, appends coordinates from the configuration
    and exports it into the database.
    After successful exporting updates ignore.txt.

    Args:
        config: loaded configuration file of dataset
    """
    dataset_name = config["dataset-name"]
    dataset_path = dataset_name + '/'

    # get all not yet loaded files from the dataset
    not_loaded_files = FolderProcessor.list_of_all_files(PROCESSED_DATA_PATH + dataset_path)

    # load every file
    for not_loaded_file in not_loaded_files:
        # load processed data
        processed_data = DatabaseLoader.get_data_from_file(not_loaded_file, config)
        # export processed data into the database
        DatabaseLoader.load_data_to_database(dataset_name, processed_data)
        FolderProcessor.update_ignore_set(PROCESSED_DATA_PATH + dataset_path, not_loaded_file)


def run_full_pipeline(dataset_name):
    """
    Loads the config file and runs the full pipeline:
    - crawl data
    - process data
    - load data to database

    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    config = ConfigureFunctions.load_configuration(dataset_name)
    crawl_data(config)
    process_data(config["dataset-name"])

    validation_test = validate_process_data(config)

    if validation_test:
        load_data_to_database(config)
        print("Dataset " + dataset_name + " has been successfully updated\n")


def run_full_pipeline_crone(dataset_name):
    """
    Loads the config file and runs the full pipeline, but only once the
    dataset's update period has elapsed; intended to be run periodically,
    e.g. from cron:
    - crawl data
    - process data
    - load data to database

    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    config = ConfigureFunctions.load_configuration(dataset_name)
    update_test = check_last_update(config)
    if update_test:
        crawl_data(config)
        process_data(config["dataset-name"])

        validation_test = validate_process_data(config)

        if validation_test:
            load_data_to_database(config)
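

# A minimal usage sketch with a hypothetical dataset name; "ExampleDataset" is
# assumed to have a configuration file loadable by ConfigureFunctions:
#
#   if __name__ == "__main__":
#       # one-off full run
#       run_full_pipeline("ExampleDataset")
#       # or from a daily scheduler job, honouring the configured update period
#       run_full_pipeline_crone("ExampleDataset")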