from Utilities import FolderProcessor, ConfigureFunctions
from Utilities.Database import DatabaseLoader

# Path to crawled data
CRAWLED_DATA_PATH = "CrawledData/"
# Path to processed data
PROCESSED_DATA_PATH = "ProcessedData/"
# Path to crawler logs
CRAWLER_LOGS_PATH = "CrawlerLogs/"
# Module path prefix for dataset crawler implementations
CRAWLER_LIB_PATH = "DatasetCrawler."
# Module path prefix for dataset processor implementations
PROCESSOR_LIB_PATH = "DatasetProcessing."

def check_last_update(config):
    """
    Loads an integer from updated.txt in CrawlerLogs/"dataset_name"
    representing the number of days since the last update. If it has
    reached the update period from the config, resets the counter to
    zero; otherwise increments it.

    Args:
        config: loaded configuration file of dataset

    Returns:
        True if the dataset is due for an update,
        False if only the day counter was incremented
    """
    dataset_name = config["dataset-name"]

    with open(CRAWLER_LOGS_PATH + dataset_name + "/" + "updated.txt", "r+") as file:
        last_update = int(file.read())
        # rewind so the counter can be overwritten in place
        file.seek(0)

        config_update_period = int(config["update-period"])

        if config_update_period <= last_update:
            print("Dataset " + dataset_name + " is being updated")
            file.write("0")
            file.truncate()
            return True
        else:
            last_update_days = last_update + 1
            print("Dataset " + dataset_name + " will be updated in " +
                  str(config_update_period - last_update_days) + " days")
            file.write(str(last_update_days))
            file.truncate()
            return False
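
# Note: check_last_update assumes that CrawlerLogs/"dataset_name"/updated.txt already
# exists. A minimal sketch for seeding it when a new dataset is added (the helper name
# prepare_crawler_logs is an assumption, not part of this project):
#
#     import os
#
#     def prepare_crawler_logs(dataset_name):
#         os.makedirs(CRAWLER_LOGS_PATH + dataset_name, exist_ok=True)
#         with open(CRAWLER_LOGS_PATH + dataset_name + "/updated.txt", "w") as file:
#             file.write("0")  # zero days since the last update
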
def crawl_data(config):
    """
    Imports the dataset crawler from DatasetCrawler/"dataset_name"Crawler.py
    and runs its crawl function.

    Args:
        config: loaded configuration file of dataset
    """
    dataset_name = config["dataset-name"]

    # dynamically import the dataset-specific crawl function and run it
    crawl_func = __import__(CRAWLER_LIB_PATH + dataset_name + "Crawler", globals(), locals(), ['crawl']).crawl
    crawl_func(config)
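
# Note: crawl_data expects a module at DatasetCrawler/"dataset_name"Crawler.py
# exposing a crawl(config) function. A minimal sketch of such a module, with the
# dataset name "Example" and the download logic left as assumptions:
#
#     # DatasetCrawler/ExampleCrawler.py
#     def crawl(config):
#         # fetch raw files and store them under CrawledData/"dataset-name"/
#         ...
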
def process_data(dataset_name):
    """
    Goes through every file not yet processed (i.e. not listed in
    CrawledData/"dataset_name"/ignore.txt), imports the dataset processor
    from DatasetProcessing/"dataset_name"Processor.py and runs it on each
    file. After successful processing, updates ignore.txt.

    Args:
        dataset_name: name of dataset that has an existing configuration file
    """
    dataset_path = dataset_name + '/'

    # dynamically import the dataset-specific process_file function
    process_file_func = __import__(PROCESSOR_LIB_PATH + dataset_name + "Processor", globals(), locals(),
                                   ['process_file']).process_file

    not_processed_files = FolderProcessor.list_of_all_files(CRAWLED_DATA_PATH + dataset_path)

    for not_processed_file in not_processed_files:
        process_file_func(CRAWLED_DATA_PATH + dataset_path + not_processed_file)
        FolderProcessor.update_ignore_set(CRAWLED_DATA_PATH + dataset_path, not_processed_file)
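
# Note: process_data expects a module at DatasetProcessing/"dataset_name"Processor.py
# exposing a process_file(path) function. A minimal sketch of such a module, with the
# parsing and output steps left as assumptions:
#
#     # DatasetProcessing/ExampleProcessor.py
#     def process_file(crawled_file_path):
#         # parse the crawled file and write its processed form into ProcessedData/
#         ...
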
def validate_process_data(config):
    """
    Goes through the newly processed data and checks its status.

    Args:
        config: loaded configuration file of dataset

    Returns:
        True if the data was processed correctly,
        False on wrong format or new unknown devices
    """
    processed_devices_set = FolderProcessor.get_devices_set(PROCESSED_DATA_PATH + config["dataset-name"] + '/')
    unknown_devices_set = FolderProcessor.get_unknown_devices_set(config, processed_devices_set)
    unknown_devices_size = len(unknown_devices_set)

    if unknown_devices_size != 0:
        print("There are " + str(unknown_devices_size) + " unknown devices\n")
        print("Adding devices to " + config["dataset-name"] + " config file\n")
        ConfigureFunctions.update_configuration(config["dataset-name"], unknown_devices_set)
        return False

    # every device must have known coordinates before loading to the database
    for device in config["devices"]:
        device = config["devices"][device]
        if device["x"] == "UNKNOWN!" or device["y"] == "UNKNOWN!":
            print(config["dataset-name"] + " config file contains devices with UNKNOWN! values, please update them\n")
            return False

    return True
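
# Note: validate_process_data assumes every entry in config["devices"] carries "x" and
# "y" coordinates, with the literal string "UNKNOWN!" as the not-yet-filled placeholder.
# A sketch of the expected shape (format and device names are illustrative only):
#
#     devices:
#         device-1:
#             x: 49.2099
#             y: 16.5990
#         device-2:
#             x: UNKNOWN!
#             y: UNKNOWN!
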
def load_data_to_database(config):
    """
    Goes through every file not yet loaded (i.e. not listed in
    ProcessedData/ignore.txt), loads the data, appends coordinates from the
    configuration and exports it into the database. After successful
    exporting, updates ignore.txt.

    Args:
        config: loaded configuration file of dataset
    """
    dataset_name = config["dataset-name"]
    dataset_path = dataset_name + '/'

    # get all not yet loaded files from the dataset
    not_loaded_files = FolderProcessor.list_of_all_files(PROCESSED_DATA_PATH + dataset_path)

    # load every file
    for not_loaded_file in not_loaded_files:
        # load processed data
        processed_data = DatabaseLoader.get_data_from_file(not_loaded_file, config)
        # load processed data to database
        DatabaseLoader.load_data_to_database(dataset_name, processed_data)
        FolderProcessor.update_ignore_set(PROCESSED_DATA_PATH + dataset_path, not_loaded_file)
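
# Note: the DatabaseLoader contract assumed here is that get_data_from_file(file, config)
# returns the processed records merged with device coordinates from the configuration,
# and load_data_to_database(dataset_name, data) writes them to the database; the exact
# record format is internal to Utilities.Database.DatabaseLoader.
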
def run_full_pipeline(dataset_name):
    """
    Loads the config file and starts the full pipeline:
    - crawl data
    - process data
    - load data to database

    Args:
        dataset_name: name of dataset that has an existing configuration file
    """
    config = ConfigureFunctions.load_configuration(dataset_name)
    crawl_data(config)
    process_data(config["dataset-name"])

    validation_test = validate_process_data(config)

    if validation_test:
        load_data_to_database(config)
        print("Dataset " + dataset_name + " has been successfully updated\n")


def run_full_pipeline_crone(dataset_name):
    """
    Loads the config file and, if the dataset's update period has elapsed,
    starts the full pipeline (meant for scheduled, e.g. daily cron, runs):
    - crawl data
    - process data
    - load data to database

    Args:
        dataset_name: name of dataset that has an existing configuration file
    """
    config = ConfigureFunctions.load_configuration(dataset_name)
    update_test = check_last_update(config)
    if update_test:
        crawl_data(config)
        process_data(config["dataset-name"])

        validation_test = validate_process_data(config)

        if validation_test:
            load_data_to_database(config)
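

# A minimal usage sketch, assuming a dataset named "Example" with an existing
# configuration file: run_full_pipeline forces an immediate update, while
# run_full_pipeline_crone is what a daily cron job would call.
if __name__ == "__main__":
    run_full_pipeline("Example")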