Revize 34baf808
Přidáno uživatelem Petr Hlaváč před asi 4 roky(ů)
modules/crawler/Pipeline.py | ||
---|---|---|
# Path to raw crawled data
CRAWLED_DATA_PATH = "CrawledData/"
# Path to processed data
PROCESSED_DATA_PATH = "ProcessedData/"
# Path to crawler logs (holds per-dataset update counters, e.g. updated.txt)
CRAWLER_LOGS_PATH = "CrawlerLogs/"
# Module-path prefix of dataset crawler implementations (note trailing dot: used for imports)
CRAWLER_LIB_PATH = "DatasetCrawler."
# Module-path prefix of dataset processor implementations (note trailing dot: used for imports)
PROCESSOR_LIB_PATH = "DatasetProcessing."
|
13 | 15 |
|
def check_last_update(config):
    """
    Check whether a dataset is due for an update.

    Reads an integer from CrawlerLogs/<dataset-name>/updated.txt representing
    the number of days since the last update. If that counter has reached the
    configured update period, resets it to zero and reports that an update
    should run; otherwise increments the counter.

    Args:
        config: loaded configuration of the dataset; must contain the keys
            "dataset-name" and "update-period".

    Returns:
        True if the dataset should be updated now,
        False if only the days-since-last-update counter was incremented.
    """
    dataset_name = config["dataset-name"]

    with open(CRAWLER_LOGS_PATH + dataset_name + "/" + "updated.txt", "r+") as file:
        last_update = int(file.read())
        # Rewind so the counter value below overwrites the old content.
        file.seek(0)

        config_update_period = int(config["update-period"])

        # BUG FIX: compare the parsed integer period, not the raw
        # config["update-period"] value — the raw value may be a string,
        # and "str <= int" raises TypeError in Python 3.
        if config_update_period <= last_update:
            print("Dataset " + dataset_name + " is being updated")
            file.write("0")
            file.truncate()
            return True
        else:
            last_update_days = last_update + 1
            # BUG FIX: added missing space before "days" in the message.
            print("Dataset " + dataset_name + " will be updated in "
                  + str(config_update_period - last_update_days) + " days")
            file.write(str(last_update_days))
            file.truncate()
            return False
14 | 53 |
def crawl_data(config): |
15 | 54 |
""" |
16 | 55 |
Imports dataset crawler in DatasetCrawler/"dataset_name"Crawler.py |
... | ... | |
123 | 162 |
|
124 | 163 |
if validation_test: |
125 | 164 |
load_data_to_database(config) |
165 |
print("Dataset " + dataset_name + " has been sucessfully updated\n") |
|
166 |
|
|
167 |
|
|
168 |
|
|
169 |
def run_full_pipeline_crone(dataset_name):
    """
    Load the dataset configuration and, when an update is due, run the
    full pipeline:
      - crawl data
      - process data
      - load data to the database

    Args:
        dataset_name: name of a dataset that has an existing configuration file
    """
    config = ConfigureFunctions.load_configuration(dataset_name)

    # Nothing to do unless the update period has elapsed.
    if not check_last_update(config):
        return

    crawl_data(config)
    process_data(config["dataset-name"])

    if validate_process_data(config):
        load_data_to_database(config)
Také k dispozici: Unified diff
Re #7965
implementovat skript pro cron, který spouští update datasetu podle configu