Revision 43697fec
Added by Petr Hlaváč almost 5 years ago
modules/crawler/Pipeline.py

  1 |   1 |   from Utilities import FolderProcessor, ConfigureFunctions
  2 |   2 |   from Utilities.Database import DatabaseLoader
  3 |   3 |
    |   4 | + import logging
    |   5 | + from datetime import date
    |   6 | +
    |   7 | +
  4 |   8 |   # Path to crawled data
  5 |   9 |   CRAWLED_DATA_PATH = "CrawledData/"
  6 |  10 |   # Path to processed data
... | ... |
 13 |  17 |   PROCESSOR_LIB_PATH = "DatasetProcessing."
 14 |  18 |
 15 |  19 |
    |  20 | + #logger
    |  21 | + logging.basicConfig(filename=CRAWLER_LOGS_PATH + "CommonRecords/" + 'Applicationlog-' + date.today().strftime("%b-%Y") + '.log',
    |  22 | +                     level=logging.INFO,
    |  23 | +                     format='%(asctime)s %(message)s'
    |  24 | +                     )
    |  25 | +
    |  26 | +
 16 |  27 |   def check_last_update(config):
 17 |  28 |       """
 18 |  29 |       Loads integer from updated.txt in CrawlerLogs/"dataset_name"
... | ... |
 37 |  48 |       confing_update_period = int(config["update-period"])
 38 |  49 |
 39 |  50 |       if config["update-period"] <= last_update:
 40 |     | -         print("Dataset " + dataset_name + " is being updated")
    |  51 | +         logging.info("Dataset " + dataset_name + " is being updated today")
 41 |  52 |           file.write("0")
 42 |  53 |           file.truncate()
 43 |  54 |           return True
 44 |  55 |       else:
 45 |  56 |           last_update_days = last_update + 1
 46 |     | -         print("Dataset " + dataset_name + " will be updated in " + str(confing_update_period - last_update_days) + "days")
    |  57 | +         logging.info("Dataset " + dataset_name + " will be updated in " + str(confing_update_period - last_update_days) + "days")
 47 |  58 |           file.write(str(last_update_days))
 48 |  59 |           file.truncate()
 49 |  60 |           return False
... | ... |
 60 |  71 |       """
 61 |  72 |       dataset_name = config["dataset-name"]
 62 |  73 |
 63 |     | -     my_function = __import__(CRAWLER_LIB_PATH + dataset_name + "Crawler", globals(), locals(), ['crawl']).crawl
 64 |     | -     my_function(config)
    |  74 | +     crawl_func = __import__(CRAWLER_LIB_PATH + dataset_name + "Crawler", globals(), locals(), ['crawl']).crawl
    |  75 | +     crawl_func(config)
 65 |  76 |
 66 |  77 |       dataset_name += '/'
 67 |  78 |
... | ... |
 82 |  93 |                                      ['process_file']).process_file
 83 |  94 |
 84 |  95 |       not_processed_files = FolderProcessor.list_of_all_files(CRAWLED_DATA_PATH + dataset_path)
    |  96 | +     logging.info(dataset_name + " has downloaded " + str(len(not_processed_files)) + " new files")
 85 |  97 |
 86 |  98 |       for not_processed_file in not_processed_files:
 87 |  99 |           process_file_func(CRAWLED_DATA_PATH + dataset_path + not_processed_file)
 88 | 100 |           FolderProcessor.update_ignore_set(CRAWLED_DATA_PATH + dataset_path, not_processed_file)
 89 | 101 |
    | 102 | +     logging.info(dataset_name + " has processed " + str(len(not_processed_files)) + " newly crawled files")
    | 103 | +
 90 | 104 |
 91 | 105 |   def validate_process_data(config):
 92 | 106 |       """
... | ... |
105 | 119 |       unknown_devices_size = len(unknown_devices_set)
106 | 120 |
107 | 121 |       if unknown_devices_size != 0:
108 |     | -         print("There is " + str(unknown_devices_size) + " unknown devices\n")
109 |     | -         print("Adding devices to " + config["dataset-name"] + " config file\n")
    | 122 | +         logging.info("There is " + str(unknown_devices_size) + " unknown devices")
    | 123 | +         logging.info("Adding devices to " + config["dataset-name"] + " config file")
110 | 124 |           ConfigureFunctions.update_configuration(config["dataset-name"], unknown_devices_set)
111 | 125 |           return False
112 | 126 |
113 | 127 |       for device in config["devices"]:
114 | 128 |           device = config["devices"][device]
115 | 129 |           if device["x"] == "UNKNOWN!" or device["y"] == "UNKNOWN!":
116 |     | -             print(config["dataset-name"] + " Config file contains devices with UNKOWN! values please update them\n")
    | 130 | +             logging.info(config["dataset-name"] + " config file contains devices with UNKOWN! values please update them!!")
117 | 131 |               return False
118 | 132 |
119 | 133 |       return True
... | ... |
143 | 157 |           DatabaseLoader.load_data_to_database(dataset_name, processed_data)
144 | 158 |           FolderProcessor.update_ignore_set(PROCESSED_DATA_PATH + dataset_path, not_loaded_file)
145 | 159 |
    | 160 | +     logging.info(dataset_name + " has loaded to databse " + str(len(not_loaded_files)) + " newly processed files.")
    | 161 | +
146 | 162 |
147 | 163 |   def run_full_pipeline(dataset_name):
148 | 164 |       """
... | ... |
154 | 170 |       Args:
155 | 171 |           dataset_name: name of dataset that has existing configuration file
156 | 172 |       """
    | 173 | +     logging.info("Starting pipeline for dataset " + dataset_name)
    | 174 | +
157 | 175 |       config = ConfigureFunctions.load_configuration(dataset_name)
158 | 176 |       crawl_data(config)
159 | 177 |       process_data(config["dataset-name"])
... | ... |
162 | 180 |
163 | 181 |       if validation_test:
164 | 182 |           load_data_to_database(config)
165 |     | -         print("Dataset " + dataset_name + " has been sucessfully updated\n")
166 | 183 |
167 | 184 |
168 | 185 |
... | ... |
176 | 193 |       Args:
177 | 194 |           dataset_name: name of dataset that has existing configuration file
178 | 195 |       """
    | 196 | +     logging.info("Starting pipeline for dataset " + dataset_name)
    | 197 | +
179 | 198 |       config = ConfigureFunctions.load_configuration(dataset_name)
180 | 199 |       update_test = check_last_update(config)
181 | 200 |       if update_test:
... | ... |
185 | 204 |           validation_test = validate_process_data(config)
186 | 205 |
187 | 206 |           if validation_test:
188 |     | -             load_data_to_database(config)
    | 207 | +             load_data_to_database(config)
    | 208 | +
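The heart of this revision is the month-stamped application log configured when Pipeline.py is imported. Below is a minimal, runnable sketch of that setup; CRAWLER_LOGS_PATH = "CrawlerLogs/" is an assumption (the constant's definition falls outside the shown hunks), and logging.basicConfig does not create missing directories, so CrawlerLogs/CommonRecords/ has to exist beforehand.

import logging
from datetime import date

# Assumed constant; the diff only shows it being used, not defined.
CRAWLER_LOGS_PATH = "CrawlerLogs/"

# One shared application log per month, e.g. CommonRecords/Applicationlog-May-2020.log.
# basicConfig() must run before the first logging call, otherwise the default
# stderr handler is installed instead and no log file is created.
logging.basicConfig(
    filename=CRAWLER_LOGS_PATH + "CommonRecords/"
             + "Applicationlog-" + date.today().strftime("%b-%Y") + ".log",
    level=logging.INFO,
    format="%(asctime)s %(message)s",
)

logging.info("Dataset sample-dataset is being updated today")  # illustrative record only

Note that the filename is evaluated only once, at import time, so a process that keeps running past the end of a month continues writing into the file for the month it started in.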
Also available: Unified diff
Re #7937
Added logging to the CrawlerLogs/CommonRecords folder
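For context on the crawl_func rename inside crawl_data(), the crawler module for each dataset is resolved dynamically from the dataset name, mirroring how process_file is looked up via PROCESSOR_LIB_PATH. A rough sketch of that dispatch pattern follows; the CRAWLER_LIB_PATH value and the FooCrawler module name are hypothetical, chosen by analogy with PROCESSOR_LIB_PATH = "DatasetProcessing.", not values taken from the diff.

import logging

# Hypothetical prefix, by analogy with PROCESSOR_LIB_PATH = "DatasetProcessing."
CRAWLER_LIB_PATH = "DatasetCrawler."

def crawl_data(config):
    """Resolve <dataset-name>Crawler.crawl at runtime and invoke it."""
    dataset_name = config["dataset-name"]

    # With a non-empty fromlist, __import__ returns the submodule itself
    # (e.g. the hypothetical DatasetCrawler.FooCrawler), so .crawl can be read off it directly.
    crawler_module = __import__(CRAWLER_LIB_PATH + dataset_name + "Crawler",
                                globals(), locals(), ['crawl'])
    crawl_func = crawler_module.crawl
    crawl_func(config)

importlib.import_module(CRAWLER_LIB_PATH + dataset_name + "Crawler") would be the more idiomatic spelling of the same lookup and avoids the fromlist argument.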