Revision 04a2b5a4
Added by Petr Hlaváč about 4 years ago
python-module/Pipeline.py

--- python-module/Pipeline.py
+++ python-module/Pipeline.py
@@ -1,11 +1,13 @@
 from Utilities import FolderProcessor, ConfigureFunctions
 from Utilities.Database import DatabaseLoader
 
-
-CONFIG_FILES_PATH = "DatasetConfigs/"
+# Path to crawled data
 CRAWLED_DATA_PATH = "CrawledData/"
+# Path to processed data
 PROCESSED_DATA_PATH = "ProcessedData/"
+# Path to dataset crawler implementations
 CRAWLER_LIB_PATH = "DatasetCrawler."
+# Path to dataset processor implementations
 PROCESSOR_LIB_PATH = "DatasetProcessing."
 
 
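For orientation: these constants, together with the docstrings added below, imply roughly the following per-dataset layout. The tree itself is not part of the revision and is only inferred from the paths used in this file:

    python-module/
        DatasetConfigs/<dataset-name>.yaml            # dataset configuration (referenced via CONFIG_FILES_PATH before this revision)
        CrawledData/<dataset-name>/                   # raw crawler output + ignore.txt bookkeeping
        ProcessedData/<dataset-name>/                 # processed files + ignore.txt bookkeeping
        DatasetCrawler/<dataset-name>Crawler.py       # must expose crawl(config)
        DatasetProcessing/<dataset-name>Processor.py  # must expose process_file(path)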
@@ -12,10 +14,16 @@
 def crawl_data(config):
+    """
+    Imports dataset crawler in DatasetCrawler/"dataset_name"Crawler.py
+    runs crawler.
 
+    Args:
+        config: loaded configuration file of dataset
+    """
     dataset_name = config["dataset-name"]
 
-    my_function = __import__(CRAWLER_LIB_PATH + dataset_name + "Crawler",globals(),locals(),['crawl']).crawl
+    my_function = __import__(CRAWLER_LIB_PATH + dataset_name + "Crawler", globals(), locals(), ['crawl']).crawl
     my_function(config)
 
     dataset_name += '/'
 
 
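The __import__ call above resolves to the module DatasetCrawler.<dataset-name>Crawler and pulls out its crawl attribute, so a crawler implementation only has to expose a module-level crawl(config) function. A minimal sketch, using a made-up dataset name "example" and a placeholder body:

    # DatasetCrawler/exampleCrawler.py  ("example" is a hypothetical dataset name)
    def crawl(config):
        dataset_name = config["dataset-name"]
        # fetch the raw files and store them under CrawledData/<dataset-name>/
        ...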
@@ -22,16 +30,22 @@
 def process_data(dataset_name):
+    """
+    Goes trough every not processed file(not contained in CrawledData/"dataset_name"/ignore.txt)
+    Imports dataset processor in DatasetProcessing/"dataset_name"Processor.py
+    Runs processor on every file
+    After successful processing updates ignore.txt
+
+    Args:
+        dataset_name: name of dataset that has existing configuration file
+    """
     dataset_path = dataset_name + '/'
 
-    process_file_func = __import__(PROCESSOR_LIB_PATH + dataset_name + "Processor",globals(),locals(),
+    process_file_func = __import__(PROCESSOR_LIB_PATH + dataset_name + "Processor", globals(), locals(),
                                    ['process_file']).process_file
 
-    # get all not processed files from dataset
     not_processed_files = FolderProcessor.list_of_all_files(CRAWLED_DATA_PATH + dataset_path)
 
-    # process every file
     for not_processed_file in not_processed_files:
-        # call processing for one file in dataset
         process_file_func(CRAWLED_DATA_PATH + dataset_path + not_processed_file)
         FolderProcessor.update_ignore_set(CRAWLED_DATA_PATH + dataset_path, not_processed_file)
 
 
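Analogously, process_data only requires the processor module to expose process_file; it is called once per crawled file with the full path. A minimal sketch, again with a hypothetical dataset name and a placeholder body:

    # DatasetProcessing/exampleProcessor.py  ("example" is a hypothetical dataset name)
    def process_file(file_path):
        # read one crawled file and presumably write its processed form into ProcessedData/<dataset-name>/
        ...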
@@ -38,11 +52,22 @@
 def validate_process_data(config):
+    """
+    Function goes through newly processed data and checks theirs status
+
+    Args:
+        config: loaded configuration file of dataset
+
+    Returns:
+        boolean variable TRUE/FALSE.
+        Data processed correctly - TRUE
+        Wrong format or NEW unknown devices - FALSE
+    """
     processed_devices_set = FolderProcessor.get_devices_set(PROCESSED_DATA_PATH + config["dataset-name"] + '/')
-    unknown_devices_set = FolderProcessor.get_unknown_devices_set(config,processed_devices_set)
+    unknown_devices_set = FolderProcessor.get_unknown_devices_set(config, processed_devices_set)
     unknown_devices_size = len(unknown_devices_set)
 
     if unknown_devices_size != 0:
         print("There is " + str(unknown_devices_size) + " unknown devies")
-        ConfigureFunctions.update_configuration(CONFIG_FILES_PATH + config["dataset-name"] + ".yaml", unknown_devices_set)
+        ConfigureFunctions.update_configuration(config["dataset-name"], unknown_devices_set)
         return False
 
 
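get_unknown_devices_set itself is not part of this revision, but given how its result is used here it presumably amounts to a set difference between the devices found in the processed data and the devices already listed in the configuration, roughly:

    # Illustrative only -- the real helper lives in Utilities/FolderProcessor.py
    unknown_devices_set = processed_devices_set - set(config["devices"])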
@@ -49,5 +74,13 @@
 def load_data_to_database(config):
-
+    """
+    Goes trough every not loaded file(not contained in ProcessedData/ignore.txt)
+    loads data appends coordination from configurations
+    and exports it into the database
+    After successful exporting updates ignore.txt
+
+    Args:
+        config: loaded configuration file of dataset
+    """
     dataset_name = config["dataset-name"]
     dataset_path = dataset_name + '/'
 
@@ -57,10 +90,9 @@
     # load every file
     for not_loaded_file in not_loaded_files:
         # load processed data
-        processed_data = DatabaseLoader.get_data_from_file(PROCESSED_DATA_PATH + dataset_path + not_loaded_file,
-                                                           config["devices"])
+        processed_data = DatabaseLoader.get_data_from_file(not_loaded_file, config)
         # load processed data to database
         DatabaseLoader.load_data_to_database(dataset_name, processed_data)
         FolderProcessor.update_ignore_set(PROCESSED_DATA_PATH + dataset_path, not_loaded_file)
 
 
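The ignore.txt bookkeeping used throughout (list_of_all_files / update_ignore_set) is also outside this revision; it is presumably just a text file of already-handled file names kept next to the data. Conceptually, something along these lines (illustrative sketch only, not the actual Utilities/FolderProcessor.py code):

    import os

    def list_of_all_files(directory):
        # names recorded in ignore.txt have already been handled
        with open(directory + "ignore.txt") as ignore_file:
            ignored = {line.strip() for line in ignore_file}
        return [name for name in os.listdir(directory)
                if name not in ignored and name != "ignore.txt"]

    def update_ignore_set(directory, file_name):
        # remember a successfully handled file
        with open(directory + "ignore.txt", "a") as ignore_file:
            ignore_file.write(file_name + "\n")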
@@ -67,5 +99,14 @@
 def run_full_pipeline(dataset_name):
-    config = ConfigureFunctions.load_configuration(CONFIG_FILES_PATH + dataset_name)
+    """
+    Loads config file and starts full pipeline
+    -crawl data
+    -process data
+    -load data to database
+
+    Args:
+        dataset_name: name of dataset that has existing configuration file
+    """
+    config = ConfigureFunctions.load_configuration(dataset_name)
     crawl_data(config)
     process_data(config["dataset-name"])
 
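With that in place, a full run for one dataset is a single call. The dataset name below is a made-up placeholder; it only works if the matching configuration file, crawler module and processor module exist:

    from Pipeline import run_full_pipeline

    # crawl, process and load the hypothetical dataset "example" in one go
    run_full_pipeline("example")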
Re #7939
- added documentation of methods and classes
- corrected errors in variable names
- added information for the generated scripts