Projekt

Obecné

Profil

Stáhnout (2.19 KB) Statistiky
| Větev: | Revize:
1 c8f3051b petrh
import importlib

from Utilities import FolderProcessor, ConfigureLoader
from Utilities.Database import DatabaseLoader
3
4
5
# Folder prefix prepended to a dataset name when its configuration is loaded.
CONFIG_FILES_PATH = "DatasetConfigs/"
# Root folder whose per-dataset subfolders are listed by the processing step.
CRAWLED_DATA_PATH = "CrawledData/"
# Root folder whose per-dataset subfolders are listed by the database loader.
PROCESSED_DATA_PATH = "ProcessedData/"
# Module-name prefix of the dynamically imported "<dataset>Crawler" modules.
CRAWLER_LIB_PATH = "DatasetCrawler."
# Module-name prefix of the dynamically imported "<dataset>Processor" modules.
PROCESSOR_LIB_PATH = "DatasetProcessing."
10
11
12
def crawl_data(config):
    """Run the crawler module that belongs to the configured dataset.

    The module name is built as ``CRAWLER_LIB_PATH + <dataset-name> +
    "Crawler"``, imported dynamically, and its ``crawl`` function is called
    with the whole configuration.

    Args:
        config: Configuration mapping; must provide ``"dataset-name"``.
            It is passed unchanged to the crawler's ``crawl`` function.

    Raises:
        ImportError: If the crawler module cannot be imported.
        AttributeError: If the imported module has no ``crawl`` attribute.
    """
    dataset_name = config["dataset-name"]

    # import_module returns the named submodule itself, which is what the
    # former __import__ call with a non-empty fromlist returned as well.
    crawler_module = importlib.import_module(
        CRAWLER_LIB_PATH + dataset_name + "Crawler")
    crawler_module.crawl(config)
20
21
22
def process_data(dataset_name):
    """Process every listed file in the dataset's crawled-data folder.

    The processor module ``PROCESSOR_LIB_PATH + dataset_name + "Processor"``
    is imported dynamically and its ``process_file`` function is called once
    per file returned by ``FolderProcessor.list_of_all_files``.

    Args:
        dataset_name: Name of the dataset; used both as the subfolder of
            ``CRAWLED_DATA_PATH`` and as the stem of the processor module.

    Raises:
        ImportError: If the processor module cannot be imported.
        AttributeError: If the imported module has no ``process_file``.
    """
    # Folder of this dataset's crawled files (trailing slash included so
    # file names can be appended directly).
    crawled_dir = CRAWLED_DATA_PATH + dataset_name + '/'

    processor_module = importlib.import_module(
        PROCESSOR_LIB_PATH + dataset_name + "Processor")
    process_file_func = processor_module.process_file

    # Files not processed yet, as reported by FolderProcessor.
    not_processed_files = FolderProcessor.list_of_all_files(crawled_dir)

    for not_processed_file in not_processed_files:
        # Process one file of the dataset.
        process_file_func(crawled_dir + not_processed_file)
        # Register the file in the folder's ignore set -- presumably so it is
        # not listed again on the next run; verify in FolderProcessor.
        FolderProcessor.update_ignore_set(crawled_dir, not_processed_file)
36
37
38
def load_data_to_database(config):
    """Load every listed processed file of the configured dataset into the database.

    For each file returned by ``FolderProcessor.list_of_all_files`` the data
    is read with ``DatabaseLoader.get_data_from_file`` and stored with
    ``DatabaseLoader.load_data_to_database``.

    Args:
        config: Configuration mapping; must provide ``"dataset-name"`` and,
            when there is at least one file to load, ``"devices"``.
    """
    dataset_name = config["dataset-name"]
    # Folder of this dataset's processed files (trailing slash included so
    # file names can be appended directly).
    processed_dir = PROCESSED_DATA_PATH + dataset_name + '/'

    # Files not loaded yet, as reported by FolderProcessor.
    not_loaded_files = FolderProcessor.list_of_all_files(processed_dir)

    for not_loaded_file in not_loaded_files:
        # Read the processed data of one file; "devices" is looked up per
        # file so an empty folder never touches that key.
        processed_data = DatabaseLoader.get_data_from_file(
            processed_dir + not_loaded_file, config["devices"])
        # Store the data under the dataset's name.
        DatabaseLoader.load_data_to_database(dataset_name, processed_data)
        # Register the file in the folder's ignore set -- presumably so it is
        # not listed again on the next run; verify in FolderProcessor.
        FolderProcessor.update_ignore_set(processed_dir, not_loaded_file)
54
55
56
def run_full_pipeline(dataset_name):
    """Crawl, process and load one dataset, in that order.

    Args:
        dataset_name: Name appended to ``CONFIG_FILES_PATH`` to locate the
            configuration handed to ``ConfigureLoader.load_configuration``.
    """
    config = ConfigureLoader.load_configuration(CONFIG_FILES_PATH + dataset_name)
    crawl_data(config)
    # Processing is keyed by the name stored in the configuration, not by
    # the argument used to find the configuration file.
    process_data(config["dataset-name"])
    load_data_to_database(config)