Projekt

Obecné

Profil

« Předchozí | Další » 

Revize 04a2b5a4

Přidáno uživatelem Petr Hlaváč před asi 4 roky(ů)

Re #7939
- pridana dokumentace metod a trid
- korekce chyb ve jmenech promennych
- pridani informaci pro vygenerovane skripty

Zobrazit rozdíly:

python-module/Pipeline.py
1 1
from Utilities import FolderProcessor, ConfigureFunctions
2 2
from Utilities.Database import DatabaseLoader
3 3

  
4

  
5
# Folder with the per-dataset YAML configuration files.
CONFIG_FILES_PATH = "DatasetConfigs/"
# Folder where raw crawled data is stored.
CRAWLED_DATA_PATH = "CrawledData/"
# Folder where processed data is stored.
PROCESSED_DATA_PATH = "ProcessedData/"
# Module path prefix of the dataset crawler implementations.
CRAWLER_LIB_PATH = "DatasetCrawler."
# Module path prefix of the dataset processor implementations.
PROCESSOR_LIB_PATH = "DatasetProcessing."
10 12

  
11 13

  
12 14
def crawl_data(config):
    """
    Import the dataset-specific crawler from DatasetCrawler/"dataset_name"Crawler.py
    and run its crawl() entry point.

    Args:
        config: loaded configuration file of dataset
    """
    dataset_name = config["dataset-name"]

    # Resolve the crawler module dynamically by dataset name; the non-empty
    # fromlist makes __import__ return the leaf module itself rather than
    # the top-level package.
    my_function = __import__(CRAWLER_LIB_PATH + dataset_name + "Crawler", globals(), locals(), ['crawl']).crawl
    my_function(config)
    # Fix: removed trailing `dataset_name += '/'` — the local was mutated and
    # then immediately discarded at function exit (dead code).
20 28

  
21 29

  
22 30
def process_data(dataset_name):
    """
    Run the dataset's processor over every file not yet processed.

    Goes through every file in CrawledData/"dataset_name"/ that is not listed
    in that folder's ignore.txt, imports the dataset processor from
    DatasetProcessing/"dataset_name"Processor.py, runs process_file() on each
    pending file, and records each successfully processed file in ignore.txt.

    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    dataset_path = dataset_name + '/'

    # Dynamically resolve the dataset processor's process_file() entry point.
    processor_module = __import__(PROCESSOR_LIB_PATH + dataset_name + "Processor",
                                  globals(), locals(), ['process_file'])
    process_file_func = processor_module.process_file

    crawled_folder = CRAWLED_DATA_PATH + dataset_path
    for pending_file in FolderProcessor.list_of_all_files(crawled_folder):
        process_file_func(crawled_folder + pending_file)
        # Mark the file as done only after processing succeeded.
        FolderProcessor.update_ignore_set(crawled_folder, pending_file)
36 50

  
37 51

  
38 52
def validate_process_data(config):
    """
    Go through newly processed data and check its status.

    Compares the devices found in ProcessedData/"dataset_name"/ against the
    devices known from the configuration; any unknown devices are written back
    into the dataset configuration for the maintainer to fill in.

    Args:
        config: loaded configuration file of dataset

    Returns:
        True  - data processed correctly (no unknown devices)
        False - wrong format or NEW unknown devices found
    """
    processed_devices_set = FolderProcessor.get_devices_set(PROCESSED_DATA_PATH + config["dataset-name"] + '/')
    unknown_devices_set = FolderProcessor.get_unknown_devices_set(config, processed_devices_set)
    unknown_devices_size = len(unknown_devices_set)

    if unknown_devices_size != 0:
        # Fix: message typo "devies" -> "devices".
        print("There is " + str(unknown_devices_size) + " unknown devices")
        ConfigureFunctions.update_configuration(config["dataset-name"], unknown_devices_set)
        return False

    # Fix: the original fell through and implicitly returned None on success,
    # contradicting the documented TRUE/FALSE contract.
    return True
47 72

  
48 73

  
49 74
def load_data_to_database(config):
50

  
75
    """
76
    Goes trough every not loaded file(not contained in ProcessedData/ignore.txt)
77
    loads data appends coordination from configurations
78
    and exports it into the database
79
    After successful exporting updates ignore.txt
80

  
81
    Args:
82
        config: loaded configuration file of dataset
83
    """
51 84
    dataset_name = config["dataset-name"]
52 85
    dataset_path = dataset_name + '/'
53 86

  
......
57 90
    # load every file
58 91
    for not_loaded_file in not_loaded_files:
59 92
        # load processed data
60
        processed_data = DatabaseLoader.get_data_from_file(PROCESSED_DATA_PATH + dataset_path + not_loaded_file,
61
                                                           config["devices"])
93
        processed_data = DatabaseLoader.get_data_from_file(not_loaded_file, config)
62 94
        # load processed data to database
63 95
        DatabaseLoader.load_data_to_database(dataset_name, processed_data)
64 96
        FolderProcessor.update_ignore_set(PROCESSED_DATA_PATH + dataset_path, not_loaded_file)
65 97

  
66 98

  
67 99
def run_full_pipeline(dataset_name):
68
    config = ConfigureFunctions.load_configuration(CONFIG_FILES_PATH + dataset_name)
100
    """
101
    Loads config file and starts full pipeline
102
    -crawl data
103
    -process data
104
    -load data to database
105

  
106
    Args:
107
        dataset_name: name of dataset that has existing configuration file
108
    """
109
    config = ConfigureFunctions.load_configuration(dataset_name)
69 110
    crawl_data(config)
70 111
    process_data(config["dataset-name"])
71 112

  

Také k dispozici: Unified diff