
Revision 34baf808

Added by Petr Hlaváč about 4 years ago

Re #7965
Implement a script for cron that runs the dataset update according to the config.
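The revision itself only adds the update gate and the run_full_pipeline_crone entry point. As a rough illustration of how a cron job could drive it, here is a minimal, hypothetical wrapper; the UpdateDatasets.py name, the DatasetConfigs/ path and the schedule are assumptions, not part of this commit:

# Hypothetical wrapper script (e.g. modules/crawler/UpdateDatasets.py), run
# from the modules/crawler directory so that the Pipeline import resolves.
# An assumed crontab entry running it daily at 03:00 might look like:
#   0 3 * * * cd /path/to/modules/crawler && python3 UpdateDatasets.py
import os

import Pipeline

CONFIG_FILES_PATH = "DatasetConfigs/"  # assumed location of per-dataset configs

for config_file in os.listdir(CONFIG_FILES_PATH):
    # Derive the dataset name from the config file name, e.g. "foo.yaml" -> "foo"
    dataset_name = config_file.split(".")[0]
    Pipeline.run_full_pipeline_crone(dataset_name)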

View differences:

modules/crawler/Pipeline.py
CRAWLED_DATA_PATH = "CrawledData/"
# Path to processed data
PROCESSED_DATA_PATH = "ProcessedData/"
# Path to crawler logs
CRAWLER_LOGS_PATH = "CrawlerLogs/"
# Path to dataset crawler implementations
CRAWLER_LIB_PATH = "DatasetCrawler."
# Path to dataset processor implementations
PROCESSOR_LIB_PATH = "DatasetProcessing."

def check_last_update(config):
    """
    Loads an integer from updated.txt in CrawlerLogs/"dataset_name"
    representing the number of days since the last update. If the number
    has reached the update period in the config, resets the counter to
    zero; otherwise increments it.

    Arguments:
        config: loaded configuration file of the dataset

    Returns:
        True if the dataset should be updated now,
        False if only the days-since-last-update counter was incremented
    """
    dataset_name = config["dataset-name"]

    with open(CRAWLER_LOGS_PATH + dataset_name + "/" + "updated.txt", "r+") as file:
        last_update = int(file.read())
        file.seek(0)

        config_update_period = int(config["update-period"])

        # Compare the parsed integer period, not the raw config string
        if config_update_period <= last_update:
            print("Dataset " + dataset_name + " is being updated")
            file.write("0")
            file.truncate()
            return True
        else:
            last_update_days = last_update + 1
            print("Dataset " + dataset_name + " will be updated in "
                  + str(config_update_period - last_update_days) + " days")
            file.write(str(last_update_days))
            file.truncate()
            return False
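To illustrate the counter mechanics, a minimal sketch; the dataset name, the period and the seeded counter value are made up for the example, which assumes the constants and function above are in scope:

import os

config = {"dataset-name": "example-dataset", "update-period": "7"}

# Seed the log file as if 6 days had passed since the last update.
os.makedirs(CRAWLER_LOGS_PATH + config["dataset-name"], exist_ok=True)
with open(CRAWLER_LOGS_PATH + config["dataset-name"] + "/updated.txt", "w") as file:
    file.write("6")

check_last_update(config)  # 7 <= 6 is false -> counter becomes 7, returns False
check_last_update(config)  # 7 <= 7 -> counter reset to "0", returns True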

  
def crawl_data(config):
    """
    Imports the dataset crawler in DatasetCrawler/"dataset_name"Crawler.py
......
    if validation_test:
        load_data_to_database(config)
        print("Dataset " + dataset_name + " has been successfully updated\n")

def run_full_pipeline_crone(dataset_name):
    """
    Loads the config file and starts the full pipeline:
    - crawl data
    - process data
    - load data to database

    Args:
        dataset_name: name of a dataset that has an existing configuration file
    """
    config = ConfigureFunctions.load_configuration(dataset_name)
    update_test = check_last_update(config)
    if update_test:
        crawl_data(config)
        process_data(config["dataset-name"])

        validation_test = validate_process_data(config)

        if validation_test:
            load_data_to_database(config)
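Outside of cron, the new entry point can also be exercised by hand; the dataset name below is a placeholder for any dataset with an existing configuration file:

# Hypothetical manual run; crawling, processing and loading only happen
# when check_last_update() decides the update period has elapsed.
run_full_pipeline_crone("example-dataset")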

Also available: Unified diff