Revision 43697fec

Added by Petr Hlaváč about 4 years ago

Re #7937
Added logging to the CrawlerLogs/CommonRecords folder
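
This revision routes the pipeline's progress messages through Python's standard logging module into a monthly log file under CrawlerLogs/CommonRecords/. A minimal standalone sketch of the same pattern follows; the value of CRAWLER_LOGS_PATH is assumed to be "CrawlerLogs/" (the constant is defined outside the hunks shown below), and the makedirs call is added only so the snippet runs on its own.

import logging
import os
from datetime import date

# Assumed value; the real constant is defined in Pipeline.py outside the visible hunks.
CRAWLER_LOGS_PATH = "CrawlerLogs/"

# Created here only so the sketch runs standalone; the pipeline expects the folder to exist.
os.makedirs(CRAWLER_LOGS_PATH + "CommonRecords/", exist_ok=True)

# One file per month, e.g. CrawlerLogs/CommonRecords/Applicationlog-Jan-2021.log
logging.basicConfig(filename=CRAWLER_LOGS_PATH + "CommonRecords/" + 'Applicationlog-' + date.today().strftime("%b-%Y") + '.log',
                    level=logging.INFO,
                    format='%(asctime)s %(message)s')

# Example record in the style the new calls produce (the dataset name is a placeholder here).
logging.info("Dataset example-dataset is being updated today")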

View differences:

modules/crawler/Pipeline.py
 from Utilities import FolderProcessor, ConfigureFunctions
 from Utilities.Database import DatabaseLoader

+import logging
+from datetime import date
+
+
 # Path to crawled data
 CRAWLED_DATA_PATH = "CrawledData/"
 # Path to processed data
......
 PROCESSOR_LIB_PATH = "DatasetProcessing."


+#logger
+logging.basicConfig(filename=CRAWLER_LOGS_PATH + "CommonRecords/" + 'Applicationlog-' + date.today().strftime("%b-%Y") + '.log',
+                    level=logging.INFO,
+                    format='%(asctime)s %(message)s'
+                    )
+
+
 def check_last_update(config):
     """
     Loads integer from updated.txt in CrawlerLogs/"dataset_name"
......
         confing_update_period = int(config["update-period"])

         if config["update-period"] <= last_update:
-            print("Dataset " + dataset_name + " is being updated")
+            logging.info("Dataset " + dataset_name + " is being updated today")
             file.write("0")
             file.truncate()
             return True
         else:
             last_update_days = last_update + 1
-            print("Dataset " + dataset_name + " will be updated in " + str(confing_update_period - last_update_days) + "days")
+            logging.info("Dataset " + dataset_name + " will be updated in " + str(confing_update_period - last_update_days) + "days")
             file.write(str(last_update_days))
             file.truncate()
             return False
......
     """
     dataset_name = config["dataset-name"]

-    my_function = __import__(CRAWLER_LIB_PATH + dataset_name + "Crawler", globals(), locals(), ['crawl']).crawl
-    my_function(config)
+    crawl_func = __import__(CRAWLER_LIB_PATH + dataset_name + "Crawler", globals(), locals(), ['crawl']).crawl
+    crawl_func(config)

     dataset_name += '/'

......
                                    ['process_file']).process_file

     not_processed_files = FolderProcessor.list_of_all_files(CRAWLED_DATA_PATH + dataset_path)
+    logging.info(dataset_name + " has downloaded " + str(len(not_processed_files)) + " new files")

     for not_processed_file in not_processed_files:
         process_file_func(CRAWLED_DATA_PATH + dataset_path + not_processed_file)
         FolderProcessor.update_ignore_set(CRAWLED_DATA_PATH + dataset_path, not_processed_file)

+    logging.info(dataset_name + " has processed " + str(len(not_processed_files)) + " newly crawled files")
+

 def validate_process_data(config):
     """
......
     unknown_devices_size = len(unknown_devices_set)

     if unknown_devices_size != 0:
-        print("There is " + str(unknown_devices_size) + " unknown devices\n")
-        print("Adding devices to " + config["dataset-name"] + " config file\n")
+        logging.info("There is " + str(unknown_devices_size) + " unknown devices")
+        logging.info("Adding devices to " + config["dataset-name"] + " config file")
         ConfigureFunctions.update_configuration(config["dataset-name"], unknown_devices_set)
         return False

     for device in config["devices"]:
         device = config["devices"][device]
         if device["x"] == "UNKNOWN!" or device["y"] == "UNKNOWN!":
-            print(config["dataset-name"] + " Config file contains devices with UNKOWN! values please update them\n")
+            logging.info(config["dataset-name"] + " config file contains devices with UNKOWN! values please update them!!")
             return False

     return True
......
         DatabaseLoader.load_data_to_database(dataset_name, processed_data)
         FolderProcessor.update_ignore_set(PROCESSED_DATA_PATH + dataset_path, not_loaded_file)

+    logging.info(dataset_name + " has loaded to databse " + str(len(not_loaded_files)) + " newly processed files.")
+

 def run_full_pipeline(dataset_name):
     """
......
     Args:
         dataset_name: name of dataset that has existing configuration file
     """
+    logging.info("Starting pipeline for dataset " + dataset_name)
+
     config = ConfigureFunctions.load_configuration(dataset_name)
     crawl_data(config)
     process_data(config["dataset-name"])
......

     if validation_test:
         load_data_to_database(config)
-        print("Dataset " + dataset_name + " has been sucessfully updated\n")



......
     Args:
         dataset_name: name of dataset that has existing configuration file
     """
+    logging.info("Starting pipeline for dataset " + dataset_name)
+
     config = ConfigureFunctions.load_configuration(dataset_name)
     update_test = check_last_update(config)
     if update_test:
......
         validation_test = validate_process_data(config)

         if validation_test:
-            load_data_to_database(config)
+            load_data_to_database(config)
+
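
Beyond the new logging calls, the only other change in this revision is renaming the dynamically imported crawler entry point from my_function to crawl_func in crawl_data(). The __import__(CRAWLER_LIB_PATH + dataset_name + "Crawler", ...).crawl idiom resolves a module by its string name at runtime; a small sketch of the same technique using importlib, demonstrated with a standard-library module so it runs on its own (in Pipeline.py the module and attribute names come from the dataset configuration instead):

import importlib

# In Pipeline.py the module name is CRAWLER_LIB_PATH + dataset_name + "Crawler" and the
# attribute is 'crawl'; a standard-library module stands in here so the sketch is runnable.
module_name = "json"
attr_name = "dumps"

crawl_func = getattr(importlib.import_module(module_name), attr_name)
print(crawl_func({"demo": True}))  # here this calls json.dumps; the pipeline would call crawl(config)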
