Projekt

Obecné

Profil

« Předchozí | Další » 

Revize 0a2832fb

Přidáno uživatelem Jakub Vašta asi před 4 lety

Re #8089
- přidána nová proměnná do konfiguračního souboru
- předělána tabulka v databázi s názvem DATASETS
- nově obsahuje "key-name" jako klíč a "display-name" pro zobrazení

Zobrazit rozdíly:

modules/crawler/Pipeline.py
152 152
    # get all unprocessed files from dataset
153 153
    not_loaded_files = FolderProcessor.list_of_all_files(PROCESSED_DATA_PATH + dataset_path)
154 154

  
155
    database_connection = DatabaseLoader.create_database_connection()
156

  
157
    DatabaseLoader.check_or_update_datasets_collection(database_connection,config)
158

  
159
    DatabaseLoader.update_devices_collection(database_connection,config)
160

  
161

  
162
    # load every file
163
    for not_loaded_file in not_loaded_files:
164
        #check if file is not in database already if it is skip
165
        test = DatabaseLoader.check_if_database_doesnt_contain_file(database_connection,dataset_name,not_loaded_file)
166
        if test == False:
167
            logging.info(dataset_name + " could not load " + not_loaded_file + " to database because he is already there.")
168
            FolderProcessor.update_ignore_set(PROCESSED_DATA_PATH + dataset_path, not_loaded_file)
169
            continue
170
        # load processed data
171
        processed_data = DatabaseLoader.get_data_from_file(not_loaded_file, config)
172
        # load processed data to database
173
        DatabaseLoader.load_data_to_database(database_connection, dataset_name, processed_data, not_loaded_file)
174
        FolderProcessor.update_ignore_set(PROCESSED_DATA_PATH + dataset_path, not_loaded_file)
175

  
176
    logging.info(dataset_name + " has loaded to database " + str(len(not_loaded_files)) + " newly processed files.")
177

  
178

  
179
def load_data_to_database_crone(config):
180
    """
181
    Goes trough every not loaded file(not contained in ProcessedData/ignore.txt)
182
    loads data appends coordination from configurations
183
    and exports it into the database
184
    After successful exporting updates ignore.txt
185

  
186
    Args:
187
        config: loaded configuration file of dataset
188
    """
189
    dataset_name = config["dataset-name"]
190
    dataset_path = dataset_name + '/'
191

  
192
    # get all unprocessed files from dataset
193
    not_loaded_files = FolderProcessor.list_of_all_files(PROCESSED_DATA_PATH + dataset_path)
194

  
195
    database_connection = DatabaseLoader.create_database_connection()
196

  
155 197
    # load every file
156 198
    for not_loaded_file in not_loaded_files:
157 199
        # load processed data
158 200
        processed_data = DatabaseLoader.get_data_from_file(not_loaded_file, config)
159 201
        # load processed data to database
160
        DatabaseLoader.load_data_to_database(dataset_name, processed_data)
202
        DatabaseLoader.load_data_to_database(database_connection, dataset_name, processed_data, not_loaded_file)
161 203
        FolderProcessor.update_ignore_set(PROCESSED_DATA_PATH + dataset_path, not_loaded_file)
162 204

  
163
    logging.info(dataset_name + " has loaded to databse " + str(len(not_loaded_files)) + " newly processed files.")
205
    logging.info(dataset_name + " has loaded to database " + str(len(not_loaded_files)) + " newly processed files.")
164 206

  
165 207

  
166 208
def run_full_pipeline(dataset_name):
......
185 227
        load_data_to_database(config)
186 228

  
187 229

  
188

  
189 230
def run_full_pipeline_crone(dataset_name):
190 231
    """
191 232
    Loads config file and starts full pipeline
......
207 248
        validation_test = validate_process_data(config)
208 249

  
209 250
        if validation_test:
210
            load_data_to_database(config)
251
            load_data_to_database_crone(config)
211 252
            

Také k dispozici: Unified diff