Revize 0a2832fb
Přidáno uživatelem Jakub Vašta před asi 4 roky(ů)
modules/crawler/Pipeline.py | ||
---|---|---|
152 | 152 |
# get all unprocessed files from dataset |
153 | 153 |
not_loaded_files = FolderProcessor.list_of_all_files(PROCESSED_DATA_PATH + dataset_path) |
154 | 154 |
|
155 |
database_connection = DatabaseLoader.create_database_connection() |
|
156 |
|
|
157 |
DatabaseLoader.check_or_update_datasets_collection(database_connection,config) |
|
158 |
|
|
159 |
DatabaseLoader.update_devices_collection(database_connection,config) |
|
160 |
|
|
161 |
|
|
162 |
# load every file |
|
163 |
for not_loaded_file in not_loaded_files: |
|
164 |
#check if file is not in database already if it is skip |
|
165 |
test = DatabaseLoader.check_if_database_doesnt_contain_file(database_connection,dataset_name,not_loaded_file) |
|
166 |
if test == False: |
|
167 |
logging.info(dataset_name + " could not load " + not_loaded_file + " to database because he is already there.") |
|
168 |
FolderProcessor.update_ignore_set(PROCESSED_DATA_PATH + dataset_path, not_loaded_file) |
|
169 |
continue |
|
170 |
# load processed data |
|
171 |
processed_data = DatabaseLoader.get_data_from_file(not_loaded_file, config) |
|
172 |
# load processed data to database |
|
173 |
DatabaseLoader.load_data_to_database(database_connection, dataset_name, processed_data, not_loaded_file) |
|
174 |
FolderProcessor.update_ignore_set(PROCESSED_DATA_PATH + dataset_path, not_loaded_file) |
|
175 |
|
|
176 |
logging.info(dataset_name + " has loaded to database " + str(len(not_loaded_files)) + " newly processed files.") |
|
177 |
|
|
178 |
|
|
179 |
def load_data_to_database_crone(config): |
|
180 |
""" |
|
181 |
Goes trough every not loaded file(not contained in ProcessedData/ignore.txt) |
|
182 |
loads data appends coordination from configurations |
|
183 |
and exports it into the database |
|
184 |
After successful exporting updates ignore.txt |
|
185 |
|
|
186 |
Args: |
|
187 |
config: loaded configuration file of dataset |
|
188 |
""" |
|
189 |
dataset_name = config["dataset-name"] |
|
190 |
dataset_path = dataset_name + '/' |
|
191 |
|
|
192 |
# get all unprocessed files from dataset |
|
193 |
not_loaded_files = FolderProcessor.list_of_all_files(PROCESSED_DATA_PATH + dataset_path) |
|
194 |
|
|
195 |
database_connection = DatabaseLoader.create_database_connection() |
|
196 |
|
|
155 | 197 |
# load every file |
156 | 198 |
for not_loaded_file in not_loaded_files: |
157 | 199 |
# load processed data |
158 | 200 |
processed_data = DatabaseLoader.get_data_from_file(not_loaded_file, config) |
159 | 201 |
# load processed data to database |
160 |
DatabaseLoader.load_data_to_database(dataset_name, processed_data)
|
|
202 |
DatabaseLoader.load_data_to_database(database_connection, dataset_name, processed_data, not_loaded_file)
|
|
161 | 203 |
FolderProcessor.update_ignore_set(PROCESSED_DATA_PATH + dataset_path, not_loaded_file) |
162 | 204 |
|
163 |
logging.info(dataset_name + " has loaded to databse " + str(len(not_loaded_files)) + " newly processed files.") |
|
205 |
logging.info(dataset_name + " has loaded to database " + str(len(not_loaded_files)) + " newly processed files.")
|
|
164 | 206 |
|
165 | 207 |
|
166 | 208 |
def run_full_pipeline(dataset_name): |
... | ... | |
185 | 227 |
load_data_to_database(config) |
186 | 228 |
|
187 | 229 |
|
188 |
|
|
189 | 230 |
def run_full_pipeline_crone(dataset_name): |
190 | 231 |
""" |
191 | 232 |
Loads config file and starts full pipeline |
... | ... | |
207 | 248 |
validation_test = validate_process_data(config) |
208 | 249 |
|
209 | 250 |
if validation_test: |
210 |
load_data_to_database(config) |
|
251 |
load_data_to_database_crone(config)
|
|
211 | 252 |
|
Také k dispozici: Unified diff
Re #8089
- přidána nová proměnná do konfiguračního souboru
- předělána tabulka v databázi s názvem DATASETS
- nově obsahuje "key-name" jako klíč a "display-name" pro zobrazení