Revision 34baf808
Added by Petr Hlaváč almost 5 years ago

modules/crawler/CrawledData/JIS/ignore.txt

    ignore.txt
    OD_ZCU_JIS_10_2019.CSV
    OD_ZCU_JIS_03_2020.CSV
    OD_ZCU_JIS_02_2020.CSV
    OD_ZCU_JIS_00_2019.CSV
    OD_ZCU_JIS_08_2019.CSV
    OD_ZCU_JIS_12_2019.CSV
    OD_ZCU_JIS_09_2019.CSV
    OD_ZCU_JIS_01_2020.CSV
    OD_ZCU_JIS_06_2019.CSV
    OD_ZCU_JIS_11_2019.CSV
    OD_ZCU_JIS_07_2019.CSV

modules/crawler/CrawledData/KOLOBEZKY/ignore.txt

    ignore.txt
    OD_ZCU_KOLOBEZKY_00_2019.CSV
    OD_ZCU_KOLOBEZKY_06_2019.CSV
    OD_ZCU_KOLOBEZKY_07_2019.CSV
    OD_ZCU_KOLOBEZKY_08_2019.CSV

modules/crawler/CrawledData/WIFI/ignore.txt

    ignore.txt
    OD_ZCU_WIFI_07_2019.CSV
    OD_ZCU_WIFI_00_2019.CSV
    OD_ZCU_WIFI_06_2019.CSV
    OD_ZCU_WIFI_08_2019.CSV

modules/crawler/CrawlerLogs/JIS/ignore.txt

    https://openstore.zcu.cz/OD_ZCU_00_2019/OD_ZCU_JIS_00_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_01_2020/OD_ZCU_JIS_01_2020_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_02_2020/OD_ZCU_JIS_02_2020_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_03_2020/OD_ZCU_JIS_03_2020_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_06_2019/OD_ZCU_JIS_06_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_07_2019/OD_ZCU_JIS_07_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_08_2019/OD_ZCU_JIS_08_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_09_2019/OD_ZCU_JIS_09_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_10_2019/OD_ZCU_JIS_10_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_11_2019/OD_ZCU_JIS_11_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_12_2019/OD_ZCU_JIS_12_2019_CSV.zip

modules/crawler/CrawlerLogs/JIS/updated.txt

    0

modules/crawler/CrawlerLogs/KOLOBEZKY/ignore.txt

    https://openstore.zcu.cz/OD_ZCU_00_2019/OD_ZCU_KOLOBEZKY_00_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_06_2019/OD_ZCU_KOLOBEZKY_06_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_07_2019/OD_ZCU_KOLOBEZKY_07_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_08_2019/OD_ZCU_KOLOBEZKY_08_2019_CSV.zip

modules/crawler/CrawlerLogs/KOLOBEZKY/updated.txt

    0

modules/crawler/CrawlerLogs/WIFI/ignore.txt

    https://openstore.zcu.cz/OD_ZCU_00_2019/OD_ZCU_WIFI_00_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_06_2019/OD_ZCU_WIFI_06_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_07_2019/OD_ZCU_WIFI_07_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_08_2019/OD_ZCU_WIFI_08_2019_CSV.zip

modules/crawler/CrawlerLogs/WIFI/updated.txt

    0

modules/crawler/CroneUpdateScript.py

    import Pipeline
    import os

    # Path to configuration files
    CONFIG_FILES_PATH = "DatasetConfigs/"


    def run_pipeline_for_all_datasets():
        """
        Runs the whole DataScript pipeline for every dataset that has an existing configuration file
        """
        files_in_dir = os.listdir(CONFIG_FILES_PATH)

        for file in files_in_dir:
            name = file.split('.')
            Pipeline.run_full_pipeline_crone(name[0])


    run_pipeline_for_all_datasets()

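CroneUpdateScript.py is meant to be invoked once a day by the scheduler (for example a crontab entry such as 0 3 * * * cd modules/crawler && python3 CroneUpdateScript.py; the exact schedule is an assumption and not part of this commit), so that check_last_update in Pipeline.py can count whole days between updates. A small illustrative sketch of how a config filename is turned into the dataset name passed to run_full_pipeline_crone; the filename and its extension are assumptions:

    # Hypothetical example of the name derivation used in the loop above.
    file = "JIS.yaml"                  # assumed name of a file in DatasetConfigs/
    dataset_name = file.split('.')[0]  # -> "JIS", passed to run_full_pipeline_crone
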
modules/crawler/ForceUpdateDataset.py

    import Pipeline
    import os

    print("Enter the name of the dataset you want to update:\n")
    Pipeline.run_full_pipeline(input())

modules/crawler/ForceUpdateDatasets.py

    import Pipeline
    import os

    # Path to configuration files
    CONFIG_FILES_PATH = "DatasetConfigs/"


    def run_pipeline_for_all_datasets():
        """
        Runs the whole DataScript pipeline for every dataset that has an existing configuration file
        """
        files_in_dir = os.listdir(CONFIG_FILES_PATH)

        for file in files_in_dir:
            name = file.split('.')
            Pipeline.run_full_pipeline(name[0])


    run_pipeline_for_all_datasets()

modules/crawler/Pipeline.py

     CRAWLED_DATA_PATH = "CrawledData/"
     # Path to processed data
     PROCESSED_DATA_PATH = "ProcessedData/"
    +# Path to crawler logs
    +CRAWLER_LOGS_PATH = "CrawlerLogs/"
     # Path to dataset crawler implementations
     CRAWLER_LIB_PATH = "DatasetCrawler."
     # Path to dataset processor implementations
     PROCESSOR_LIB_PATH = "DatasetProcessing."


    +def check_last_update(config):
    +    """
    +    Loads the integer from updated.txt in CrawlerLogs/"dataset_name",
    +    i.e. the number of days since the last update. If it has reached the
    +    update period given in the config, the counter is reset to zero and
    +    the dataset is updated; otherwise the counter is incremented.
    +
    +    Arguments:
    +        config: loaded configuration file of the dataset
    +
    +    Returns:
    +        True if the dataset is being updated,
    +        False if only the day counter was incremented
    +    """
    +    dataset_name = config["dataset-name"]
    +
    +    with open(CRAWLER_LOGS_PATH + dataset_name + "/" + "updated.txt", "r+") as file:
    +        last_update = file.read()
    +        last_update = int(last_update)
    +        file.seek(0)
    +
    +        config_update_period = int(config["update-period"])
    +
    +        if config_update_period <= last_update:
    +            print("Dataset " + dataset_name + " is being updated")
    +            file.write("0")
    +            file.truncate()
    +            return True
    +        else:
    +            last_update_days = last_update + 1
    +            print("Dataset " + dataset_name + " will be updated in " + str(config_update_period - last_update_days) + " days")
    +            file.write(str(last_update_days))
    +            file.truncate()
    +            return False
    +
    +
     def crawl_data(config):
         """
         Imports dataset crawler in DatasetCrawler/"dataset_name"Crawler.py
    ...

         if validation_test:
             load_data_to_database(config)
    +        print("Dataset " + dataset_name + " has been successfully updated\n")
    +
    +
    +def run_full_pipeline_crone(dataset_name):
    +    """
    +    Loads the config file and starts the full pipeline:
    +    - crawl data
    +    - process data
    +    - load data to database
    +
    +    Args:
    +        dataset_name: name of a dataset that has an existing configuration file
    +    """
    +    config = ConfigureFunctions.load_configuration(dataset_name)
    +    update_test = check_last_update(config)
    +    if update_test:
    +        crawl_data(config)
    +        process_data(config["dataset-name"])
    +
    +        validation_test = validate_process_data(config)

    +        if validation_test:
    +            load_data_to_database(config)

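A small illustrative sketch (not part of the commit) of the day counter that check_last_update keeps in CrawlerLogs/"dataset_name"/updated.txt; the period of 7 days and the current counter value are made-up numbers:

    # Illustrative only: semantics of the counter stored in updated.txt.
    update_period = 7        # assumed value of config["update-period"]
    days_since_update = 6    # assumed current content of updated.txt

    if update_period <= days_since_update:
        days_since_update = 0    # the full pipeline runs and the counter is reset
    else:
        days_since_update += 1   # no update today, just count another day
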
modules/crawler/PrepareNewDataset.py

             if text is not None:
                 file.write(text + "\n")

    +def create_updated_file(path):
    +    """
    +    Creates updated file
    +    Args:
    +        path: path to directory for creating updated.txt
    +    """
    +    with open(path + "/updated.txt", "w") as file:
    +        file.write(str(0) + "\n")
    +

     def prepare_dataset_structure(dataset_name):
         """
    ...
         Args:
             dataset_name: Name of newly created dataset
         """
    -    jump_folder = "../"

         # create folder for crawled data
         try:
    ...
             path = CRAWLER_LOGS_PATH + dataset_name
             os.mkdir(path)
             create_ignore_file(path, None)
    +        create_updated_file(path)
         except OSError:
             print("Creation of the directory %s failed" % path)

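For orientation, a standalone sketch (not part of the diff) of what create_updated_file writes for a freshly prepared dataset; the JIS path is only an example chosen to match the CrawlerLogs/ files added in this commit:

    import os

    path = "CrawlerLogs/JIS"                      # example dataset log directory
    os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, "updated.txt"), "w") as file:
        file.write("0\n")                         # zero days since the last update
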
modules/crawler/ProcessedData/JIS/ignore.txt

    ignore.txt

modules/crawler/ProcessedData/KOLOBEZKY/ignore.txt

    ignore.txt
    OD_ZCU_KOLOBEZKY_08_2019.CSV
    OD_ZCU_KOLOBEZKY_00_2019.CSV
    OD_ZCU_KOLOBEZKY_07_2019.CSV
    OD_ZCU_KOLOBEZKY_06_2019.CSV

modules/crawler/ProcessedData/WIFI/ignore.txt

    ignore.txt

modules/crawler/ResetDataset.py

     from Utilities import FolderProcessor

     # Path to crawled data
    -CRAWLED_DATA_PATH = "../CrawledData/"
    +CRAWLED_DATA_PATH = "CrawledData/"
     # Path to processed data
    -PROCESSED_DATA_PATH = "../ProcessedData/"
    +PROCESSED_DATA_PATH = "ProcessedData/"
     # Path to crawler logs
    -CRAWLER_LOGS_PATH = "../CrawlerLogs/"
    +CRAWLER_LOGS_PATH = "CrawlerLogs/"
     # Path to dataset configuration files
    -CONFIG_FILES_PATH = "../DatasetConfigs"
    +CONFIG_FILES_PATH = "DatasetConfigs"


     def create_ignore_file(path, text):
    ...
                 file.write(text + "\n")


    +def create_updated_file(path):
    +    """
    +    Creates updated file
    +    Args:
    +        path: path to directory for creating updated.txt
    +    """
    +    with open(path + "/updated.txt", "w") as file:
    +        file.write(str(0) + "\n")
    +
    +
     def reset_dataset(dataset_name):
         """
         Resets all saved data in dataset except config and implementation
    ...
         path = CRAWLER_LOGS_PATH + dataset_name + "/"
         FolderProcessor.clean_folder(path)
         create_ignore_file(path, None)
    +    create_updated_file(path)


     def reset_all_datasets():

modules/crawler/main.py

    import Pipeline
    import os

    # Path to configuration files
    CONFIG_FILES_PATH = "DatasetConfigs/"


    def run_pipeline_for_all_datasets():
        """
        Runs the whole DataScript pipeline for every dataset that has an existing configuration file
        """
        files_in_dir = os.listdir(CONFIG_FILES_PATH)

        for file in files_in_dir:
            name = file.split('.')
            Pipeline.run_full_pipeline(name[0])


    def run_pipeline_for_one_dataset(dataset_name):
        """
        Runs the whole DataScript pipeline for only one dataset

        Args:
            dataset_name: name of a dataset that has an existing configuration file
        """
        Pipeline.run_full_pipeline(dataset_name)


    run_pipeline_for_all_datasets()

Re #7965
Implement a cron script that runs the dataset update according to the config