
Revision 34baf808

Added by Petr Hlaváč about 4 years ago

Re #7965
implement a cron script that runs dataset updates according to the config
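
The CroneUpdateScript.py added below is meant to be run periodically, so that check_last_update can count the days between runs. A minimal sketch of a crontab entry that would run it once a day, assuming a hypothetical checkout path /opt/crawler and python3 on the PATH (neither is specified by this commit):

    # hypothetical crontab entry: run the dataset update check daily at 03:00
    0 3 * * * cd /opt/crawler/modules/crawler && python3 CroneUpdateScript.py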

View differences:

modules/crawler/CrawledData/JIS/ignore.txt
  ignore.txt
+ OD_ZCU_JIS_10_2019.CSV
+ OD_ZCU_JIS_03_2020.CSV
+ OD_ZCU_JIS_02_2020.CSV
+ OD_ZCU_JIS_00_2019.CSV
+ OD_ZCU_JIS_08_2019.CSV
+ OD_ZCU_JIS_12_2019.CSV
+ OD_ZCU_JIS_09_2019.CSV
+ OD_ZCU_JIS_01_2020.CSV
+ OD_ZCU_JIS_06_2019.CSV
+ OD_ZCU_JIS_11_2019.CSV
+ OD_ZCU_JIS_07_2019.CSV
modules/crawler/CrawledData/KOLOBEZKY/ignore.txt
  ignore.txt
+ OD_ZCU_KOLOBEZKY_00_2019.CSV
+ OD_ZCU_KOLOBEZKY_06_2019.CSV
+ OD_ZCU_KOLOBEZKY_07_2019.CSV
+ OD_ZCU_KOLOBEZKY_08_2019.CSV
modules/crawler/CrawledData/WIFI/ignore.txt
  ignore.txt
+ OD_ZCU_WIFI_07_2019.CSV
+ OD_ZCU_WIFI_00_2019.CSV
+ OD_ZCU_WIFI_06_2019.CSV
+ OD_ZCU_WIFI_08_2019.CSV
modules/crawler/CrawlerLogs/JIS/ignore.txt
+ https://openstore.zcu.cz/OD_ZCU_00_2019/OD_ZCU_JIS_00_2019_CSV.zip
+ https://openstore.zcu.cz/OD_ZCU_01_2020/OD_ZCU_JIS_01_2020_CSV.zip
+ https://openstore.zcu.cz/OD_ZCU_02_2020/OD_ZCU_JIS_02_2020_CSV.zip
+ https://openstore.zcu.cz/OD_ZCU_03_2020/OD_ZCU_JIS_03_2020_CSV.zip
+ https://openstore.zcu.cz/OD_ZCU_06_2019/OD_ZCU_JIS_06_2019_CSV.zip
+ https://openstore.zcu.cz/OD_ZCU_07_2019/OD_ZCU_JIS_07_2019_CSV.zip
+ https://openstore.zcu.cz/OD_ZCU_08_2019/OD_ZCU_JIS_08_2019_CSV.zip
+ https://openstore.zcu.cz/OD_ZCU_09_2019/OD_ZCU_JIS_09_2019_CSV.zip
+ https://openstore.zcu.cz/OD_ZCU_10_2019/OD_ZCU_JIS_10_2019_CSV.zip
+ https://openstore.zcu.cz/OD_ZCU_11_2019/OD_ZCU_JIS_11_2019_CSV.zip
+ https://openstore.zcu.cz/OD_ZCU_12_2019/OD_ZCU_JIS_12_2019_CSV.zip
modules/crawler/CrawlerLogs/JIS/updated.txt
+ 0
modules/crawler/CrawlerLogs/KOLOBEZKY/ignore.txt
+ https://openstore.zcu.cz/OD_ZCU_00_2019/OD_ZCU_KOLOBEZKY_00_2019_CSV.zip
+ https://openstore.zcu.cz/OD_ZCU_06_2019/OD_ZCU_KOLOBEZKY_06_2019_CSV.zip
+ https://openstore.zcu.cz/OD_ZCU_07_2019/OD_ZCU_KOLOBEZKY_07_2019_CSV.zip
+ https://openstore.zcu.cz/OD_ZCU_08_2019/OD_ZCU_KOLOBEZKY_08_2019_CSV.zip
modules/crawler/CrawlerLogs/KOLOBEZKY/updated.txt
+ 0
modules/crawler/CrawlerLogs/WIFI/ignore.txt
+ https://openstore.zcu.cz/OD_ZCU_00_2019/OD_ZCU_WIFI_00_2019_CSV.zip
+ https://openstore.zcu.cz/OD_ZCU_06_2019/OD_ZCU_WIFI_06_2019_CSV.zip
+ https://openstore.zcu.cz/OD_ZCU_07_2019/OD_ZCU_WIFI_07_2019_CSV.zip
+ https://openstore.zcu.cz/OD_ZCU_08_2019/OD_ZCU_WIFI_08_2019_CSV.zip
modules/crawler/CrawlerLogs/WIFI/updated.txt
+ 0
modules/crawler/CroneUpdateScript.py
+ import Pipeline
+ import os
+
+ # Path to configuration files
+ CONFIG_FILES_PATH = "DatasetConfigs/"
+
+
+ def run_pipeline_for_all_datasets():
+     """
+     Runs the whole DataScript pipeline for every dataset that has an existing configuration file
+     """
+     files_in_dir = os.listdir(CONFIG_FILES_PATH)
+
+     for file in files_in_dir:
+         name = file.split('.')
+         Pipeline.run_full_pipeline_crone(name[0])
+
+
+ run_pipeline_for_all_datasets()
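
The script derives each dataset name from its config filename by splitting on dots and taking the first part. A small illustration of that behaviour, with JIS.yaml as an assumed example filename (the actual config extension is not visible in this diff); os.path.splitext would be an equivalent standard-library alternative:

    import os

    file = "JIS.yaml"        # assumed example config filename
    name = file.split('.')   # -> ['JIS', 'yaml']
    print(name[0])           # -> JIS

    # equivalent via the standard library:
    stem, ext = os.path.splitext(file)
    print(stem)              # -> JIS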
modules/crawler/ForceUpdateDataset.py
+ import Pipeline
+ import os
+
+ print("Enter the name of the dataset you want to update:\n")
+ Pipeline.run_full_pipeline(input())
modules/crawler/ForceUpdateDatasets.py
+ import Pipeline
+ import os
+
+ # Path to configuration files
+ CONFIG_FILES_PATH = "DatasetConfigs/"
+
+
+ def run_pipeline_for_all_datasets():
+     """
+     Runs the whole DataScript pipeline for every dataset that has an existing configuration file
+     """
+     files_in_dir = os.listdir(CONFIG_FILES_PATH)
+
+     for file in files_in_dir:
+         name = file.split('.')
+         Pipeline.run_full_pipeline(name[0])
+
+
+ run_pipeline_for_all_datasets()
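
Unlike CroneUpdateScript.py above, both force-update scripts call Pipeline.run_full_pipeline directly rather than run_full_pipeline_crone, so they bypass the updated.txt day counter and update the datasets unconditionally.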
modules/crawler/Pipeline.py
  CRAWLED_DATA_PATH = "CrawledData/"
  # Path to processed data
  PROCESSED_DATA_PATH = "ProcessedData/"
+ # Path to crawler logs
+ CRAWLER_LOGS_PATH = "CrawlerLogs/"
  # Path to dataset crawler implementations
  CRAWLER_LIB_PATH = "DatasetCrawler."
  # Path to dataset processor implementations
  PROCESSOR_LIB_PATH = "DatasetProcessing."


+ def check_last_update(config):
+     """
+     Loads an integer from updated.txt in CrawlerLogs/"dataset_name"
+     representing the number of days since the last update. If that number
+     has reached the update period given in the config, resets the counter
+     to zero (the dataset will be updated); otherwise increments the counter.
+
+     Arguments:
+         config: loaded configuration file of dataset
+
+     Returns:
+         True if the dataset is to be updated
+         False if only the days-since-last-update counter was incremented
+     """
+     dataset_name = config["dataset-name"]
+
+     with open(CRAWLER_LOGS_PATH + dataset_name + "/" + "updated.txt", "r+") as file:
+         last_update = int(file.read())
+         file.seek(0)
+
+         config_update_period = int(config["update-period"])
+
+         if config_update_period <= last_update:
+             print("Dataset " + dataset_name + " is being updated")
+             file.write("0")
+             file.truncate()
+             return True
+         else:
+             last_update_days = last_update + 1
+             print("Dataset " + dataset_name + " will be updated in " + str(config_update_period - last_update_days) + " days")
+             file.write(str(last_update_days))
+             file.truncate()
+             return False
+
+
  def crawl_data(config):
      """
        Imports dataset crawler in DatasetCrawler/"dataset_name"Crawler.py
......

      if validation_test:
          load_data_to_database(config)
+         print("Dataset " + dataset_name + " has been successfully updated\n")
+
+
+ def run_full_pipeline_crone(dataset_name):
+     """
+     Loads the config file and starts the full pipeline:
+     - crawl data
+     - process data
+     - load data to database
+
+     Args:
+         dataset_name: name of dataset that has existing configuration file
+     """
+     config = ConfigureFunctions.load_configuration(dataset_name)
+     update_test = check_last_update(config)
+     if update_test:
+         crawl_data(config)
+         process_data(config["dataset-name"])
+
+         validation_test = validate_process_data(config)

+         if validation_test:
+             load_data_to_database(config)
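
The updated.txt counter that check_last_update maintains can be exercised in isolation. A self-contained sketch of the same read-increment-or-reset cycle, assuming an update-period of 7 and a throwaway directory (all names here are illustrative, not part of the diff):

    import os
    import tempfile

    # simulate CrawlerLogs/JIS/updated.txt with the counter at 6 days
    root = tempfile.mkdtemp()
    os.mkdir(os.path.join(root, "JIS"))
    path = os.path.join(root, "JIS", "updated.txt")
    with open(path, "w") as file:
        file.write("6")

    update_period = 7  # would come from int(config["update-period"])

    with open(path, "r+") as file:
        last_update = int(file.read())
        file.seek(0)
        if update_period <= last_update:
            file.write("0")  # period reached: reset and update the dataset
        else:
            file.write(str(last_update + 1))  # one more day on the counter
        file.truncate()

    with open(path) as file:
        print(file.read())  # -> 7, so the update fires on the next daily run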
modules/crawler/PrepareNewDataset.py
          if text is not None:
              file.write(text + "\n")

+ def create_updated_file(path):
+     """
+     Creates updated file
+     Args:
+         path: path to directory for creating updated.txt
+     """
+     with open(path + "/updated.txt", "w") as file:
+         file.write(str(0) + "\n")
+

  def prepare_dataset_structure(dataset_name):
      """
......
      Args:
          dataset_name: Name of newly created dataset
      """
-     jump_folder = "../"

      # create folder for crawled data
      try:
......
          path = CRAWLER_LOGS_PATH + dataset_name
          os.mkdir(path)
          create_ignore_file(path, None)
+         create_updated_file(path)
      except OSError:
          print("Creation of the directory %s failed" % path)

modules/crawler/ProcessedData/JIS/ignore.txt
  ignore.txt
modules/crawler/ProcessedData/KOLOBEZKY/ignore.txt
  ignore.txt
+ OD_ZCU_KOLOBEZKY_08_2019.CSV
+ OD_ZCU_KOLOBEZKY_00_2019.CSV
+ OD_ZCU_KOLOBEZKY_07_2019.CSV
+ OD_ZCU_KOLOBEZKY_06_2019.CSV
modules/crawler/ProcessedData/WIFI/ignore.txt
  ignore.txt
modules/crawler/ResetDataset.py
  from Utilities import FolderProcessor

  # Path to crawled data
- CRAWLED_DATA_PATH = "../CrawledData/"
+ CRAWLED_DATA_PATH = "CrawledData/"
  # Path to processed data
- PROCESSED_DATA_PATH = "../ProcessedData/"
+ PROCESSED_DATA_PATH = "ProcessedData/"
  # Path to crawler logs
- CRAWLER_LOGS_PATH = "../CrawlerLogs/"
+ CRAWLER_LOGS_PATH = "CrawlerLogs/"
  # Path to dataset configuration files
- CONFIG_FILES_PATH = "../DatasetConfigs"
+ CONFIG_FILES_PATH = "DatasetConfigs"


  def create_ignore_file(path, text):
......
              file.write(text + "\n")


+ def create_updated_file(path):
+     """
+     Creates updated file
+     Args:
+         path: path to directory for creating updated.txt
+     """
+     with open(path + "/updated.txt", "w") as file:
+         file.write(str(0) + "\n")
+
+
  def reset_dataset(dataset_name):
      """
      Resets all saved data in dataset except config and implementation
......
      path = CRAWLER_LOGS_PATH + dataset_name + "/"
      FolderProcessor.clean_folder(path)
      create_ignore_file(path, None)
+     create_updated_file(path)


  def reset_all_datasets():
modules/crawler/main.py
+ import Pipeline
+ import os
+
+ # Path to configuration files
+ CONFIG_FILES_PATH = "DatasetConfigs/"
+
+
+ def run_pipeline_for_all_datasets():
+     """
+     Runs the whole DataScript pipeline for every dataset that has an existing configuration file
+     """
+     files_in_dir = os.listdir(CONFIG_FILES_PATH)
+
+     for file in files_in_dir:
+         name = file.split('.')
+         Pipeline.run_full_pipeline(name[0])
+
+
+ def run_pipeline_for_one_dataset(dataset_name):
+     """
+     Runs the whole DataScript pipeline for a single dataset
+
+     Args:
+         dataset_name: name of dataset that has existing configuration file
+     """
+     Pipeline.run_full_pipeline(dataset_name)
+
+
+ run_pipeline_for_all_datasets()
