Revision 1752e732

Added by Martin Šebela almost 3 years ago

Fixed OBSAZENIMISTNOSTI crawler


modules/crawler/DatasetCrawler/OBSAZENIMISTNOSTI_crawler.py
from Utilities import folder_processor
from Utilities.Crawler import basic_crawler_functions
from shared_types import ConfigType

# Path to crawled data
CRAWLED_DATA_PATH = "CrawledData/"


def crawl(config: ConfigType):
    """
    Implement crawl method that downloads new data to path_for_files
    For keeping the project structure
    url , regex, and dataset_name from config
    You can use already implemented functions from Utilities/Crawler/basic_crawler_functions.py

    Args:
        config: loaded configuration file of dataset
    """
    dataset_name = config["dataset-name"]
    url = config['url']
    regex = config['regex']
    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'

    first_level_links = basic_crawler_functions.get_all_links(url)

    filtered_first_level_links = basic_crawler_functions.filter_links(
        first_level_links, "^OD_ZCU")

    OFFSET_YEAR_START = -5
    OFFSET_YEAR_END = -1
    MONTH_SIZE = 2

    # Separate links by year
    links_by_year = {}
    for item in filtered_first_level_links:
        if item[OFFSET_YEAR_START:OFFSET_YEAR_END] not in links_by_year:
            links_by_year[item[OFFSET_YEAR_START:OFFSET_YEAR_END]] = []
        else:
            links_by_year[item[OFFSET_YEAR_START:OFFSET_YEAR_END]].append(item)

    # Latest links of years to array
    links = []
    for _key, value in links_by_year.items():
        if not value:
            continue

        links.append(
            max(value,
                key=lambda x: int(x[OFFSET_YEAR_START - MONTH_SIZE - 1:
                                    OFFSET_YEAR_START - 1])))

    absolute_first_level_links = basic_crawler_functions.create_absolute_links(
        links, url)

    files = []

    for link in absolute_first_level_links:
        second_level_links = basic_crawler_functions.get_all_links(link)
        filtered_second_level_links = basic_crawler_functions.filter_links(
            second_level_links, regex)
        absolute_second_level_links = basic_crawler_functions.create_absolute_links(
            filtered_second_level_links, link)

        for file_link in absolute_second_level_links:
            files.append(file_link)

    files = basic_crawler_functions.remove_downloaded_links(
        files, dataset_name)

    for file in files:
        basic_crawler_functions.download_file_from_url(file, dataset_name)

    folder_processor.unzip_all_csv_zip_files_in_folder(path_for_files)
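
A note on the per-year grouping in the listing above: because append sits in the else branch, the first link encountered for each year only creates an empty list and is never appended, so the latest-link-of-the-year selection can miss it. A minimal corrected sketch of just that block, reusing the same names; the example link format ending in "_MM_YYYY/" is an assumption inferred from the slice offsets, not taken from the data source:

# Sketch only, not the committed code: setdefault() appends the first link
# seen for a year as well, instead of dropping it in the else branch.
# Assumed link format: names ending in "_MM_YYYY/", e.g.
# "OD_ZCU_OBSAZENIMISTNOSTI_06_2020/", so item[-5:-1] is the year and
# item[-8:-6] is the month used by the max() key above.
links_by_year = {}
for item in filtered_first_level_links:
    year = item[OFFSET_YEAR_START:OFFSET_YEAR_END]
    links_by_year.setdefault(year, []).append(item)

The second variant of the file, shown below, drops this per-year block entirely and crawls every filtered first-level link.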
from Utilities import folder_processor
from Utilities.Crawler import basic_crawler_functions
from shared_types import ConfigType

# Path to crawled data
CRAWLED_DATA_PATH = "CrawledData/"


def crawl(config: ConfigType):
    """
    Implement crawl method that downloads new data to path_for_files
    For keeping the project structure
    url , regex, and dataset_name from config
    You can use already implemented functions from Utilities/Crawler/basic_crawler_functions.py

    Args:
        config: loaded configuration file of dataset
    """
    dataset_name = config["dataset-name"]
    url = config['url']
    regex = config['regex']
    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'

    first_level_links = basic_crawler_functions.get_all_links(url)

    filtered_first_level_links = basic_crawler_functions.filter_links(
        first_level_links, "^OD_ZCU")
    absolute_first_level_links = basic_crawler_functions.create_absolute_links(
        filtered_first_level_links, url)

    files = []

    for link in absolute_first_level_links:
        second_level_links = basic_crawler_functions.get_all_links(link)
        filtered_second_level_links = basic_crawler_functions.filter_links(
            second_level_links, regex)
        absolute_second_level_links = basic_crawler_functions.create_absolute_links(
            filtered_second_level_links, link)

        for file_link in absolute_second_level_links:
            files.append(file_link)

    files = basic_crawler_functions.remove_downloaded_links(
        files, dataset_name)

    for file in files:
        basic_crawler_functions.download_file_from_url(file, dataset_name)

    folder_processor.unzip_all_csv_zip_files_in_folder(path_for_files)
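
For orientation, a hedged sketch of how this crawler might be invoked. The key names ("dataset-name", "url", "regex") are the ones read in crawl() above; the import path, the URL and the regex value are illustrative assumptions, not taken from the project's configuration files.

# Illustrative only: the key names come from crawl() above, while the values
# and the import path are placeholders, not the project's real configuration.
from DatasetCrawler import OBSAZENIMISTNOSTI_crawler

config = {
    "dataset-name": "OBSAZENIMISTNOSTI",
    "url": "https://example.org/opendata/",   # placeholder source URL
    "regex": "OBSAZENIMISTNOSTI.*\\.zip",     # placeholder file pattern
}

# Downloads any archives not yet recorded as downloaded into
# CrawledData/OBSAZENIMISTNOSTI/ and unzips the CSV zip files there.
OBSAZENIMISTNOSTI_crawler.crawl(config)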
