from Utilities import folder_processor
from Utilities.Crawler import basic_crawler_functions

# Path to crawled data
CRAWLED_DATA_PATH = "CrawledData/"


def crawl(config):
    """
    Downloads new data for a dataset into path_for_files, keeping the
    project structure. Reads url, regex, and dataset_name from config
    and uses the helper functions implemented in
    Utilities/Crawler/basic_crawler_functions.py.

    Args:
        config: loaded configuration file of the dataset
    """
    dataset_name = config["dataset-name"]
    url = config["url"]
    regex = config["regex"]
    path_for_files = CRAWLED_DATA_PATH + dataset_name + "/"

    first_level_links = basic_crawler_functions.get_all_links(url)

    # Keep only links pointing into the OD_ZCU dataset directories
    filtered_first_level_links = basic_crawler_functions.filter_links(
        first_level_links, "^OD_ZCU")

    # Slice offsets into the link names, assuming directory links shaped
    # like "OD_ZCU_<name>_MM_YYYY/": the year is the 4 characters before
    # the trailing "/", the month the 2 characters before the year's "_".
    OFFSET_YEAR_START = -5
    OFFSET_YEAR_END = -1
    MONTH_SIZE = 2

    # Separate links by year
    links_by_year = {}
    for item in filtered_first_level_links:
        year = item[OFFSET_YEAR_START:OFFSET_YEAR_END]
        links_by_year.setdefault(year, []).append(item)

    # For each year, keep only the link with the latest month
    links = []
    for value in links_by_year.values():
        links.append(
            max(value,
                key=lambda x: int(x[OFFSET_YEAR_START - MONTH_SIZE - 1:
                                    OFFSET_YEAR_START - 1])))
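    # Worked example of the slice arithmetic above (assuming a link such
    # as "OD_ZCU_PARKING_08_2019/"): OFFSET_YEAR_START - MONTH_SIZE - 1
    # is -8 and OFFSET_YEAR_START - 1 is -6, so the lambda reads
    # x[-8:-6] == "08", the month, and max() picks the newest one.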

    absolute_first_level_links = basic_crawler_functions.create_absolute_links(
        links, url)

    files = []

    for link in absolute_first_level_links:
        second_level_links = basic_crawler_functions.get_all_links(link)
        filtered_second_level_links = basic_crawler_functions.filter_links(
            second_level_links, regex)
        absolute_second_level_links = basic_crawler_functions.create_absolute_links(
            filtered_second_level_links, link)

        files.extend(absolute_second_level_links)

    # Drop links that have already been downloaded earlier
    files = basic_crawler_functions.remove_downloaded_links(
        files, dataset_name)

    for file in files:
        basic_crawler_functions.download_file_from_url(file, dataset_name)

    folder_processor.unzip_all_csv_zip_files_in_folder(path_for_files)
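

# A minimal usage sketch, assuming the config dict is normally produced by
# the project's configuration loader; all values below are placeholders:
if __name__ == "__main__":
    example_config = {
        "dataset-name": "EXAMPLE_DATASET",       # hypothetical dataset name
        "url": "https://example.com/opendata/",  # placeholder portal URL
        "regex": r"\.zip$",                      # placeholder file pattern
    }
    crawl(example_config)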