Projekt

Obecné

Profil

Stáhnout (1.74 KB) Statistiky
| Větev: | Revize:
1 d6ca840d petrh
from Utilities import folder_processor
2
from Utilities.Crawler import basic_crawler_functions
3 af7609b5 Tomáš Ballák
from shared_types import ConfigType
4 d6ca840d petrh
5
# Path to crawled data
6
# The trailing slash is significant: crawl() appends the dataset name
# directly to this string to form the per-dataset folder path.
CRAWLED_DATA_PATH = "CrawledData/"
7
8
9 af7609b5 Tomáš Ballák
def crawl(config: ConfigType):
    """
    Walk a two-level link hierarchy and download the files found on it.

    The start page given by config['url'] is scanned for links, which are
    narrowed with the fixed pattern "^OD_ZCU". Each of those pages is then
    scanned for links matching config['regex']. The collected file links
    are passed through remove_downloaded_links, every remaining one is
    downloaded, and finally the dataset folder under CRAWLED_DATA_PATH is
    handed to the unzip helper.

    All link handling is delegated to
    Utilities/Crawler/basic_crawler_functions.py.

    Args:
        config: loaded configuration file of dataset; the keys
            "dataset-name", "url" and "regex" are read
    """
    name = config["dataset-name"]
    start_url = config['url']
    file_pattern = config['regex']
    target_folder = CRAWLED_DATA_PATH + name + '/'

    # Short local alias for the helper module used throughout.
    crawler = basic_crawler_functions

    # First level: links on the start page that match the fixed prefix pattern.
    top_level_pages = crawler.create_absolute_links(
        crawler.filter_links(crawler.get_all_links(start_url), "^OD_ZCU"),
        start_url)

    # Second level: on each of those pages, links matching the dataset regex.
    candidates = []
    for page in top_level_pages:
        matching = crawler.filter_links(
            crawler.get_all_links(page), file_pattern)
        candidates.extend(crawler.create_absolute_links(matching, page))

    # NOTE(review): presumably drops links that were already downloaded for
    # this dataset -- helper is not visible here, confirm.
    pending = crawler.remove_downloaded_links(candidates, name)

    for file_url in pending:
        crawler.download_file_from_url(file_url, name)

    folder_processor.unzip_all_csv_zip_files_in_folder(target_folder)