Projekt

Obecné

Profil

Stáhnout (1.63 KB) Statistiky
| Větev: | Revize:
1 c8f3051b petrh
from Utilities import FolderProcessor
2 34cf65cd petrh
from Utilities.Crawler import BasicCrawlerFunctions
3 c8f3051b petrh
4 04a2b5a4 petrh
# Root directory for crawled data; crawl() appends the dataset name and a
# trailing '/' to it to form the folder that is unzipped after downloading.
5
CRAWLED_DATA_PATH = "CrawledData/"
6 c8f3051b petrh
7
8 04a2b5a4 petrh
def crawl(config):
    """
    Crawl the dataset's start page two levels deep and fetch the linked files.

    The start page is read with BasicCrawlerFunctions.get_all_links, its links
    are filtered with the fixed pattern "^OD_ZCU" and made absolute. Each of
    those pages is then read the same way, this time filtered with the regex
    from the configuration. Every resulting link is passed through
    BasicCrawlerFunctions.remove_downloaded_links (presumably dropping files
    fetched on an earlier run - confirm against the helper) and collected.
    Only after all pages were visited are the collected links handed to
    BasicCrawlerFunctions.download_file_from_url, and finally
    FolderProcessor.unzip_all_csv_zip_files_in_folder is called on the
    dataset folder CRAWLED_DATA_PATH + dataset name + '/'.

    Args:
        config: loaded configuration file of dataset; the keys
            "dataset-name", "url" and "regex" are read from it
    """
    name = config["dataset-name"]
    start_url = config['url']
    file_pattern = config['regex']
    target_dir = CRAWLED_DATA_PATH + name + '/'

    # First level: pages reachable from the start URL that match the fixed prefix pattern.
    top_level_pages = BasicCrawlerFunctions.create_absolute_links(
        BasicCrawlerFunctions.filter_links(
            BasicCrawlerFunctions.get_all_links(start_url),
            "^OD_ZCU",
        ),
        start_url,
    )

    # Second level: gather the file links of every page before downloading anything.
    pending = []
    for page in top_level_pages:
        candidates = BasicCrawlerFunctions.get_all_links(page)
        matching = BasicCrawlerFunctions.filter_links(candidates, file_pattern)
        resolved = BasicCrawlerFunctions.create_absolute_links(matching, page)
        pending.extend(BasicCrawlerFunctions.remove_downloaded_links(resolved, name))

    for file_url in pending:
        BasicCrawlerFunctions.download_file_from_url(file_url, name)

    FolderProcessor.unzip_all_csv_zip_files_in_folder(target_dir)