Projekt

Obecné

Profil

Stáhnout (1.16 KB) Statistiky
| Větev: | Revize:
1 ead783ce petrh
from Utilities import FolderProcessor
2
from Utilities.Crawler import BasicCrawler
3
4
5
def crawl(config):
    """Collect file links from a two-level page hierarchy and download them.

    The function reads three keys from ``config``:

    * ``"dataset-name"`` - used to build the target folder
      ``CrawledData/<dataset-name>/`` and passed on to the crawler helpers.
    * ``"url"`` - the page whose links are followed first.
    * ``"regex"`` - the pattern applied to links found on the second level.

    Args:
        config: Mapping supporting ``config[key]`` lookups for the keys above.

    Raises:
        KeyError: If ``config`` is a dict and one of the keys is missing.

    NOTE(review): the exact behaviour of the ``BasicCrawler`` and
    ``FolderProcessor`` helpers is not visible here; the comments below
    describe them by name only - confirm against ``Utilities``.
    """
    dataset_name = config["dataset-name"]
    url = config['url']
    regex = config['regex']

    # Built once; both the download step and the unzip step use this folder.
    output_dir = "CrawledData/" + dataset_name + "/"

    # First level: only links matching the hard-coded "^OD_ZCU" pattern are
    # followed further.
    first_level_links = BasicCrawler.get_all_links(url)
    filtered_first_level_links = BasicCrawler.filter_links(first_level_links, "^OD_ZCU")
    absolute_first_level_links = BasicCrawler.create_absolute_links(filtered_first_level_links, url)

    files = []

    # Second level: on every followed page keep links matching the configured
    # regex, then drop those the helper reports as already downloaded.
    for link in absolute_first_level_links:
        second_level_links = BasicCrawler.get_all_links(link)
        filtered_second_level_links = BasicCrawler.filter_links(second_level_links, regex)
        absolute_second_level_links = BasicCrawler.create_absolute_links(filtered_second_level_links, link)
        final_links = BasicCrawler.remove_downloaded_links(absolute_second_level_links, dataset_name)

        files.extend(final_links)

    # Downloads start only after all pages have been scanned.
    for file in files:
        BasicCrawler.download_file_from_url(file, output_dir, dataset_name)

    FolderProcessor.unzip_all_csv_zip_files_in_folder(output_dir)