Projekt

Obecné

Profil

Stáhnout (1.16 KB) Statistiky
| Větev: | Revize:
1
from Utilities import FolderProcessor
2
from Utilities.Crawler import BasicCrawler
3

    
4

    
5
def crawl(config) -> None:
    """Crawl a two-level link structure and download the files it points to.

    The function collects links from the start page, keeps those matching
    a hard-coded first-level pattern, then visits each of them and collects
    the links matching the configured regex. Every resulting file link is
    downloaded into ``CrawledData/<dataset-name>/`` and the folder is then
    handed to ``FolderProcessor.unzip_all_csv_zip_files_in_folder``.

    Args:
        config: Mapping that must provide the keys ``"dataset-name"``,
            ``"url"`` and ``"regex"``.

    Raises:
        KeyError: If one of the required keys is missing from ``config``.
    """
    dataset_name = config["dataset-name"]
    url = config['url']
    regex = config['regex']

    # Both the download step and the unzip step work on the same folder,
    # so build its path once.
    output_folder = "CrawledData/" + dataset_name + "/"

    # First level: links found on the start page, narrowed by a fixed pattern.
    # NOTE(review): the pattern is hard-coded rather than read from config;
    # presumably it selects the per-period sub-pages -- confirm.
    first_level_links = BasicCrawler.get_all_links(url)
    filtered_first_level_links = BasicCrawler.filter_links(first_level_links, "^OD_ZCU")
    absolute_first_level_links = BasicCrawler.create_absolute_links(filtered_first_level_links, url)

    # Second level: gather every candidate file link before downloading
    # anything, so link discovery is finished before the first download.
    files = []
    for link in absolute_first_level_links:
        second_level_links = BasicCrawler.get_all_links(link)
        filtered_second_level_links = BasicCrawler.filter_links(second_level_links, regex)
        absolute_second_level_links = BasicCrawler.create_absolute_links(filtered_second_level_links, link)
        # presumably drops links already fetched for this dataset -- verify
        # against BasicCrawler.remove_downloaded_links
        final_links = BasicCrawler.remove_downloaded_links(absolute_second_level_links, dataset_name)
        files.extend(final_links)

    for file_url in files:
        BasicCrawler.download_file_from_url(file_url, output_folder, dataset_name)

    FolderProcessor.unzip_all_csv_zip_files_in_folder(output_folder)
(1-1/2)