Revize 34cf65cd
Přidáno uživatelem Petr Hlaváč před asi 4 roky
python-module/DatasetCrawler/KOLOBEZKYCrawler.py
1 | 1 |
from Utilities import FolderProcessor |
2 |
from Utilities.Crawler import BasicCrawler |
|
2 |
from Utilities.Crawler import BasicCrawlerFunctions
|
|
3 | 3 |
|
4 | 4 |
# Path to crawled data |
5 | 5 |
CRAWLED_DATA_PATH = "CrawledData/" |
... | ... | |
20 | 20 |
regex = config['regex'] |
21 | 21 |
path_for_files = CRAWLED_DATA_PATH + dataset_name + '/' |
22 | 22 |
|
23 |
first_level_links = BasicCrawler.get_all_links(url) |
|
24 |
filtered_first_level_links = BasicCrawler.filter_links(first_level_links, "^OD_ZCU") |
|
25 |
absolute_first_level_links = BasicCrawler.create_absolute_links(filtered_first_level_links, url) |
|
23 |
first_level_links = BasicCrawlerFunctions.get_all_links(url)
|
|
24 |
filtered_first_level_links = BasicCrawlerFunctions.filter_links(first_level_links, "^OD_ZCU")
|
|
25 |
absolute_first_level_links = BasicCrawlerFunctions.create_absolute_links(filtered_first_level_links, url)
|
|
26 | 26 |
|
27 | 27 |
files = [] |
28 | 28 |
|
29 | 29 |
for link in absolute_first_level_links: |
30 |
second_level_links = BasicCrawler.get_all_links(link) |
|
31 |
filtered_second_level_links = BasicCrawler.filter_links(second_level_links, regex) |
|
32 |
absolute_second_level_links = BasicCrawler.create_absolute_links(filtered_second_level_links, link) |
|
33 |
final_links = BasicCrawler.remove_downloaded_links(absolute_second_level_links, dataset_name) |
|
30 |
second_level_links = BasicCrawlerFunctions.get_all_links(link)
|
|
31 |
filtered_second_level_links = BasicCrawlerFunctions.filter_links(second_level_links, regex)
|
|
32 |
absolute_second_level_links = BasicCrawlerFunctions.create_absolute_links(filtered_second_level_links, link)
|
|
33 |
final_links = BasicCrawlerFunctions.remove_downloaded_links(absolute_second_level_links, dataset_name)
|
|
34 | 34 |
|
35 | 35 |
for file_link in final_links: |
36 | 36 |
files.append(file_link) |
37 | 37 |
|
38 | 38 |
for file in files: |
39 |
BasicCrawler.download_file_from_url(file, "CrawledData/" + dataset_name + "/", dataset_name)
|
|
39 |
BasicCrawlerFunctions.download_file_from_url(file, CRAWLED_DATA_PATH + dataset_name + "/", dataset_name)
|
|
40 | 40 |
|
41 |
FolderProcessor.unzip_all_csv_zip_files_in_folder("CrawledData/" + dataset_name + "/") |
|
41 |
FolderProcessor.unzip_all_csv_zip_files_in_folder(CRAWLED_DATA_PATH + dataset_name + "/") |
Také k dispozici: Unified diff
Fixed issues found in code documentation