Revize af7609b5
Přidáno uživatelem Tomáš Ballák před více než 3 roky(ů)
modules/crawler/DatasetCrawler/JIS_crawler.py
1 | 1 |
from Utilities import folder_processor |
2 | 2 |
from Utilities.Crawler import basic_crawler_functions |
3 |
|
|
3 |
from shared_types import ConfigType |
|
4 | 4 |
# Path to crawled data |
5 | 5 |
CRAWLED_DATA_PATH = "CrawledData/" |
6 | 6 |
|
7 | 7 |
|
8 |
def crawl(config): |
|
8 |
def crawl(config: ConfigType):
|
|
9 | 9 |
""" |
10 | 10 |
Implement crawl method that downloads new data to path_for_files |
11 | 11 |
For keeping the project structure |
... | ... | |
21 | 21 |
path_for_files = CRAWLED_DATA_PATH + dataset_name + '/' |
22 | 22 |
|
23 | 23 |
first_level_links = basic_crawler_functions.get_all_links(url) |
24 |
filtered_first_level_links = basic_crawler_functions.filter_links(first_level_links, "^OD_ZCU") |
|
25 |
absolute_first_level_links = basic_crawler_functions.create_absolute_links(filtered_first_level_links, url) |
|
24 |
filtered_first_level_links = basic_crawler_functions.filter_links( |
|
25 |
first_level_links, "^OD_ZCU") |
|
26 |
absolute_first_level_links = basic_crawler_functions.create_absolute_links( |
|
27 |
filtered_first_level_links, url) |
|
26 | 28 |
|
27 | 29 |
files = [] |
28 | 30 |
|
29 | 31 |
for link in absolute_first_level_links: |
30 | 32 |
second_level_links = basic_crawler_functions.get_all_links(link) |
31 |
filtered_second_level_links = basic_crawler_functions.filter_links(second_level_links, regex) |
|
32 |
absolute_second_level_links = basic_crawler_functions.create_absolute_links(filtered_second_level_links, link) |
|
33 |
filtered_second_level_links = basic_crawler_functions.filter_links( |
|
34 |
second_level_links, regex) |
|
35 |
absolute_second_level_links = basic_crawler_functions.create_absolute_links( |
|
36 |
filtered_second_level_links, link) |
|
33 | 37 |
|
34 | 38 |
for file_link in absolute_second_level_links: |
35 | 39 |
files.append(file_link) |
36 | 40 |
|
37 |
files = basic_crawler_functions.remove_downloaded_links(files, dataset_name) |
|
41 |
files = basic_crawler_functions.remove_downloaded_links( |
|
42 |
files, dataset_name) |
|
38 | 43 |
|
39 | 44 |
for file in files: |
40 | 45 |
basic_crawler_functions.download_file_from_url(file, dataset_name) |
Také k dispozici: Unified diff
Re #8193 - refactoring crawler