Revision 1752e732
Added by Martin Šebela over 3 years ago
modules/crawler/DatasetCrawler/OBSAZENIMISTNOSTI_crawler.py
Previous contents of the file (the side removed by this revision):

from Utilities import folder_processor
from Utilities.Crawler import basic_crawler_functions
from shared_types import ConfigType

# Path to crawled data
CRAWLED_DATA_PATH = "CrawledData/"


def crawl(config: ConfigType):
    """
    Crawl method that downloads new data to path_for_files.

    To keep the project structure, url, regex and dataset_name are taken
    from config. Already implemented functions from
    Utilities/Crawler/basic_crawler_functions.py can be used.

    Args:
        config: loaded configuration file of the dataset
    """
    dataset_name = config["dataset-name"]
    url = config['url']
    regex = config['regex']
    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'

    first_level_links = basic_crawler_functions.get_all_links(url)

    filtered_first_level_links = basic_crawler_functions.filter_links(
        first_level_links, "^OD_ZCU")

    OFFSET_YEAR_START = -5
    OFFSET_YEAR_END = -1
    MONTH_SIZE = 2

    # Separate links by year
    # NOTE: the first link seen for a year only creates its empty bucket and is
    # never appended, so a year with a single link is skipped entirely below.
    links_by_year = {}
    for item in filtered_first_level_links:
        if item[OFFSET_YEAR_START:OFFSET_YEAR_END] not in links_by_year:
            links_by_year[item[OFFSET_YEAR_START:OFFSET_YEAR_END]] = []
        else:
            links_by_year[item[OFFSET_YEAR_START:OFFSET_YEAR_END]].append(item)

    # Keep only the latest (highest month) link of each year
    links = []
    for _key, value in links_by_year.items():
        if not value:
            continue

        links.append(
            max(value,
                key=lambda x: int(x[OFFSET_YEAR_START - MONTH_SIZE - 1:
                                    OFFSET_YEAR_START - 1])))

    absolute_first_level_links = basic_crawler_functions.create_absolute_links(
        links, url)

    files = []

    for link in absolute_first_level_links:
        second_level_links = basic_crawler_functions.get_all_links(link)
        filtered_second_level_links = basic_crawler_functions.filter_links(
            second_level_links, regex)
        absolute_second_level_links = basic_crawler_functions.create_absolute_links(
            filtered_second_level_links, link)

        for file_link in absolute_second_level_links:
            files.append(file_link)

    files = basic_crawler_functions.remove_downloaded_links(
        files, dataset_name)

    for file in files:
        basic_crawler_functions.download_file_from_url(file, dataset_name)

    folder_processor.unzip_all_csv_zip_files_in_folder(path_for_files)
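The slicing offsets in the removed block assume that each first-level link ends with a month_year suffix such as "OD_ZCU_03_2021/" (the exact naming is an assumption, it is not shown on this page): item[-5:-1] yields the year "2021", item[-8:-6] the month "03", and max() then keeps the latest month of each year. A minimal standalone sketch of that selection, which also shows that the first link seen for a year only initializes its bucket:

# Standalone sketch of the removed latest-per-year selection.
# The link names below are hypothetical; only the trailing "_MM_YYYY/" matters.
OFFSET_YEAR_START = -5
OFFSET_YEAR_END = -1
MONTH_SIZE = 2

links = ["OD_ZCU_01_2021/", "OD_ZCU_02_2021/", "OD_ZCU_03_2021/", "OD_ZCU_12_2020/"]

links_by_year = {}
for item in links:
    year = item[OFFSET_YEAR_START:OFFSET_YEAR_END]   # e.g. "2021"
    if year not in links_by_year:
        links_by_year[year] = []                     # first link of each year is not appended
    else:
        links_by_year[year].append(item)

latest = [
    max(value,
        key=lambda x: int(x[OFFSET_YEAR_START - MONTH_SIZE - 1:OFFSET_YEAR_START - 1]))
    for value in links_by_year.values() if value
]
print(latest)  # ['OD_ZCU_03_2021/'] -- 2020 is dropped because its only link never reached a bucket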
Contents of the file after this revision:

from Utilities import folder_processor
from Utilities.Crawler import basic_crawler_functions
from shared_types import ConfigType

# Path to crawled data
CRAWLED_DATA_PATH = "CrawledData/"


def crawl(config: ConfigType):
    """
    Crawl method that downloads new data to path_for_files.

    To keep the project structure, url, regex and dataset_name are taken
    from config. Already implemented functions from
    Utilities/Crawler/basic_crawler_functions.py can be used.

    Args:
        config: loaded configuration file of the dataset
    """
    dataset_name = config["dataset-name"]
    url = config['url']
    regex = config['regex']
    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'

    first_level_links = basic_crawler_functions.get_all_links(url)

    filtered_first_level_links = basic_crawler_functions.filter_links(
        first_level_links, "^OD_ZCU")
    absolute_first_level_links = basic_crawler_functions.create_absolute_links(
        filtered_first_level_links, url)

    files = []

    # Collect file links matching the dataset regex from every first-level directory
    for link in absolute_first_level_links:
        second_level_links = basic_crawler_functions.get_all_links(link)
        filtered_second_level_links = basic_crawler_functions.filter_links(
            second_level_links, regex)
        absolute_second_level_links = basic_crawler_functions.create_absolute_links(
            filtered_second_level_links, link)

        for file_link in absolute_second_level_links:
            files.append(file_link)

    # Skip already downloaded files, download the rest and unzip the CSV archives
    files = basic_crawler_functions.remove_downloaded_links(
        files, dataset_name)

    for file in files:
        basic_crawler_functions.download_file_from_url(file, dataset_name)

    folder_processor.unzip_all_csv_zip_files_in_folder(path_for_files)
Fixed OBSAZENIMISTNOSTI crawler
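For reference, a minimal sketch of how crawl() might be invoked. The config keys match the ones read in the code above, but the concrete values, the import path and the way the project normally loads its dataset configuration files are assumptions:

# Hypothetical call -- in the project the config would normally come from the
# dataset's configuration file rather than be written inline like this.
from DatasetCrawler import OBSAZENIMISTNOSTI_crawler

config = {
    "dataset-name": "OBSAZENIMISTNOSTI",
    "url": "https://example.zcu.cz/opendata/",   # assumed base URL of the open-data store
    "regex": "OBSAZENI.*zip",                    # assumed file-name pattern for this dataset
}

OBSAZENIMISTNOSTI_crawler.crawl(config)
# New files end up in CrawledData/OBSAZENIMISTNOSTI/ and the CSV zip archives are unpacked there.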