Revize af7609b5
Přidáno uživatelem Tomáš Ballák před více než 3 lety
modules/crawler/Utilities/Crawler/basic_crawler_functions.py | ||
---|---|---|
3 | 3 |
from Utilities import folder_processor |
4 | 4 |
from Utilities.Database import database_record_logs |
5 | 5 |
from bs4 import BeautifulSoup |
6 |
from typing import List |
|
6 | 7 |
|
7 | 8 |
# Path to crawler logs |
8 | 9 |
CRAWLER_LOGS_PATH = "CrawlerLogs/" |
9 | 10 |
# Path to crawled data |
10 | 11 |
CRAWLED_DATA_PATH = "CrawledData/" |
12 |
LinksType = List[str] |
|
11 | 13 |
|
12 | 14 |
|
13 |
def get_all_links(url):
|
|
15 |
def get_all_links(url: str) -> LinksType:
|
|
14 | 16 |
""" |
15 | 17 |
Sends http request to url, downloads all data, |
16 | 18 |
extracts links |
... | ... | |
34 | 36 |
return links |
35 | 37 |
|
36 | 38 |
|
37 |
def filter_links(links, regex):
|
|
39 |
def filter_links(links: LinksType, regex: str) -> LinksType:
|
|
38 | 40 |
""" |
39 | 41 |
Filters list of links using regex |
40 | 42 |
|
... | ... | |
54 | 56 |
return filtered_links |
55 | 57 |
|
56 | 58 |
|
57 |
def create_absolute_links(links, archive):
|
|
59 |
def create_absolute_links(links: LinksType, archive: str) -> LinksType:
|
|
58 | 60 |
""" |
59 | 61 |
Appends archive path to every link in links |
60 | 62 |
Args: |
... | ... | |
72 | 74 |
return absolute_links |
73 | 75 |
|
74 | 76 |
|
75 |
def remove_downloaded_links(links, dataset_name):
|
|
77 |
def remove_downloaded_links(links: LinksType, dataset_name: str) -> LinksType:
|
|
76 | 78 |
""" |
77 | 79 |
Loads already downloaded links from CRAWLER_LOGS_PATH ignore.txt |
78 | 80 |
Args: |
... | ... | |
88 | 90 |
return final_links |
89 | 91 |
|
90 | 92 |
|
91 |
def download_file_from_url(url, dataset_name):
|
|
93 |
def download_file_from_url(url: str, dataset_name: str) -> None:
|
|
92 | 94 |
""" |
93 | 95 |
Downloads the file at the provided url and saves it to path |
94 | 96 |
Args: |
Také k dispozici: Unified diff
Re #8193 - refactoring crawler