Projekt

Obecné

Profil

« Předchozí | Další » 

Revize af7609b5

Přidáno uživatelem Tomáš Ballák před více než 3 roky

Re #8193 - refactoring crawler

Zobrazit rozdíly:

modules/crawler/Utilities/Crawler/basic_crawler_functions.py
3 3
from Utilities import folder_processor
4 4
from Utilities.Database import database_record_logs
5 5
from bs4 import BeautifulSoup
6
from typing import List
6 7

  
7 8
# Path to crawler logs
8 9
CRAWLER_LOGS_PATH = "CrawlerLogs/"
9 10
# Path to crawled data
10 11
CRAWLED_DATA_PATH = "CrawledData/"
12
LinksType = List[str]
11 13

  
12 14

  
13
def get_all_links(url):
15
def get_all_links(url: str) -> LinksType:
14 16
    """
15 17
    Sends http request to url, downloads all data,
16 18
    extract links
......
34 36
    return links
35 37

  
36 38

  
37
def filter_links(links, regex):
39
def filter_links(links: LinksType, regex: str) -> LinksType:
38 40
    """
39 41
    Filters list of links using regex
40 42

  
......
54 56
    return filtered_links
55 57

  
56 58

  
57
def create_absolute_links(links, archive):
59
def create_absolute_links(links: LinksType, archive: str) -> LinksType:
58 60
    """
59 61
        Appends archive path to every link in links
60 62
    Args:
......
72 74
    return absolute_links
73 75

  
74 76

  
75
def remove_downloaded_links(links, dataset_name):
77
def remove_downloaded_links(links: LinksType, dataset_name: str) -> LinksType:
76 78
    """
77 79
    Loads already downloaded links from CRAWLER_LOGS_PATH ignore.txt
78 80
    Args:
......
88 90
    return final_links
89 91

  
90 92

  
91
def download_file_from_url(url, dataset_name):
93
def download_file_from_url(url: str, dataset_name: str) -> None:
92 94
    """
93 95
    Downloads file on provided url and saves it to path
94 96
    Args:

Také k dispozici: Unified diff