import requests
import re
from Utilities import folder_processor
from Utilities.Database import database_record_logs
from bs4 import BeautifulSoup
from typing import List

# Path to crawler logs
CRAWLER_LOGS_PATH = "CrawlerLogs/"
# Path to crawled data
CRAWLED_DATA_PATH = "CrawledData/"
LinksType = List[str]


def get_all_links(url: str) -> LinksType:
    """
    Sends an HTTP request to the given url, downloads the page
    and extracts all links from it

    Args:
        url: url of the website we want to search

    Returns:
        list of all links
    """
    # create response object
    r = requests.get(url)

    # create beautiful-soup object
    soup = BeautifulSoup(r.content, 'html5lib')
    links = []

    for link in soup.find_all('a'):
        href = link.get('href')
        # skip anchor tags that have no href attribute
        if href is not None:
            links.append(href)

    return links


def filter_links(links: LinksType, regex: str) -> LinksType:
    """
    Filters a list of links using a regex

    Args:
        links: list of links
        regex: regex used for filtering

    Returns:
        filtered list of links
    """
    filtered_links = []

    for link in links:
        if re.search(regex, link):
            filtered_links.append(link)

    return filtered_links


def create_absolute_links(links: LinksType, archive: str) -> LinksType:
    """
    Prepends the archive url to every link in links

    Args:
        links: list of relative links
        archive: archive url

    Returns:
        list of absolute links
    """
    absolute_links = []

    for link in links:
        absolute_links.append(archive + link)

    return absolute_links


def remove_downloaded_links(links: LinksType, dataset_name: str) -> LinksType:
    """
    Loads already downloaded links from the database ignore set
    and removes them from links

    Args:
        links: list of links
        dataset_name: name of dataset that has an existing configuration file

    Returns:
        list of links without the already downloaded ones
    """
    downloaded_links = database_record_logs.load_ignore_set_links(dataset_name)
    final_links = set(links) - downloaded_links

    # convert back to a list to match the declared return type
    return list(final_links)


def download_file_from_url(url: str, dataset_name: str) -> None:
    """
    Downloads the file at the provided url and saves it
    into the dataset's folder under CRAWLED_DATA_PATH

    Args:
        url: url of the file we want to download
        dataset_name: name of dataset that has an existing configuration file
    """
    r = requests.get(url, stream=True)

    # split the url and take the last part, which contains the filename
    url_parts = url.split("/")
    file_name = url_parts[-1]

    data_path = CRAWLED_DATA_PATH + dataset_name + '/'

    # download the file chunk by chunk so large files fit in memory
    with open(data_path + file_name, "wb") as file:
        for chunk in r.iter_content(chunk_size=1024):
            # write one chunk at a time to the file
            if chunk:
                file.write(chunk)

    # after a successful download update the set of already downloaded links
    database_record_logs.update_ignore_set_links(dataset_name, url)
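

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only): chains the helpers above in the
# order a crawler would typically use them. The archive url, dataset name and
# regex below are hypothetical placeholders, and the sketch assumes the
# dataset's folder under CRAWLED_DATA_PATH and its database records already
# exist.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    ARCHIVE_URL = "https://example.com/archive/"  # hypothetical archive url
    DATASET_NAME = "EXAMPLE_DATASET"  # hypothetical dataset name

    # collect every link on the archive page and keep only csv files
    links = get_all_links(ARCHIVE_URL)
    links = filter_links(links, r"\.csv$")
    links = create_absolute_links(links, ARCHIVE_URL)

    # skip files that were already downloaded in a previous run
    links = remove_downloaded_links(links, DATASET_NAME)

    for link in links:
        download_file_from_url(link, DATASET_NAME)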