import requests
import re
from Utilities import folder_processor
from Utilities.Database import database_record_logs
from bs4 import BeautifulSoup
from typing import List

# Path to crawler logs
CRAWLER_LOGS_PATH = "CrawlerLogs/"
# Path to crawled data
CRAWLED_DATA_PATH = "CrawledData/"
LinksType = List[str]

def get_all_links(url: str) -> LinksType:
    """
    Sends an http request to the given url, downloads the page
    and extracts all links from it

    Args:
        url: url of the website we want to search

    Returns:
        list of all links found on the page
    """
    # create response object
    r = requests.get(url)

    # create beautiful-soup object
    soup = BeautifulSoup(r.content, 'html5lib')
    links = []

    for link in soup.find_all('a'):
        href = link.get('href')
        # skip anchors without a href attribute so None never ends up in the list
        if href is not None:
            links.append(href)

    return links
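
# Illustrative only (a hedged example, not data from this project): on a page whose
# anchors are <a href="/data/a.csv"> and <a href="index.html">, get_all_links would
# return ["/data/a.csv", "index.html"].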

def filter_links(links: LinksType, regex: str) -> LinksType:
    """
    Filters a list of links using regex

    Args:
        links: list of links
        regex: regex used for filtering

    Returns:
        filtered list of links
    """
    filtered_links = []

    for link in links:
        if re.search(regex, link):
            filtered_links.append(link)

    return filtered_links
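
# Illustrative example of filter_links (a hedged sketch; the links and the regex
# below are made-up placeholders, not values taken from this project):
#
#     filter_links(["/files/2020.zip", "/about.html"], "zip$")
#     # -> ["/files/2020.zip"]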

def create_absolute_links(links: LinksType, archive: str) -> LinksType:
    """
    Prepends the archive url to every relative link in links

    Args:
        links: list of relative links
        archive: archive url

    Returns:
        list of absolute links
    """
    absolute_links = []

    for link in links:
        absolute_links.append(archive + link)

    return absolute_links

def remove_downloaded_links(links: LinksType, dataset_name: str) -> LinksType:
    """
    Removes links that have already been downloaded, based on the ignore set
    of the dataset loaded through database_record_logs

    Args:
        links: list of links
        dataset_name: name of dataset that has existing configuration file

    Returns:
        list of links without the already downloaded ones
    """
    downloaded_links = database_record_logs.load_ignore_set_links(dataset_name)
    final_links = set(links) - downloaded_links

    # convert back to a list so the return value matches LinksType
    return list(final_links)

def download_file_from_url(url: str, dataset_name: str) -> None:
    """
    Downloads the file from the provided url and saves it
    to the dataset folder inside CRAWLED_DATA_PATH

    Args:
        url: url of the file we want to download
        dataset_name: name of dataset that has existing configuration file
    """
    r = requests.get(url, stream=True)

    # split the url and extract the last part, which contains the filename
    file_name = url.split("/")[-1]

    data_path = CRAWLED_DATA_PATH + dataset_name + '/'

    # download the file chunk by chunk so large files fit in memory
    with open(data_path + file_name, "wb") as file:
        for chunk in r.iter_content(chunk_size=1024):
            # write one chunk at a time to the file
            if chunk:
                file.write(chunk)

    # after a successful download, update the list of already downloaded files
    database_record_logs.update_ignore_set_links(dataset_name, url)
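
# The block below is an illustrative, hedged sketch of how these helpers might be
# chained together; the archive url, the regex and the dataset name are hypothetical
# placeholders, not values used by this project.
if __name__ == "__main__":
    archive_url = "https://example.com/archive/"   # hypothetical archive url
    dataset = "EXAMPLE_DATASET"                    # hypothetical dataset name

    found_links = get_all_links(archive_url)
    zip_links = filter_links(found_links, "zip$")
    absolute_links = create_absolute_links(zip_links, archive_url)
    new_links = remove_downloaded_links(absolute_links, dataset)

    for link in new_links:
        download_file_from_url(link, dataset)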