import requests
import re
from Utilities import FolderProcessor
from bs4 import BeautifulSoup

# Path to crawler logs
CRAWLER_LOGS_PATH = "CrawlerLogs/"
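
# Each dataset is expected to have its own subfolder under CRAWLER_LOGS_PATH
# (e.g. "CrawlerLogs/<dataset_name>/") holding its ignore.txt of already
# downloaded links as well as the downloaded files; this layout is inferred
# from the helpers below.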


def get_all_links(url):
    """
    Sends an HTTP request to url, downloads the page
    and extracts all links

    Args:
        url: url of the website we want to search

    Returns:
        list of all links
    """
    # create response object
    r = requests.get(url)

    # create beautiful-soup object
    soup = BeautifulSoup(r.content, 'html5lib')
    links = []

    for link in soup.find_all('a'):
        # skip anchors that have no href attribute
        href = link.get('href')
        if href is not None:
            links.append(href)

    return links


def filter_links(links, regex):
    """
    Filters a list of links using a regex

    Args:
        links: list of links
        regex: regex used for filtering

    Returns:
        filtered list of links
    """
    filtered_links = []

    for link in links:
        if re.search(regex, link):
            filtered_links.append(link)

    return filtered_links
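
# Illustrative example (not part of the original module): a pattern such as
# r"\.zip$" keeps only links that end in ".zip", e.g.
#   filter_links(["data.zip", "index.html"], r"\.zip$")  ->  ["data.zip"]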


def create_absolute_links(links, archive):
    """
    Prepends the archive url to every link in links

    Args:
        links: list of relative links
        archive: archive url

    Returns:
        list of absolute links
    """
    absolute_links = []

    for link in links:
        absolute_links.append(archive + link)

    return absolute_links


def remove_downloaded_links(links, dataset_name):
    """
    Loads already downloaded links from ignore.txt in CRAWLER_LOGS_PATH
    and removes them from links

    Args:
        links: list of links
        dataset_name: name of the dataset that has an existing configuration file

    Returns:
        set of links without the already downloaded ones
    """
    downloaded_links = FolderProcessor.load_ignore_set(CRAWLER_LOGS_PATH + dataset_name + "/")
    final_links = set(links) - downloaded_links

    return final_links


def download_file_from_url(url, dataset_name):
    """
    Downloads the file at the provided url and saves it to the dataset's
    folder in CRAWLER_LOGS_PATH

    Args:
        url: url of the file we want to download
        dataset_name: name of the dataset that has an existing configuration file
    """
    r = requests.get(url, stream=True)

    # split the url and take the last part, which contains the filename
    url_parts = url.split("/")
    file_name = url_parts[-1]

    path = CRAWLER_LOGS_PATH + dataset_name + '/'

    # download the file chunk by chunk so large files fit in memory
    with open(path + file_name, "wb") as file:
        for chunk in r.iter_content(chunk_size=1024):
            # write one chunk at a time, skipping keep-alive chunks
            if chunk:
                file.write(chunk)

    # after a successful download, update the list of already downloaded files
    FolderProcessor.update_ignore_set(CRAWLER_LOGS_PATH + dataset_name + "/", url)
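

# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal end-to-end run of the helpers above. The archive url, dataset
# name, and regex are hypothetical placeholders, and the dataset's folder
# under CRAWLER_LOGS_PATH (with its ignore.txt) is assumed to already exist.
if __name__ == "__main__":
    archive = "https://example.com/archive/"   # hypothetical archive url
    dataset_name = "EXAMPLE_DATASET"           # hypothetical dataset name

    links = get_all_links(archive)
    links = filter_links(links, r"\.csv$")     # e.g. keep only csv files
    links = create_absolute_links(links, archive)
    links = remove_downloaded_links(links, dataset_name)

    for link in links:
        download_file_from_url(link, dataset_name)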