import requests
import re
from Utilities import FolderProcessor
from bs4 import BeautifulSoup

# Path to crawler logs
CRAWLER_LOGS_PATH = "CrawlerLogs/"
# Path to crawled data
CRAWLED_DATA_PATH = "CrawledData/"

def get_all_links(url):
    """
    Sends an http request to url, downloads the page and
    extracts all links from it

    Args:
        url: url of the website we want to search

    Returns:
        list of all links
    """
    # create response object
    r = requests.get(url)

    # create beautiful-soup object
    soup = BeautifulSoup(r.content, 'html5lib')
    links = []

    for link in soup.find_all('a'):
        # skip anchors without an href attribute, which would otherwise add None
        href = link.get('href')
        if href is not None:
            links.append(href)

    return links

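# Usage sketch, with a hypothetical URL shown purely for illustration:
#   links = get_all_links("https://example.com/archive/")
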
def filter_links(links, regex):
    """
    Filters a list of links using a regex

    Args:
        links: list of links
        regex: regex used for filtering

    Returns:
        filtered list of links
    """
    filtered_links = []

    for link in links:
        if re.search(regex, link):
            filtered_links.append(link)

    return filtered_links

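# Usage sketch (the regex is a hypothetical example): keep only links
# that end in .csv:
#   csv_links = filter_links(links, r"\.csv$")
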
def create_absolute_links(links, archive):
    """
    Prepends the archive url to every link in links

    Args:
        links: list of relative links
        archive: archive url

    Returns:
        list of absolute links
    """
    absolute_links = []

    for link in links:
        absolute_links.append(archive + link)

    return absolute_links

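# Usage sketch (hypothetical archive url): "/data/file.csv" becomes
# "https://example.com/data/file.csv". Plain concatenation assumes every
# link starts with "/"; urllib.parse.urljoin would be a more robust
# alternative if that does not hold:
#   absolute = create_absolute_links(csv_links, "https://example.com")
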
def remove_downloaded_links(links, dataset_name):
    """
    Loads already downloaded links from the ignore.txt file in
    CRAWLER_LOGS_PATH and removes them from links

    Args:
        links: list of links
        dataset_name: name of dataset that has an existing configuration file

    Returns:
        set of links without the already downloaded ones
    """
    downloaded_links = FolderProcessor.load_ignore_set(CRAWLER_LOGS_PATH + dataset_name + "/")
    final_links = set(links) - downloaded_links

    return final_links

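# Usage sketch (hypothetical dataset name); note the result is a set,
# so the original ordering of the links is not preserved:
#   new_links = remove_downloaded_links(absolute, "ExampleDataset")
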
def download_file_from_url(url, dataset_name):
    """
    Downloads the file at the provided url and saves it to CRAWLED_DATA_PATH

    Args:
        url: url of the file we want to download
        dataset_name: name of dataset that has an existing configuration file
    """
    r = requests.get(url, stream=True)

    # split the url and take the last part, which contains the filename
    url_parts = url.split("/")
    file_name = url_parts[-1]

    log_path = CRAWLER_LOGS_PATH + dataset_name + '/'
    data_path = CRAWLED_DATA_PATH + dataset_name + '/'

    # download the file chunk by chunk so we can handle large files
    with open(data_path + file_name, "wb") as file:
        for chunk in r.iter_content(chunk_size=1024):

            # write one chunk at a time to the file
            if chunk:
                file.write(chunk)

    # after a successful download, update the list of already downloaded files
    FolderProcessor.update_ignore_set(log_path, url)
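

# Minimal end-to-end sketch tying the helpers together. The dataset name,
# archive url and regex are hypothetical placeholders, and the
# CrawlerLogs/<dataset>/ and CrawledData/<dataset>/ folders are assumed
# to already exist:
if __name__ == "__main__":
    dataset = "ExampleDataset"
    archive = "https://example.com"

    links = get_all_links(archive + "/archive/")
    links = filter_links(links, r"\.csv$")
    links = create_absolute_links(links, archive)
    links = remove_downloaded_links(links, dataset)

    for link in links:
        download_file_from_url(link, dataset)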