Projekt

Obecné

Profil

Stáhnout (2.86 KB) Statistiky
| Větev: | Revize:
1
import requests
2
import re
3
from Utilities import folder_processor
4
from Utilities.Database import database_record_logs
5
from bs4 import BeautifulSoup
6

    
7
# Path to crawler logs
8
CRAWLER_LOGS_PATH = "CrawlerLogs/"
9
# Path to crawled data
10
CRAWLED_DATA_PATH = "CrawledData/"
11

    
12

    
13
def get_all_links(url):
14
    """
15
    Sends http request to url, downloads all data,
16
    extract links
17

    
18
    Args:
19
        url: url of website we want to search
20

    
21
    Returns:
22
        list of all links
23
    """
24
    # create response object
25
    r = requests.get(url)
26

    
27
    # create beautiful-soup object
28
    soup = BeautifulSoup(r.content, 'html5lib')
29
    links = []
30

    
31
    for link in soup.findAll('a'):
32
        links.append(link.get('href'))
33

    
34
    return links
35

    
36

    
37
def filter_links(links, regex):
38
    """
39
    Filters list of links using regex
40

    
41
    Args:
42
        links: list of links
43
        regex: regex used for filtering
44

    
45
    Returns:
46
        filtered list of links
47
    """
48
    filtered_links = []
49

    
50
    for link in links:
51
        if re.search(regex, link):
52
            filtered_links.append(link)
53

    
54
    return filtered_links
55

    
56

    
57
def create_absolute_links(links, archive):
58
    """
59
        Appends archive path to every link in links
60
    Args:
61
        links: list of relative links
62
        archive: archive url
63

    
64
    Returns:
65
        list of absolute links
66
    """
67
    absolute_links = []
68

    
69
    for link in links:
70
        absolute_links.append(archive + link)
71

    
72
    return absolute_links
73

    
74

    
75
def remove_downloaded_links(links, dataset_name):
76
    """
77
    Loads already downloaded links from CRAWLER_LOGS_PATH ignore.txt
78
    Args:
79
        links: list of links
80
        dataset_name: name of dataset that has existing configuration file
81

    
82
    Returns:
83
        List of links without already downloaded links
84
    """
85
    downloaded_links = database_record_logs.load_ignore_set_links(dataset_name)
86
    final_links = set(links) - downloaded_links
87

    
88
    return final_links
89

    
90

    
91
def download_file_from_url(url, dataset_name):
92
    """
93
    Downloads file on provided url and saves it to path
94
    Args:
95
        url: url file we want to download
96
        dataset_name: name of dataset that has existing configuration file
97
    """
98
    r = requests.get(url, stream=True)
99

    
100
    # splits url and extract last part that contains filename
101
    url_parts = url.split("/")
102
    file_name = url_parts[len(url_parts)-1]
103

    
104
    data_path = CRAWLED_DATA_PATH + dataset_name + '/'
105

    
106
    # download file chunk by chunk so we can download large files
107
    with open(data_path + file_name, "wb") as file:
108
        for chunk in r.iter_content(chunk_size=1024):
109

    
110
            # writing one chunk at a time to file
111
            if chunk:
112
                file.write(chunk)
113

    
114
    # after successful download update list of already downloaded files
115
    database_record_logs.update_ignore_set_links(dataset_name, url)
    (1-1/1)