import re

import requests
from bs4 import BeautifulSoup

from Utilities import FolderProcessor

# Path to crawler logs
CRAWLER_LOGS_PATH = "CrawlerLogs/"
# Path to crawled data
CRAWLED_DATA_PATH = "CrawledData/"


def get_all_links(url):
    """
    Sends an HTTP request to the url, downloads the page
    and extracts all links from it

    Args:
        url: url of the website we want to search

    Returns:
        list of all links found on the page
    """
    # create response object
    r = requests.get(url)

    # create beautiful-soup object
    soup = BeautifulSoup(r.content, 'html5lib')
    links = []

    # collect the href attribute of every anchor tag, skipping anchors without one
    for link in soup.find_all('a'):
        href = link.get('href')
        if href:
            links.append(href)

    return links


def filter_links(links, regex):
    """
    Filters list of links using regex

    Args:
        links: list of links
        regex: regex used for filtering

    Returns:
        filtered list of links
    """
    filtered_links = []

    for link in links:
        if re.search(regex, link):
            filtered_links.append(link)

    return filtered_links


def create_absolute_links(links, archive):
    """
    Prepends the archive url to every link in links

    Args:
        links: list of relative links
        archive: archive url

    Returns:
        list of absolute links
    """
    absolute_links = []

    for link in links:
        absolute_links.append(archive + link)

    return absolute_links


def remove_downloaded_links(links, dataset_name):
    """
    Loads already downloaded links from the ignore.txt in CRAWLER_LOGS_PATH
    and removes them from links

    Args:
        links: list of links
        dataset_name: name of dataset that has an existing configuration file

    Returns:
        set of links without the already downloaded ones
    """
    downloaded_links = FolderProcessor.load_ignore_set(CRAWLER_LOGS_PATH + dataset_name + "/")
    final_links = set(links) - downloaded_links

    return final_links


def download_file_from_url(url, dataset_name):
    """
    Downloads the file on the provided url and saves it to the dataset's
    folder in CRAWLED_DATA_PATH

    Args:
        url: url of the file we want to download
        dataset_name: name of dataset that has an existing configuration file
    """
    r = requests.get(url, stream=True)

    # split the url and extract the last part, which contains the filename
    url_parts = url.split("/")
    file_name = url_parts[-1]

    log_path = CRAWLER_LOGS_PATH + dataset_name + '/'
    data_path = CRAWLED_DATA_PATH + dataset_name + '/'

    # download the file chunk by chunk so we can handle large files
    with open(data_path + file_name, "wb") as file:
        for chunk in r.iter_content(chunk_size=1024):
            # write one chunk at a time to the file
            if chunk:
                file.write(chunk)

    # after a successful download update the list of already downloaded files
    FolderProcessor.update_ignore_set(log_path, url)
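

# Example usage sketch: the archive url, regex and dataset name below are
# illustrative assumptions, not values taken from a real configuration.
if __name__ == "__main__":
    example_archive = "https://example.org/archive/"    # hypothetical archive url
    example_dataset = "ExampleDataset"                   # hypothetical dataset name

    all_links = get_all_links(example_archive)
    zip_links = filter_links(all_links, r"\.zip$")       # keep only .zip files
    absolute_links = create_absolute_links(zip_links, example_archive)
    new_links = remove_downloaded_links(absolute_links, example_dataset)

    for link in new_links:
        download_file_from_url(link, example_dataset)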