1 |
04a2b5a4
|
petrh
|
import requests
|
2 |
|
|
import re
|
3 |
d6ca840d
|
petrh
|
from Utilities import folder_processor
|
4 |
|
|
from Utilities.Database import database_record_logs
|
5 |
04a2b5a4
|
petrh
|
from bs4 import BeautifulSoup
|
6 |
af7609b5
|
Tomáš Ballák
|
from typing import List
|
7 |
04a2b5a4
|
petrh
|
|
8 |
|
|
# Path to crawler logs
|
9 |
|
|
CRAWLER_LOGS_PATH = "CrawlerLogs/"
|
10 |
1187e871
|
petrh
|
# Path to crawled data
|
11 |
|
|
CRAWLED_DATA_PATH = "CrawledData/"
|
12 |
af7609b5
|
Tomáš Ballák
|
LinksType = List[str]
|
13 |
04a2b5a4
|
petrh
|
|
14 |
|
|
|
15 |
af7609b5
|
Tomáš Ballák
|
def get_all_links(url: str) -> LinksType:
    """
    Sends http request to url, downloads all data,
    extracts links from every <a> tag

    Args:
        url: url of website we want to search

    Returns:
        list of all link targets (href values) found on the page
    """
    # create response object
    r = requests.get(url)

    # create beautiful-soup object
    soup = BeautifulSoup(r.content, 'html5lib')

    links = []

    for link in soup.findAll('a'):
        href = link.get('href')
        # BUGFIX: <a> tags without an href attribute yield None from
        # link.get('href'); a None entry would later crash re.search in
        # filter_links, so skip those anchors entirely
        if href is not None:
            links.append(href)

    return links
|
37 |
|
|
|
38 |
|
|
|
39 |
af7609b5
|
Tomáš Ballák
|
def filter_links(links: LinksType, regex: str) -> LinksType:
    """
    Filters list of links using regex

    Args:
        links: list of links
        regex: regex used for filtering

    Returns:
        filtered list of links (only those the regex matches somewhere)
    """
    # compile the pattern once instead of re-resolving it for every link
    pattern = re.compile(regex)

    return [link for link in links if pattern.search(link)]
|
57 |
|
|
|
58 |
|
|
|
59 |
af7609b5
|
Tomáš Ballák
|
def create_absolute_links(links: LinksType, archive: str) -> LinksType:
    """
    Prepends the archive url to every relative link

    Args:
        links: list of relative links
        archive: archive url

    Returns:
        list of absolute links
    """
    return [archive + link for link in links]
|
75 |
|
|
|
76 |
|
|
|
77 |
af7609b5
|
Tomáš Ballák
|
def remove_downloaded_links(links: LinksType, dataset_name: str) -> LinksType:
    """
    Loads already downloaded links from the database log
    and removes them (and duplicates) from links

    Args:
        links: list of links
        dataset_name: name of dataset that has existing configuration file

    Returns:
        List of links without already downloaded links
    """
    downloaded_links = database_record_logs.load_ignore_set_links(dataset_name)

    # BUGFIX: the annotation promises LinksType (a list) but the original
    # returned a set (set(links) - downloaded_links), which also scrambled
    # the input order; keep the set-based de-duplication but return an
    # ordered list instead
    seen = set()
    final_links = []
    for link in links:
        if link not in downloaded_links and link not in seen:
            seen.add(link)
            final_links.append(link)

    return final_links
|
91 |
|
|
|
92 |
|
|
|
93 |
af7609b5
|
Tomáš Ballák
|
def download_file_from_url(url: str, dataset_name: str) -> None:
    """
    Downloads file on provided url and saves it to the dataset's
    folder under CRAWLED_DATA_PATH

    Args:
        url: url of file we want to download
        dataset_name: name of dataset that has existing configuration file
    """
    # BUGFIX: a streamed response holds its connection open until closed;
    # the context manager guarantees it is released even if writing fails
    with requests.get(url, stream=True) as r:

        # splits url and extract last part that contains filename
        file_name = url.split("/")[-1]

        data_path = CRAWLED_DATA_PATH + dataset_name + '/'

        # download file chunk by chunk so we can download large files
        with open(data_path + file_name, "wb") as file:
            for chunk in r.iter_content(chunk_size=1024):

                # writing one chunk at a time to file
                # (empty keep-alive chunks are skipped)
                if chunk:
                    file.write(chunk)

    # after successful download update list of already downloaded files
    database_record_logs.update_ignore_set_links(dataset_name, url)
|