1 |
04a2b5a4
|
petrh
|
import requests
|
2 |
|
|
import re
|
3 |
d6ca840d
|
petrh
|
from Utilities import folder_processor
|
4 |
|
|
from Utilities.Database import database_record_logs
|
5 |
04a2b5a4
|
petrh
|
from bs4 import BeautifulSoup
|
6 |
af7609b5
|
Tomáš Ballák
|
from typing import List
|
7 |
04a2b5a4
|
petrh
|
|
8 |
|
|
# Path to crawler logs
|
9 |
|
|
CRAWLER_LOGS_PATH = "CrawlerLogs/"
|
10 |
1187e871
|
petrh
|
# Path to crawled data
|
11 |
|
|
CRAWLED_DATA_PATH = "CrawledData/"
|
12 |
af7609b5
|
Tomáš Ballák
|
LinksType = List[str]
|
13 |
04a2b5a4
|
petrh
|
|
14 |
|
|
|
15 |
af7609b5
|
Tomáš Ballák
|
def get_all_links(url: str) -> LinksType:
    """
    Sends http request to url, downloads all data,
    extracts links from every <a> tag

    Args:
        url: url of website we want to search

    Returns:
        list of all link targets (href values) found on the page
    """
    # create response object
    r = requests.get(url)

    # create beautiful-soup object
    soup = BeautifulSoup(r.content, 'html5lib')

    links = []

    for link in soup.findAll('a'):
        href = link.get('href')
        # BUGFIX: <a> tags without an href attribute yield None from
        # link.get('href'); a None entry would later crash re.search in
        # filter_links, so skip those anchors entirely
        if href is not None:
            links.append(href)

    return links
|
37 |
|
|
|
38 |
|
|
|
39 |
af7609b5
|
Tomáš Ballák
|
def filter_links(links: LinksType, regex: str) -> LinksType:
    """
    Filters list of links using regex

    Args:
        links: list of links
        regex: regex used for filtering

    Returns:
        filtered list of links (only those the regex matches somewhere)
    """
    # compile the pattern once instead of re-resolving it for every link
    pattern = re.compile(regex)

    return [link for link in links if pattern.search(link)]
|
57 |
|
|
|
58 |
|
|
|
59 |
af7609b5
|
Tomáš Ballák
|
def create_absolute_links(links: LinksType, archive: str) -> LinksType:
    """
    Prepends the archive url to every relative link

    Args:
        links: list of relative links
        archive: archive url

    Returns:
        list of absolute links
    """
    return [archive + link for link in links]
|
75 |
|
|
|
76 |
|
|
|
77 |
af7609b5
|
Tomáš Ballák
|
def remove_downloaded_links(links: LinksType, dataset_name: str) -> LinksType:
    """
    Loads already downloaded links from the database log
    and removes them (and duplicates) from links

    Args:
        links: list of links
        dataset_name: name of dataset that has existing configuration file

    Returns:
        List of links without already downloaded links
    """
    downloaded_links = database_record_logs.load_ignore_set_links(dataset_name)

    # BUGFIX: the annotation promises LinksType (a list) but the original
    # returned a set (set(links) - downloaded_links), which also scrambled
    # the input order; keep the set-based de-duplication but return an
    # ordered list instead
    seen = set()
    final_links = []
    for link in links:
        if link not in downloaded_links and link not in seen:
            seen.add(link)
            final_links.append(link)

    return final_links
|
91 |
|
|
|
92 |
|
|
|
93 |
af7609b5
|
Tomáš Ballák
|
def download_file_from_url(url: str, dataset_name: str) -> None:
    """
    Downloads file on provided url and saves it to the dataset's
    folder under CRAWLED_DATA_PATH

    Args:
        url: url of file we want to download
        dataset_name: name of dataset that has existing configuration file
    """
    # BUGFIX: a streamed response holds its connection open until closed;
    # the context manager guarantees it is released even if writing fails
    with requests.get(url, stream=True) as r:

        # splits url and extract last part that contains filename
        file_name = url.split("/")[-1]

        data_path = CRAWLED_DATA_PATH + dataset_name + '/'

        # download file chunk by chunk so we can download large files
        with open(data_path + file_name, "wb") as file:
            for chunk in r.iter_content(chunk_size=1024):

                # writing one chunk at a time to file
                # (empty keep-alive chunks are skipped)
                if chunk:
                    file.write(chunk)

    # after successful download update list of already downloaded files
    database_record_logs.update_ignore_set_links(dataset_name, url)
|