import re

import requests
from bs4 import BeautifulSoup

from Utilities import FolderProcessor


def get_all_links(url):
    # fetch the page
    r = requests.get(url)

    # parse the response body with BeautifulSoup
    soup = BeautifulSoup(r.content, 'html5lib')
    links = []

    # collect the href of every anchor, skipping anchors without one
    for link in soup.find_all('a'):
        href = link.get('href')
        if href:
            links.append(href)

    return links


def filter_links(links, regex):
    filtered_links = []

    # keep only the links that match the given regex
    for link in links:
        if re.search(regex, link):
            filtered_links.append(link)

    return filtered_links


def create_absolute_links(links, archive):
    absolute_links = []

    # prefix each relative link with the archive base URL
    for link in links:
        absolute_links.append(archive + link)

    return absolute_links


def remove_downloaded_links(links, dataset_name):
    # drop links already recorded in the crawler's ignore set
    downloaded_links = FolderProcessor.load_ignore_set("CrawlerLogs/" + dataset_name + "/")
    final_links = set(links) - downloaded_links

    return final_links


def download_file_from_url(url, path, dataset_name):
    # stream the response so large files are not held in memory all at once
    r = requests.get(url, stream=True)

    # use the last path segment of the URL as the file name
    url_parts = url.split("/")
    file_name = url_parts[-1]

    with open(path + file_name, "wb") as file:
        # write the download one chunk at a time, skipping keep-alive chunks
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                file.write(chunk)

    # record the URL so the crawler does not fetch it again
    FolderProcessor.update_ignore_set("CrawlerLogs/" + dataset_name + "/", url)
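

# A minimal usage sketch of the functions above, assuming a hypothetical
# archive URL, regex, and dataset name (none of these come from the original
# module). It also assumes the "Downloads/" and "CrawlerLogs/example_dataset/"
# folders already exist and that FolderProcessor is importable.
if __name__ == "__main__":
    archive = "https://example.com"                      # hypothetical base URL
    links = get_all_links(archive + "/index.html")
    pdf_links = filter_links(links, r"\.pdf$")           # keep only .pdf links
    absolute_links = create_absolute_links(pdf_links, archive)
    new_links = remove_downloaded_links(absolute_links, "example_dataset")

    for link in new_links:
        download_file_from_url(link, "Downloads/", "example_dataset")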