1 |
c8f3051b
|
petrh
|
import requests
|
2 |
|
|
import re
|
3 |
|
|
from Utilities import FolderProcessor
|
4 |
|
|
from bs4 import BeautifulSoup
|
5 |
|
|
|
6 |
|
|
|
7 |
|
|
def get_all_links(url):
    """Download the page at *url* and return the href of every <a> tag.

    Args:
        url: Address of the HTML page to scrape.

    Returns:
        list: href attribute values, in document order. May contain
        None entries for <a> tags that have no href attribute.
    """
    # create response object
    r = requests.get(url)

    # create beautiful-soup object
    soup = BeautifulSoup(r.content, 'html5lib')

    # find_all is the modern bs4 name; findAll is a deprecated alias
    return [link.get('href') for link in soup.find_all('a')]
19 |
|
|
|
20 |
|
|
|
21 |
|
|
def filter_links(links, regex):
    """Return only the links that match the given regular expression.

    Args:
        links: Iterable of link strings.
        regex: Regular-expression pattern, matched with re.search
               (i.e. anywhere in the link, not anchored).

    Returns:
        list: The matching links, preserving input order.
    """
    # Compile once instead of re-parsing the pattern for every link;
    # also fixes the 'fitlered_links' typo of the original local name.
    pattern = re.compile(regex)
    return [link for link in links if pattern.search(link)]
29 |
|
|
|
30 |
|
|
|
31 |
|
|
def create_absolute_links(links, archive):
    """Prefix every relative link with the archive base address.

    Args:
        links: Iterable of relative link strings.
        archive: Base address to prepend to each link.

    Returns:
        list: The absolute links, in input order.
    """
    return [archive + relative for relative in links]
38 |
|
|
|
39 |
|
|
|
40 |
|
|
def remove_downloaded_links(links, dataset_name):
    """Drop links that are already recorded in the crawler's ignore set.

    Args:
        links: Iterable of candidate links.
        dataset_name: Dataset whose log folder holds the ignore set.

    Returns:
        set: Links not yet downloaded for this dataset.
    """
    log_folder = "CrawlerLogs/" + dataset_name + "/"
    already_downloaded = FolderProcessor.load_ignore_set(log_folder)
    return set(links) - already_downloaded
46 |
|
|
|
47 |
|
|
|
48 |
|
|
def download_file_from_url(url, path, dataset_name):
    """Stream the file at *url* into *path* and record it as downloaded.

    Args:
        url: Direct link to the file; its last path segment becomes
             the local file name.
        path: Destination folder. NOTE(review): must already end with
              a path separator, since the file name is concatenated
              onto it directly.
        dataset_name: Dataset whose crawler ignore set records the
                      completed download.
    """
    r = requests.get(url, stream=True)
    # Fail loudly on HTTP errors instead of silently writing an
    # error page to disk as if it were the requested file.
    r.raise_for_status()

    # Last path segment of the URL is the local file name.
    file_name = url.split("/")[-1]

    with open(path + file_name, "wb") as file:
        for chunk in r.iter_content(chunk_size=1024):
            # Skip keep-alive chunks; write real data as it arrives.
            if chunk:
                file.write(chunk)

    # Mark the URL as downloaded only after the file is fully written.
    FolderProcessor.update_ignore_set("CrawlerLogs/" + dataset_name + "/", url)