import re

import requests
from bs4 import BeautifulSoup

from Utilities import FolderProcessor


def get_all_links(url):
    # download the page and parse it
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html5lib')

    # collect the href attribute of every anchor that has one
    links = []
    for link in soup.find_all('a'):
        href = link.get('href')
        if href is not None:
            links.append(href)

    return links


def filter_links(links, regex):
    # keep only the links that match the given regular expression
    filtered_links = []

    for link in links:
        if re.search(regex, link):
            filtered_links.append(link)

    return filtered_links


def create_absolute_links(links, archive):
    # prepend the archive base URL to every relative link
    absolute_links = []

    for link in links:
        absolute_links.append(archive + link)

    return absolute_links


def remove_downloaded_links(links, dataset_name):
    # drop links that are already recorded in the dataset's ignore set
    downloaded_links = FolderProcessor.load_ignore_set("CrawlerLogs/" + dataset_name + "/")
    final_links = set(links) - downloaded_links

    return final_links


def download_file_from_url(url, path, dataset_name):
    r = requests.get(url, stream=True)

    # the file name is the last segment of the URL
    file_name = url.split("/")[-1]

    with open(path + file_name, "wb") as file:
        # write the response to disk one chunk at a time
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                file.write(chunk)

    # remember the URL so it is not downloaded again
    FolderProcessor.update_ignore_set("CrawlerLogs/" + dataset_name + "/", url)
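

# The block below is not part of the original module; it is a minimal usage
# sketch showing how the helpers above are expected to compose into one crawl
# step. The base URL, regex, dataset name and target path are hypothetical
# placeholders, and it assumes the CrawlerLogs/<dataset>/ ignore set used by
# FolderProcessor already exists.
if __name__ == "__main__":
    archive = "https://example.com/archive/"        # hypothetical base URL
    dataset_name = "EXAMPLE_DATASET"                # hypothetical dataset name
    download_path = "CrawledData/EXAMPLE_DATASET/"  # hypothetical target folder

    links = get_all_links(archive)                        # all anchors on the page
    links = filter_links(links, r"\.csv$")                # e.g. keep only CSV files
    links = create_absolute_links(links, archive)         # relative -> absolute
    links = remove_downloaded_links(links, dataset_name)  # skip already-downloaded

    for link in links:
        download_file_from_url(link, download_path, dataset_name)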