import re

import requests
from bs4 import BeautifulSoup

from Utilities import FolderProcessor

# Path to crawler logs
CRAWLER_LOGS_PATH = "CrawlerLogs/"

# Path to crawled data
CRAWLED_DATA_PATH = "CrawledData/"

def get_all_links(url):
    """
    Sends an HTTP request to url, downloads the page
    and extracts all links from it

    Args:
        url: url of the website we want to search

    Returns:
        list of all links found on the page
    """
    # create response object
    r = requests.get(url)

    # create beautiful-soup object
    soup = BeautifulSoup(r.content, 'html5lib')
    links = []

    # collect the href of every anchor tag, skipping anchors without one
    for link in soup.find_all('a'):
        href = link.get('href')
        if href is not None:
            links.append(href)

    return links

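
# Illustrative usage of get_all_links (a sketch; the url below is only an
# example, not an address used by this project):
#
#     links = get_all_links("https://example.com/open-data/")
#     # links now holds the href of every <a> tag on the page,
#     # e.g. "/datasets/2020.zip"
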
def filter_links(links, regex):
    """
    Filters a list of links using a regex

    Args:
        links: list of links
        regex: regex used for filtering

    Returns:
        filtered list of links
    """
    filtered_links = []

    for link in links:
        if re.search(regex, link):
            filtered_links.append(link)

    return filtered_links

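
# Illustrative usage of filter_links (a sketch; the regex is an assumption and
# would normally come from the dataset's configuration):
#
#     zip_links = filter_links(links, r"\.zip$")
#     # keeps only the links that end with ".zip"
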
def create_absolute_links(links, archive):
    """
    Prepends the archive url to every link in links

    Args:
        links: list of relative links
        archive: archive url

    Returns:
        list of absolute links
    """
    absolute_links = []

    for link in links:
        absolute_links.append(archive + link)

    return absolute_links

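
# Illustrative usage of create_absolute_links (a sketch; the archive url is an
# assumption). A relative href such as "/datasets/2020.zip" becomes a full url:
#
#     absolute_links = create_absolute_links(zip_links, "https://example.com")
#     # -> ["https://example.com/datasets/2020.zip", ...]
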
def remove_downloaded_links(links, dataset_name):
    """
    Loads the already downloaded links from the ignore.txt file
    in CRAWLER_LOGS_PATH and removes them from links

    Args:
        links: list of links
        dataset_name: name of the dataset that has an existing configuration file

    Returns:
        links that have not been downloaded yet
    """
    downloaded_links = FolderProcessor.load_ignore_set(CRAWLER_LOGS_PATH + dataset_name + "/")
    final_links = set(links) - downloaded_links

    return final_links

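
# Illustrative usage of remove_downloaded_links (a sketch; "EXAMPLE_DATASET" is an
# assumed dataset name with an existing CrawlerLogs/EXAMPLE_DATASET/ folder):
#
#     new_links = remove_downloaded_links(absolute_links, "EXAMPLE_DATASET")
#     # note that the result is a set, so the original ordering is not preserved
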
def download_file_from_url(url, dataset_name):
    """
    Downloads the file at the provided url and saves it to CRAWLED_DATA_PATH

    Args:
        url: url of the file we want to download
        dataset_name: name of the dataset that has an existing configuration file
    """
    r = requests.get(url, stream=True)

    # split the url and extract the last part, which contains the filename
    url_parts = url.split("/")
    file_name = url_parts[-1]

    log_path = CRAWLER_LOGS_PATH + dataset_name + '/'
    data_path = CRAWLED_DATA_PATH + dataset_name + '/'

    # download the file chunk by chunk so we can handle large files
    with open(data_path + file_name, "wb") as file:
        for chunk in r.iter_content(chunk_size=1024):

            # write one chunk at a time to the file
            if chunk:
                file.write(chunk)

    # after a successful download update the list of already downloaded files
    FolderProcessor.update_ignore_set(log_path, url)
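
# Minimal end-to-end sketch of how the functions above fit together. The archive
# url, the regex and the dataset name are assumptions for illustration only, and
# the CrawlerLogs/<dataset>/ and CrawledData/<dataset>/ folders are expected to
# exist, since download_file_from_url does not create them.
if __name__ == "__main__":
    archive_url = "https://example.com"
    dataset = "EXAMPLE_DATASET"

    links = get_all_links(archive_url + "/open-data/")
    links = filter_links(links, r"\.zip$")
    links = create_absolute_links(links, archive_url)
    links = remove_downloaded_links(links, dataset)

    for link in links:
        download_file_from_url(link, dataset)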