Revize 70217608
Přidáno uživatelem Petr Hlaváč před téměř 5 roky(ů)
python-module/CrawledData/JIS/ignore.txt | ||
---|---|---|
1 |
ignore.txt |
python-module/DatasetConfigs/JIS.yaml | ||
---|---|---|
1 |
# jmeno datasetu, pod kterym bude zobrazen v aplikaci |
|
2 |
dataset-name: JIS |
|
3 |
# pozice jednotlivych zarizeni, ktera jsou v datasetu |
|
4 |
devices: |
|
5 |
- example1: |
|
6 |
x: 12.3 |
|
7 |
y: 32.1 |
|
8 |
|
|
9 |
- example2: |
|
10 |
x: 32.1 |
|
11 |
y: 12.3 |
|
12 |
|
|
13 |
# root slozka, ktera obsahuje odkazy na dataset |
|
14 |
url: ZDE VLOZTE URL/ |
|
15 |
# volitelny parametr, ktery specifikuje vzor jmena datasetu, ktere se budou stahovat |
|
16 |
regex: ZDE VLOZTE REGEX |
|
17 |
# volitelny parametr, ktery udava jak casto se budou hledat nove datasety, pokud prazdne, tak defaultni hodnota (dny) |
|
18 |
update-period: ZDE VLOZTE HODNOTU |
python-module/DatasetCrawler/JISCrawler.py | ||
---|---|---|
1 |
from Utilities import FolderProcessor |
|
2 |
from Utilities.Crawler import BasicCrawler |
|
3 |
|
|
4 |
|
|
5 |
def crawl(config):
    """Crawl the JIS dataset source and fetch files not downloaded yet.

    Works in three phases: gather every candidate file link from the
    two-level link listing, download each of them into the dataset's
    ``CrawledData`` folder, and finally unzip the downloaded archives.

    Args:
        config: Mapping read with the keys ``"dataset-name"``, ``"url"``
            and ``"regex"`` (all three are accessed unconditionally).
    """
    name = config["dataset-name"]
    root_url = config['url']
    file_pattern = config['regex']

    # First level: links found at the root URL, narrowed by the fixed
    # "^OD_ZCU" pattern and then made absolute against the root URL.
    top_links = BasicCrawler.get_all_links(root_url)
    top_links = BasicCrawler.filter_links(top_links, "^OD_ZCU")
    folder_urls = BasicCrawler.create_absolute_links(top_links, root_url)

    # Second level: for every first-level link, gather links matching the
    # configured pattern and drop the ones remove_downloaded_links rejects.
    pending = []
    for folder_url in folder_urls:
        candidates = BasicCrawler.get_all_links(folder_url)
        candidates = BasicCrawler.filter_links(candidates, file_pattern)
        candidates = BasicCrawler.create_absolute_links(candidates, folder_url)
        pending.extend(BasicCrawler.remove_downloaded_links(candidates, name))

    # Downloads start only after the complete link list has been gathered.
    target_dir = "CrawledData/" + name + "/"
    for file_url in pending:
        BasicCrawler.download_file_from_url(file_url, target_dir, name)

    FolderProcessor.unzip_all_csv_zip_files_in_folder(target_dir)
python-module/DatasetProcessing/JISProcessor.py | ||
---|---|---|
1 |
def process_file(filename):
    """Placeholder processor for JIS files; no processing is implemented.

    Only prints a reminder that the real implementation is missing.

    Args:
        filename: Accepted for the processor interface; currently unused.
    """
    reminder = "You must implements process_file method first!"
    print(reminder)
python-module/ProcessedData/JIS/ignore.txt | ||
---|---|---|
1 |
ignore.txt |
Také k dispozici: Unified diff
Re #7930 Implementovan crawler pro JIS