Revision c8f3051b
Added by Petr Hlaváč almost 5 years ago

.gitignore

.idea
dev-dump
*.lock
.vscode-server
*__pycache__*
*.CSV
python-module/venv

python-module/CrawledData/KOLOBEZKY/ignore.txt

ignore.txt
OD_ZCU_KOLOBEZKY_07_2019.CSV
OD_ZCU_KOLOBEZKY_06_2019.CSV
OD_ZCU_KOLOBEZKY_00_2019.CSV
OD_ZCU_KOLOBEZKY_08_2019.CSV

python-module/CrawlerLogs/KOLOBEZKY/ignore.txt

https://openstore.zcu.cz/OD_ZCU_00_2019/OD_ZCU_KOLOBEZKY_00_2019_CSV.zip
https://openstore.zcu.cz/OD_ZCU_06_2019/OD_ZCU_KOLOBEZKY_06_2019_CSV.zip
https://openstore.zcu.cz/OD_ZCU_07_2019/OD_ZCU_KOLOBEZKY_07_2019_CSV.zip
https://openstore.zcu.cz/OD_ZCU_08_2019/OD_ZCU_KOLOBEZKY_08_2019_CSV.zip

python-module/DatasetConfigs/KOLOBEZKY.yaml

# name of the dataset, under which it is shown in the application
dataset-name: KOLOBEZKY
# positions of the individual devices contained in the dataset
devices:
  - stojan-knihovna:
      x: 13.353319
      y: 49.725145

  - stojan-Machovka:
      x: 13.368016
      y: 49.725197

  - stojan-FDU:
      x: 13.347909
      y: 49.725360

  - stojan-bory:
      x: 13.350861
      y: 49.724576

  - stojan-zcu:
      x: 13.365958
      y: 49.726033

  - stojan-borska:
      x: 13.359475
      y: 49.734518

# root folder that contains the links to the dataset
url: https://openstore.zcu.cz/
# optional parameter that specifies the name pattern of the dataset files to download
regex: OD_ZCU_KOLOBEZKY_[0-9][0-9]_[0-9][0-9][0-9][0-9]_CSV.zip
# optional parameter that sets how often to look for new datasets; if empty, a default value is used (days)
update-period: 24

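For orientation, a minimal sketch (not part of this revision) of how a config like KOLOBEZKY.yaml is consumed: each entry of the devices list is a one-key mapping, and Utilities/ConfigureLoader.py further down merges them into a single dict keyed by device name. The YAML string below is only a shortened excerpt.

import yaml  # PyYAML is pinned in python-module/requirements.txt

excerpt = """
dataset-name: KOLOBEZKY
devices:
  - stojan-knihovna:
      x: 13.353319
      y: 49.725145
  - stojan-bory:
      x: 13.350861
      y: 49.724576
url: https://openstore.zcu.cz/
"""

data = yaml.load(excerpt, Loader=yaml.FullLoader)

devices = dict()
for item in data["devices"]:          # each list item is a one-key dict
    devices.update(item)
data["devices"] = devices

print(data["devices"]["stojan-bory"]["y"])   # 49.724576
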
python-module/DatasetCrawler/KOLOBEZKYCrawler.py

from Utilities import FolderProcessor
from Utilities.Crawler import BasicCrawler


def crawl(config):
    dataset_name = config["dataset-name"]
    url = config['url']
    regex = config['regex']

    # collect links to the monthly OD_ZCU_* folders on the archive root
    first_level_links = BasicCrawler.get_all_links(url)
    filtered_first_level_links = BasicCrawler.filter_links(first_level_links, "^OD_ZCU")
    absolute_first_level_links = BasicCrawler.create_absolute_links(filtered_first_level_links, url)

    files = []

    # inside each folder, keep the zips matching the configured regex and skip already downloaded ones
    for link in absolute_first_level_links:
        second_level_links = BasicCrawler.get_all_links(link)
        filtered_second_level_links = BasicCrawler.filter_links(second_level_links, regex)
        absolute_second_level_links = BasicCrawler.create_absolute_links(filtered_second_level_links, link)
        final_links = BasicCrawler.remove_downloaded_links(absolute_second_level_links, dataset_name)

        for file_link in final_links:
            files.append(file_link)

    # download everything new and unpack the csv zips
    for file in files:
        BasicCrawler.download_file_from_url(file, "CrawledData/" + dataset_name + "/", dataset_name)

    FolderProcessor.unzip_all_csv_zip_files_in_folder("CrawledData/" + dataset_name + "/")

python-module/DatasetProcessing/KOLOBEZKYProcessor.py

from Utilities.CSV import CSVDataLine, CSVutils
from Utilities import DateFormating


def process_file(filename):
    with open(filename, "r") as file:
        date_dict = dict()

        # aggregate occurrences per formatted timestamp and stand name
        for line in file:
            array = line.split(";")

            date = DateFormating.date_time_formater(array[0][1:-1])
            name = array[1][1:-1]

            if date not in date_dict:
                date_dict[date] = dict()

            if name in date_dict[date]:
                date_dict[date][name].occurence += 1
            else:
                date_dict[date][name] = CSVDataLine.CSVDataLine(name, date, 1)

    CSVutils.export_data_to_csv(filename, date_dict)

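process_file above collapses a raw CSV into per-timestamp occurrence counts per stand. A hedged, self-contained sketch of that aggregation follows; the sample lines and the column layout are assumptions for illustration, not taken from the real OD_ZCU_KOLOBEZKY data.

# Counting sketch: quoted timestamp in column 0, quoted stand name in column 1
# (as the [1:-1] slicing above suggests); the sample lines are made up.
sample_lines = [
    '"03.06.2019 07:15:00";"stojan-knihovna"',
    '"03.06.2019 07:40:00";"stojan-knihovna"',
    '"03.06.2019 08:05:00";"stojan-bory"',
]

counts = dict()                   # formatted date -> stand name -> occurrence count
for line in sample_lines:
    array = line.split(";")
    date = array[0][1:-1][:13]    # crude stand-in for DateFormating.date_time_formater
    name = array[1][1:-1]
    counts.setdefault(date, dict())
    counts[date][name] = counts[date].get(name, 0) + 1

print(counts)
# {'03.06.2019 07': {'stojan-knihovna': 2}, '03.06.2019 08': {'stojan-bory': 1}}
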
python-module/Pipeline.py

from Utilities import FolderProcessor, ConfigureLoader
from Utilities.Database import DatabaseLoader


CONFIG_FILES_PATH = "DatasetConfigs/"
CRAWLED_DATA_PATH = "CrawledData/"
PROCESSED_DATA_PATH = "ProcessedData/"
CRAWLER_LIB_PATH = "DatasetCrawler."
PROCESSOR_LIB_PATH = "DatasetProcessing."


def crawl_data(config):
    dataset_name = config["dataset-name"]

    # dynamically import DatasetCrawler/<dataset>Crawler.py and run its crawl(config)
    my_function = __import__(CRAWLER_LIB_PATH + dataset_name + "Crawler", globals(), locals(), ['crawl']).crawl
    my_function(config)


def process_data(dataset_name):
    dataset_path = dataset_name + '/'

    # dynamically import DatasetProcessing/<dataset>Processor.py
    process_file_func = __import__(PROCESSOR_LIB_PATH + dataset_name + "Processor", globals(), locals(),
                                   ['process_file']).process_file

    # get all not yet processed files from the dataset
    not_processed_files = FolderProcessor.list_of_all_files(CRAWLED_DATA_PATH + dataset_path)

    # process every file
    for not_processed_file in not_processed_files:
        # call processing for one file in the dataset
        process_file_func(CRAWLED_DATA_PATH + dataset_path + not_processed_file)
        FolderProcessor.update_ignore_set(CRAWLED_DATA_PATH + dataset_path, not_processed_file)


def load_data_to_database(config):
    dataset_name = config["dataset-name"]
    dataset_path = dataset_name + '/'

    # get all not yet loaded files from the dataset
    not_loaded_files = FolderProcessor.list_of_all_files(PROCESSED_DATA_PATH + dataset_path)

    # load every file
    for not_loaded_file in not_loaded_files:
        # read the processed data
        processed_data = DatabaseLoader.get_data_from_file(PROCESSED_DATA_PATH + dataset_path + not_loaded_file,
                                                           config["devices"])
        # push the processed data into the database
        DatabaseLoader.load_data_to_database(dataset_name, processed_data)
        FolderProcessor.update_ignore_set(PROCESSED_DATA_PATH + dataset_path, not_loaded_file)


def run_full_pipeline(dataset_name):
    config = ConfigureLoader.load_configuration(CONFIG_FILES_PATH + dataset_name)
    crawl_data(config)
    process_data(config["dataset-name"])
    load_data_to_database(config)

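run_full_pipeline takes the file name of a config in DatasetConfigs/, which is exactly what main.py below passes in. A minimal usage sketch:

import Pipeline

# Runs crawl -> process -> load for one dataset, the same way main.py
# does for every file found in DatasetConfigs/.
Pipeline.run_full_pipeline("KOLOBEZKY.yaml")
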
python-module/ProcessedData/KOLOBEZKY/ignore.txt

ignore.txt
OD_ZCU_KOLOBEZKY_07_2019.CSV
OD_ZCU_KOLOBEZKY_06_2019.CSV
OD_ZCU_KOLOBEZKY_00_2019.CSV
OD_ZCU_KOLOBEZKY_08_2019.CSV

python-module/Scripts/PrepareNewDataset.py

import os

CRAWLED_DATA_PATH = "../CrawledData/"
PROCESSED_DATA_PATH = "../ProcessedData/"
CRAWLER_LOGS_PATH = "../CrawlerLogs/"
CRAWLER_PROGRAM_PATH = "../DatasetCrawler"
PROCESSOR_PROGRAM_PATH = "../DatasetProcessing"
CONFIG_FILES_PATH = "../DatasetConfigs"


def create_default_config_file(dataset_name):
    with open(CONFIG_FILES_PATH + "/" + dataset_name + ".yaml", "w") as file:
        file.write("# name of the dataset, under which it is shown in the application\n")
        file.write("dataset-name: " + dataset_name + "\n")
        file.write("# positions of the individual devices contained in the dataset\n")
        file.write("devices:\n")
        file.write("  - example1:\n")
        file.write("      x: 12.3\n")
        file.write("      y: 32.1\n")
        file.write("\n")
        file.write("  - example2:\n")
        file.write("      x: 32.1\n")
        file.write("      y: 12.3\n")
        file.write("\n")
        file.write("# root folder that contains the links to the dataset\n")
        file.write("url: INSERT URL HERE/\n")
        file.write("# optional parameter that specifies the name pattern of the dataset files to download\n")
        file.write("regex: INSERT REGEX HERE\n")
        file.write("# optional parameter that sets how often to look for new datasets; "
                   "if empty, a default value is used (days)\n")
        file.write("update-period: INSERT VALUE HERE\n")


def create_default_processor(dataset_name):
    with open(PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "Processor.py", "w") as file:
        file.write("def process_file(filename):\n")
        file.write("    print(\"You must implement the process_file method first!\")\n")


def create_default_crawler(dataset_name):
    with open(CRAWLER_PROGRAM_PATH + "/" + dataset_name + "Crawler.py", "w") as file:
        file.write("def crawl(config):\n")
        file.write("    print(\"You must implement the crawl method first!\")\n")


def create_ignore_file(path, text):
    with open(path + "/ignore.txt", "w") as file:
        if text is not None:
            file.write(text + "\n")


def prepare_dataset_structure(dataset_name):
    # create folder for crawled data
    try:
        path = CRAWLED_DATA_PATH + dataset_name
        os.mkdir(path)
        create_ignore_file(path, "ignore.txt")
    except OSError as e:
        print(e)
        print("Creation of the directory %s failed" % path)

    # create folder for processed data
    try:
        path = PROCESSED_DATA_PATH + dataset_name
        os.mkdir(path)
        create_ignore_file(path, "ignore.txt")
    except OSError:
        print("Creation of the directory %s failed" % path)

    # create folder for crawler logs
    try:
        path = CRAWLER_LOGS_PATH + dataset_name
        os.mkdir(path)
        create_ignore_file(path, None)
    except OSError:
        print("Creation of the directory %s failed" % path)

    create_default_crawler(dataset_name)
    create_default_processor(dataset_name)
    create_default_config_file(dataset_name)


prepare_dataset_structure("JIS")

python-module/Utilities/CSV/CSVDataLine.py

class CSVDataLine:

    def __init__(self, name, date, occurence):
        self.name = name
        self.date = date
        self.occurence = occurence

    def to_csv(self):
        return self.name + ";" + str(self.occurence) + ";" + self.date

---|---|---|
1 |
PROCESSED_DATA_PATH = "ProcessedData/" |
|
2 |
|
|
3 |
def get_unique_names_from_file(filename, column_number): |
|
4 |
f = open(filename, "r") |
|
5 |
|
|
6 |
# create set of unique names |
|
7 |
name_set = set() |
|
8 |
|
|
9 |
# go through every line of line |
|
10 |
for x in f: |
|
11 |
# split by csv splitter ; |
|
12 |
array = x.split(";") |
|
13 |
# add string from chosen column to set |
|
14 |
name_set.add(array[column_number]) |
|
15 |
|
|
16 |
f.close() |
|
17 |
|
|
18 |
return name_set |
|
19 |
|
|
20 |
|
|
21 |
def export_data_to_csv(filename, data_dict): |
|
22 |
with open(PROCESSED_DATA_PATH + filename[12:], "w+") as file: |
|
23 |
|
|
24 |
for date in data_dict: |
|
25 |
for data in data_dict[date]: |
|
26 |
file.write(data_dict[date][data].to_csv() + '\n') |
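Each exported line is simply CSVDataLine.to_csv() from the class above, i.e. name;occurence;date. A small illustrative example (the date value is only a stand-in for whatever date_time_formater returns):

from Utilities.CSV import CSVDataLine

line = CSVDataLine.CSVDataLine("stojan-knihovna", "0306201907", 2)
print(line.to_csv())   # stojan-knihovna;2;0306201907
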
python-module/Utilities/ConfigureLoader.py

import yaml


def load_configuration(configure_file_name):
    with open(configure_file_name) as f:
        data = yaml.load(f, Loader=yaml.FullLoader)

    # collapse the list of one-key device dicts into a single dict keyed by device name
    devices_dic = dict()

    for item in data["devices"]:
        devices_dic.update(item)

    data["devices"] = devices_dic

    return data

python-module/Utilities/Crawler/BasicCrawler.py

import requests
import re
from Utilities import FolderProcessor
from bs4 import BeautifulSoup


def get_all_links(url):
    # create response object
    r = requests.get(url)

    # create beautiful-soup object
    soup = BeautifulSoup(r.content, 'html5lib')
    links = []

    for link in soup.findAll('a'):
        links.append(link.get('href'))

    return links


def filter_links(links, regex):
    filtered_links = []

    for link in links:
        if re.search(regex, link):
            filtered_links.append(link)

    return filtered_links


def create_absolute_links(links, archive):
    absolute_links = []

    for link in links:
        absolute_links.append(archive + link)

    return absolute_links


def remove_downloaded_links(links, dataset_name):
    # drop every link already recorded in CrawlerLogs/<dataset>/ignore.txt
    downloaded_links = FolderProcessor.load_ignore_set("CrawlerLogs/" + dataset_name + "/")
    final_links = set(links) - downloaded_links

    return final_links


def download_file_from_url(url, path, dataset_name):
    r = requests.get(url, stream=True)

    url_parts = url.split("/")
    file_name = url_parts[len(url_parts) - 1]

    with open(path + file_name, "wb") as file:
        # write one chunk at a time to the file
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                file.write(chunk)

    FolderProcessor.update_ignore_set("CrawlerLogs/" + dataset_name + "/", url)

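A hedged sketch of how filter_links and create_absolute_links combine on first-level links; the relative hrefs below are made up and only assumed to look like the monthly folders seen in CrawlerLogs above.

from Utilities.Crawler import BasicCrawler

links = ["OD_ZCU_06_2019/", "index.html", "OD_ZCU_07_2019/"]        # made-up hrefs
monthly = BasicCrawler.filter_links(links, "^OD_ZCU")               # keep archive folders only
absolute = BasicCrawler.create_absolute_links(monthly, "https://openstore.zcu.cz/")
print(absolute)
# ['https://openstore.zcu.cz/OD_ZCU_06_2019/', 'https://openstore.zcu.cz/OD_ZCU_07_2019/']
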
python-module/Utilities/DateFormating.py

def date_formater(string_date):
    # when the character at index 11 is a space, replace the character at index 10 with '0'
    if string_date[11].isspace():
        pos = 0
        srr = ""
        for i in string_date:
            if pos == 10:
                srr = srr + '0'
            else:
                srr = srr + i
            pos = pos + 1

        string_date = srr

    # keep the digit groups of the date part (positions 0-1, 3-4, 6-9), dropping the separators
    return_date = string_date[:2] + string_date[3:5] + string_date[6:10]

    return return_date


def date_time_formater(string_date):
    # when the character at index 11 is a space, replace the character at index 10 with '0'
    if string_date[11].isspace():
        pos = 0
        srr = ""
        for i in string_date:
            if pos == 10:
                srr = srr + '0'
            else:
                srr = srr + i
            pos = pos + 1

        string_date = srr

    # keep the digit groups of the date part plus positions 11-12 of the time part
    return_date = string_date[:2] + string_date[3:5] + string_date[6:10] + string_date[11:13]

    return return_date

python-module/Utilities/FolderProcessor.py

import os
import zipfile
from Utilities import ConfigureLoader


def list_of_all_files(path):
    files_in_dir = os.listdir(path)

    ignore_set = load_ignore_set(path)

    return set(files_in_dir).difference(ignore_set)


def load_ignore_set(path):
    ignore_set = set()

    with open(path + "ignore.txt", "r") as file:
        for line in file:
            ignore_set.add(line[:-1])

    return ignore_set


def update_ignore_set(path, file_name):
    with open(path + "ignore.txt", "a") as file:
        file.write(file_name + '\n')


def unzip_all_csv_zip_files_in_folder(folder):
    files_in_dir = os.listdir(folder)
    zips = []

    for file in files_in_dir:
        if file.endswith(".zip"):
            zips.append(folder + file)

    for zip_file in zips:
        with zipfile.ZipFile(zip_file, "r") as unziped_file:
            unziped_file.extractall(folder)

        os.remove(zip_file)

python-module/main.py

import Pipeline
import os

CONFIG_FILES_PATH = "DatasetConfigs/"


def run_pipeline_for_all_datasets():
    files_in_dir = os.listdir(CONFIG_FILES_PATH)

    for file in files_in_dir:
        Pipeline.run_full_pipeline(file)


def run_pipeline_for_one_dataset(dataset_name):
    Pipeline.run_full_pipeline(dataset_name)


run_pipeline_for_all_datasets()

python-module/requirements.txt

beautifulsoup4==4.9.0
certifi==2020.4.5.1
chardet==3.0.4
html5lib==1.0.1
idna==2.9
pymongo==3.10.1
PyYAML==5.3.1
requests==2.23.0
six==1.14.0
soupsieve==2.0
urllib3==1.25.9
webencodings==0.5.1

Re #7924 beta version of datascript