Revize 2d129043
Přidáno uživatelem Petr Hlaváč před téměř 5 lety
modules/crawler/.gitignore | ||
---|---|---|
1 | 1 |
*__pycache__* |
2 | 2 |
*.CSV |
3 |
python-module/venv
|
|
3 |
CrawlerLogs/CommonRecords/*
|
modules/crawler/DatasetProcessing/JISProcessor.py | ||
---|---|---|
17 | 17 |
False if not implemented |
18 | 18 |
True when implemented |
19 | 19 |
""" |
20 |
with open(filename, "r", encoding="utf-8") as file:
|
|
20 |
date_dict = dict()
|
|
21 | 21 |
|
22 |
date_dict = dict()
|
|
22 |
with open(filename, "r", encoding="utf-8") as file:
|
|
23 | 23 |
|
24 | 24 |
for line in file: |
25 | 25 |
|
... | ... | |
35 | 35 |
if name in date_dict[date]: |
36 | 36 |
date_dict[date][name].occurrence += int(occurrence) |
37 | 37 |
else: |
38 |
date_dict[date][name] = CSVDataLine.CSVDataLine(name, date, int(occurrence))
|
|
38 |
date_dict[date][name] = CSVDataLine.CSVDataLine(name, date, occurrence)
|
|
39 | 39 |
|
40 |
CSVutils.export_data_to_csv(filename, date_dict) |
|
41 |
return True |
|
40 |
return date_dict |
|
42 | 41 |
|
modules/crawler/DatasetProcessing/KOLOBEZKYProcessor.py | ||
---|---|---|
17 | 17 |
False if not implemented |
18 | 18 |
True when implemented |
19 | 19 |
""" |
20 |
with open(filename, "r") as file:
|
|
20 |
date_dict = dict()
|
|
21 | 21 |
|
22 |
date_dict = dict()
|
|
22 |
with open(filename, "r") as file:
|
|
23 | 23 |
|
24 | 24 |
for line in file: |
25 | 25 |
|
... | ... | |
36 | 36 |
else: |
37 | 37 |
date_dict[date][name] = CSVDataLine.CSVDataLine(name, date, 1) |
38 | 38 |
|
39 |
CSVutils.export_data_to_csv(filename, date_dict) |
|
40 |
return True |
|
39 |
return date_dict |
|
41 | 40 |
|
modules/crawler/DatasetProcessing/WIFIProcessor.py | ||
---|---|---|
17 | 17 |
False if not implemented |
18 | 18 |
True when implemented |
19 | 19 |
""" |
20 |
date_dict = dict() |
|
21 |
|
|
20 | 22 |
with open(filename, "r", encoding="utf-8") as file: |
21 | 23 |
|
22 |
date_dict = dict() |
|
23 |
|
|
24 | 24 |
for line in file: |
25 | 25 |
|
26 | 26 |
array = line.split(";") |
... | ... | |
37 | 37 |
else: |
38 | 38 |
date_dict[date][name] = CSVDataLine.CSVDataLine(name, date, int(occurrence)) |
39 | 39 |
|
40 |
CSVutils.export_data_to_csv(filename, date_dict) |
|
41 |
return True |
|
40 |
return date_dict |
|
42 | 41 |
|
modules/crawler/Pipeline.py | ||
---|---|---|
1 | 1 |
from Utilities import FolderProcessor, ConfigureFunctions |
2 | 2 |
from Utilities.Database import DatabaseLoader |
3 |
from Utilities.CSV import CSVutils |
|
3 | 4 |
|
4 | 5 |
import logging |
5 | 6 |
from datetime import date |
... | ... | |
96 | 97 |
logging.info(dataset_name + " has downloaded " + str(len(not_processed_files)) + " new files") |
97 | 98 |
|
98 | 99 |
for not_processed_file in not_processed_files: |
99 |
process_file_func(CRAWLED_DATA_PATH + dataset_path + not_processed_file) |
|
100 |
path = CRAWLED_DATA_PATH + dataset_path + not_processed_file |
|
101 |
date_dic = process_file_func(path) |
|
102 |
CSVutils.export_data_to_csv(path, date_dic) |
|
100 | 103 |
FolderProcessor.update_ignore_set(CRAWLED_DATA_PATH + dataset_path, not_processed_file) |
101 | 104 |
|
102 | 105 |
logging.info(dataset_name + " has processed " + str(len(not_processed_files)) + " newly crawled files") |
modules/crawler/PrepareNewDataset.py | ||
---|---|---|
48 | 48 |
file.write("\n") |
49 | 49 |
file.write("def process_file(filename):\n") |
50 | 50 |
file.write(" \"\"\"\n") |
51 |
file.write(" Method that take path to crawled file and outputs date dictionary using method:\n") |
|
52 |
file.write(" CSVutils.export_data_to_csv(filename, date_dict)\n") |
|
51 |
file.write(" Method that take path to crawled file and outputs date dictionary:\n") |
|
53 | 52 |
file.write(" Date dictionary is a dictionary where keys are dates in format ddmmYYYYhh (0804201815)\n") |
54 | 53 |
file.write(" and value is dictionary where keys devices (specified in configuration file)\n") |
55 | 54 |
file.write(" and value is CSVDataLine.CSVDataLine with device,date and occurrence\n") |
... | ... | |
58 | 57 |
file.write(" filename: name of processed file\n") |
59 | 58 |
file.write("\n") |
60 | 59 |
file.write(" Returns:\n") |
61 |
file.write(" False if not implemented\n")
|
|
62 |
file.write(" True when implemented\n")
|
|
60 |
file.write(" None if not implemented\n")
|
|
61 |
file.write(" date_dict when implemented\n")
|
|
63 | 62 |
file.write(" \"\"\"\n") |
63 |
file.write(" date_dict = dict()\n") |
|
64 |
file.write("\n") |
|
64 | 65 |
file.write(" #with open(filename, \"r\") as file:\n") |
65 | 66 |
file.write(" print(\"You must implements process_file method first!\")\n") |
66 |
file.write(" #CSVutils.export_data_to_csv(filename, date_dict)\n") |
|
67 |
file.write(" return False\n") |
|
67 |
file.write(" return None\n") |
|
68 | 68 |
|
69 | 69 |
|
70 | 70 |
def create_default_crawler(dataset_name): |
modules/crawler/Utilities/CSV/CSVDataLine.py | ||
---|---|---|
3 | 3 |
Class that specifies the look of data line in processed csv file |
4 | 4 |
prepared for database |
5 | 5 |
""" |
6 |
|
|
6 | 7 |
def __init__(self, name, date, occurrence): |
8 |
try: |
|
9 |
test_val = int(occurrence) |
|
10 |
except ValueError: |
|
11 |
print("Occurence should be and integer value!") |
|
12 |
|
|
13 |
if len(date) != 10: |
|
14 |
raise ValueError("Invalid date format ddmmYYYYhh expected!") |
|
15 |
|
|
7 | 16 |
self.name = name |
8 | 17 |
self.date = date |
9 |
self.occurrence = occurrence
|
|
18 |
self.occurrence = test_val
|
|
10 | 19 |
|
11 | 20 |
def to_csv(self): |
12 | 21 |
return self.name + ";" + str(self.occurrence) + ";" + self.date |
modules/crawler/Utilities/CSV/CSVutils.py | ||
---|---|---|
1 |
import inspect |
|
2 |
from Utilities.CSV import CSVDataLine |
|
3 |
|
|
1 | 4 |
# Path to processed data |
2 | 5 |
PROCESSED_DATA_PATH = "ProcessedData/" |
3 | 6 |
|
... | ... | |
36 | 39 |
with open(PROCESSED_DATA_PATH + filename[12:], "w+") as file: |
37 | 40 |
|
38 | 41 |
for date in data_dict: |
42 |
if len(date) != 10: |
|
43 |
raise ValueError("Invalid date format for key value --> ddmmYYYYhh expected!") |
|
39 | 44 |
for data in data_dict[date]: |
40 |
file.write(data_dict[date][data].to_csv() + '\n') |
|
45 |
csv_line = data_dict[date][data] |
|
46 |
if not isinstance(csv_line,CSVDataLine.CSVDataLine): |
|
47 |
raise ValueError("data_dict is expected to have CSVDataLine as values") |
|
48 |
file.write(csv_line.to_csv() + '\n') |
modules/crawler/Utilities/Database/DatabaseLoader.py | ||
---|---|---|
62 | 62 |
csv_column = line.split(";") |
63 | 63 |
|
64 | 64 |
name = csv_column[0] |
65 |
|
|
65 | 66 |
occurrence = csv_column[1] |
66 | 67 |
date = csv_column[2] |
67 | 68 |
|
69 |
|
|
70 |
|
|
68 | 71 |
database_data_line = DatabaseDataLine.DatabaseDataLine(name, devices[name]["x"] |
69 | 72 |
, devices[name]["y"], date, occurrence) |
70 | 73 |
|
Také k dispozici: Unified diff
Re #7939
- upravena struktura processorů v pipeline
- přidání kontroly validity dat