Projekt

Obecné

Profil

« Předchozí | Další » 

Revize 2d129043

Přidáno uživatelem Petr Hlaváč asi před 4 roky

Re #7939
- upravena struktura processorů v pipeline
- přidání kontroly validity dat

Zobrazit rozdíly:

modules/crawler/.gitignore
1 1
*__pycache__*
2 2
*.CSV
3
python-module/venv
3
CrawlerLogs/CommonRecords/*
modules/crawler/DatasetProcessing/JISProcessor.py
17 17
        False if not implemented
18 18
        True when implemented
19 19
    """
20
    with open(filename, "r", encoding="utf-8") as file:
20
    date_dict = dict()
21 21

  
22
        date_dict = dict()
22
    with open(filename, "r", encoding="utf-8") as file:
23 23

  
24 24
        for line in file:
25 25

  
......
35 35
            if name in date_dict[date]:
36 36
                date_dict[date][name].occurrence += int(occurrence)
37 37
            else:
38
                date_dict[date][name] = CSVDataLine.CSVDataLine(name, date, int(occurrence))
38
                date_dict[date][name] = CSVDataLine.CSVDataLine(name, date, occurrence)
39 39

  
40
    CSVutils.export_data_to_csv(filename, date_dict)
41
    return True
40
    return date_dict
42 41

  
modules/crawler/DatasetProcessing/KOLOBEZKYProcessor.py
17 17
        False if not implemented
18 18
        True when implemented
19 19
    """
20
    with open(filename, "r") as file:
20
    date_dict = dict()
21 21

  
22
        date_dict = dict()
22
    with open(filename, "r") as file:
23 23

  
24 24
        for line in file:
25 25

  
......
36 36
            else:
37 37
                date_dict[date][name] = CSVDataLine.CSVDataLine(name, date, 1)
38 38

  
39
    CSVutils.export_data_to_csv(filename, date_dict)
40
    return True
39
    return date_dict
41 40

  
modules/crawler/DatasetProcessing/WIFIProcessor.py
17 17
        False if not implemented
18 18
        True when implemented
19 19
    """
20
    date_dict = dict()
21
    
20 22
    with open(filename, "r", encoding="utf-8") as file:
21 23

  
22
        date_dict = dict()
23

  
24 24
        for line in file:
25 25

  
26 26
            array = line.split(";")
......
37 37
            else:
38 38
                date_dict[date][name] = CSVDataLine.CSVDataLine(name, date, int(occurrence))
39 39

  
40
    CSVutils.export_data_to_csv(filename, date_dict)
41
    return True
40
    return date_dict
42 41

  
modules/crawler/Pipeline.py
1 1
from Utilities import FolderProcessor, ConfigureFunctions
2 2
from Utilities.Database import DatabaseLoader
3
from Utilities.CSV import CSVutils
3 4

  
4 5
import logging
5 6
from datetime import date
......
96 97
    logging.info(dataset_name + " has downloaded " + str(len(not_processed_files)) + " new files")
97 98

  
98 99
    for not_processed_file in not_processed_files:
99
        process_file_func(CRAWLED_DATA_PATH + dataset_path + not_processed_file)
100
        path = CRAWLED_DATA_PATH + dataset_path + not_processed_file
101
        date_dic = process_file_func(path)
102
        CSVutils.export_data_to_csv(path, date_dic)
100 103
        FolderProcessor.update_ignore_set(CRAWLED_DATA_PATH + dataset_path, not_processed_file)
101 104

  
102 105
    logging.info(dataset_name + " has processed " + str(len(not_processed_files)) + " newly crawled files")
modules/crawler/PrepareNewDataset.py
48 48
        file.write("\n")
49 49
        file.write("def process_file(filename):\n")
50 50
        file.write("    \"\"\"\n")
51
        file.write("    Method that take path to crawled file and outputs date dictionary using method:\n")
52
        file.write("    CSVutils.export_data_to_csv(filename, date_dict)\n")
51
        file.write("    Method that take path to crawled file and outputs date dictionary:\n")
53 52
        file.write("    Date dictionary is a dictionary where keys are dates in format ddmmYYYYhh (0804201815)\n")
54 53
        file.write("    and value is dictionary where keys devices (specified in configuration file)\n")
55 54
        file.write("    and value is CSVDataLine.CSVDataLine with device,date and occurrence\n")
......
58 57
        file.write("    filename: name of processed file\n")
59 58
        file.write("\n")
60 59
        file.write("    Returns:\n")
61
        file.write("    False if not implemented\n")
62
        file.write("    True when implemented\n")
60
        file.write("    None if not implemented\n")
61
        file.write("    date_dict when implemented\n")
63 62
        file.write("    \"\"\"\n")
63
        file.write("    date_dict = dict()\n")
64
        file.write("\n")
64 65
        file.write("    #with open(filename, \"r\") as file:\n")
65 66
        file.write("    print(\"You must implements process_file method first!\")\n")
66
        file.write("    #CSVutils.export_data_to_csv(filename, date_dict)\n")
67
        file.write("    return False\n")
67
        file.write("    return None\n")
68 68

  
69 69

  
70 70
def create_default_crawler(dataset_name):
modules/crawler/Utilities/CSV/CSVDataLine.py
3 3
    Class that specifies the look of data line in processed csv file
4 4
    prepared for database
5 5
    """
6

  
6 7
    def __init__(self, name, date, occurrence):
8
        try:
9
            test_val = int(occurrence)
10
        except ValueError:
11
            print("Occurence should be and integer value!")
12

  
13
        if len(date) != 10:
14
            raise ValueError("Invalid date format ddmmYYYYhh expected!")    
15

  
7 16
        self.name = name
8 17
        self.date = date
9
        self.occurrence = occurrence
18
        self.occurrence = test_val
10 19

  
11 20
    def to_csv(self):
12 21
        return self.name + ";" + str(self.occurrence) + ";" + self.date
modules/crawler/Utilities/CSV/CSVutils.py
1
import inspect
2
from Utilities.CSV import CSVDataLine
3

  
1 4
# Path to processed data
2 5
PROCESSED_DATA_PATH = "ProcessedData/"
3 6

  
......
36 39
    with open(PROCESSED_DATA_PATH + filename[12:], "w+") as file:
37 40

  
38 41
        for date in data_dict:
42
            if len(date) != 10:
43
                raise ValueError("Invalid date format for key value --> ddmmYYYYhh expected!")   
39 44
            for data in data_dict[date]:
40
                file.write(data_dict[date][data].to_csv() + '\n')
45
                csv_line = data_dict[date][data]
46
                if not isinstance(csv_line,CSVDataLine.CSVDataLine):
47
                    raise ValueError("data_dict is expected to have CSVDataLine as values")
48
                file.write(csv_line.to_csv() + '\n')
modules/crawler/Utilities/Database/DatabaseLoader.py
62 62
        csv_column = line.split(";")
63 63

  
64 64
        name = csv_column[0]
65

  
65 66
        occurrence = csv_column[1]
66 67
        date = csv_column[2]
67 68

  
69

  
70

  
68 71
        database_data_line = DatabaseDataLine.DatabaseDataLine(name, devices[name]["x"]
69 72
                                                               , devices[name]["y"], date, occurrence)
70 73

  

Také k dispozici: Unified diff