Revision af7609b5

Added by Tomáš Ballák more than 3 years ago

Re #8193 - refactoring crawler

View differences:

modules/crawler/prepare_new_dataset.py
 import os
-
 # Path to crawled data
 CRAWLED_DATA_PATH = "CrawledData/"
 # Path to processed data
......
 DEFAULT_COLOR = "#000000"
 
 
-def create_default_config_file(dataset_name: str):
+def create_default_config_file(dataset_name: str) -> None:
     """
     Creates default config file
 
......
         dataset_name: Name of newly created dataset
     """
     with open(CONFIG_FILES_PATH + "/" + dataset_name + ".yaml", "w") as file:
-        file.write("# jmeno datasetu, pod kterym bude zobrazen v aplikaci\n")
+        file.write("# Name of the dataset inside the application\n")
         file.write("display-name: " + dataset_name + "\n")
         file.write(
-            "# jednoslovný název datasetu, pod kterym bude reprezentovana v architektuře\n")
-        file.write("display-color: " + DEFAULT_COLOR + "\n")
+            "# Color for the dataset in a hex value (default value #000000)\n")
+        file.write(f'display-color: \'{DEFAULT_COLOR}\' \n')
         file.write(
-            "# barva pro tento dataset v hexadecimální hodnotě (#000000)\n")
+            "# One word dataset name (structure of all modules will be affected by this)\n"
+        )
         file.write("dataset-name: " + dataset_name + "\n")
-        file.write("# root slozka, ktera obsahuje odkazy na dataset\n")
-        file.write("url: ZDE VLOZTE URL\n")
+        file.write("# Url for the source of this dataset\n")
+        file.write("url: ENTER URL HERE\n")
+        file.write(
+            "# Optional parameter which specifies a pattern of the datasets name\n"
+        )
+        file.write(
+            "# Example: DATASET_NAME_[0-9][0-9]_[0-9][0-9][0-9][0-9].zip\n")
+        file.write(
+            "# - DATASET_NAME_01_2020.zip where '01_2020' specifies date in this dataset\n"
+        )
+        file.write("regex: ENTER REGEX HERE\n")
         file.write(
-            "# volitelny parameter, ktery specifikuje vzor jmrna datasetu, ktera se budou stahovat\n")
-        file.write("regex: ZDE VLOZTE REGEX\n")
-        file.write("# volitelny parametr, ktery udava jak casto se budou hledat nove datasety, pokud prazdne, "
-                   "tak defaultni hodnota (dny)\n")
-        file.write("update-period: ZDE VLOZTE HODNOTU\n")
-        file.write("# pozice jednotlivych zarizeni, ktera jsou v datasetu\n")
+            "# Optional parameter which specifies the way of searching new datasets (if empty the period is set to every day)\n"
+        )
+        file.write("update-period: ENTER UPDATE PERIOD HERE\n")
+        file.write("# Coordinates of every datasets device (entity)\n")
         file.write("devices:\n")
 
 
-def create_default_processor(dataset_name):
+def create_default_processor(dataset_name: str) -> None:
     """
     Creates default processor for dataset
 
     Args:
         dataset_name: Name of newly created dataset
     """
-    with open(PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "_processor.py", "w") as file:
-        file.write("from Utilities.CSV import csv_data_line")
+    with open(PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "_processor.py",
+              "w") as file:
+        file.write("from Utilities.CSV import csv_data_line\n")
+        file.write("from shared_types import DateDict")
         file.write("\n")
         file.write("\n")
-        file.write("def process_file(filename):\n")
+        file.write("def process_file(filename: str) -> DateDict:\n")
         file.write("    \"\"\"\n")
         file.write(
-            "    Method that take path to crawled file and outputs date dictionary:\n")
+            "    Method that takes the path to crawled file and outputs date dictionary:\n"
+        )
         file.write(
-            "    Date dictionary is a dictionary where keys are dates in format YYYY-mm-dd-hh (2018-04-08-15)\n")
+            "    Date dictionary is a dictionary where keys are dates in format YYYY-mm-dd-hh (2018-04-08-15)\n"
+        )
         file.write(
-            "    and value is dictionary where keys are devices (specified in configuration file)\n")
+            "    and value is dictionary where keys are devices (specified in configuration file)\n"
+        )
         file.write(
-            "    and value is CSVDataLine.csv_data_line with device,date and occurrence\n")
+            "    and value is CSVDataLine.csv_data_line with device,date and occurrence\n"
+        )
         file.write("\n")
         file.write("    Args:\n")
-        file.write("    filename: name of processed file\n")
+        file.write("    filename: name of the processed file\n")
         file.write("\n")
         file.write("    Returns:\n")
         file.write("    None if not implemented\n")
         file.write("    date_dict when implemented\n")
         file.write("    \"\"\"\n")
-        file.write("    date_dict = dict()\n")
+        file.write("    date_dict: DateDict = {}\n")
         file.write("\n")
         file.write("    #with open(filename, \"r\") as file:\n")
         file.write(
-            "    print(\"You must implements process_file method first!\")\n")
-        file.write("    return None\n")
+            "    print(\"You must implement the process_file method first!\")\n"
+        )
+        file.write("    return date_dict\n")
 
 
-def create_default_crawler(dataset_name):
+def create_default_crawler(dataset_name: str) -> None:
     """
     Creates default crawler for dataset
 
......
         dataset_name: Name of newly created dataset
     """
 
-    with open(CRAWLER_PROGRAM_PATH + "/" + dataset_name + "_crawler.py", "w") as file:
+    with open(CRAWLER_PROGRAM_PATH + "/" + dataset_name + "_crawler.py",
+              "w") as file:
+        file.write("from shared_types import ConfigType\n")
         file.write("# Path to crawled data\n")
-        file.write("CRAWLED_DATA_PATH = \"CrawledData/\"\n")
+        file.write(f'CRAWLED_DATA_PATH = "{CRAWLED_DATA_PATH}" \n')
         file.write("\n")
         file.write("\n")
-        file.write("def crawl(config):\n")
+        file.write("def crawl(config: ConfigType):\n")
         file.write("    \"\"\"\n")
         file.write(
-            "    Implement crawl method that downloads new data to path_for_files\n")
+            "    Implementation of the crawl method which downloads new data to the path_for_files\n"
+        )
         file.write("    For keeping the project structure\n")
         file.write("    url , regex, and dataset_name from config\n")
         file.write(
-            "    You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py\n")
+            "    You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py\n"
+        )
         file.write("\n")
         file.write("    Args:\n")
         file.write("        config: loaded configuration file of dataset\n")
......
         file.write("    regex = config['regex']\n")
         file.write(
             "    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'\n")
-        file.write("    print(\"You must implements Crawl method first!\")\n")
+        file.write(
+            "    print(\"Není implementován crawler pro získávání dat!\")\n")
 
 
-def prepare_dataset_structure(dataset_name):
+def prepare_dataset_structure(dataset_name: str) -> None:
     """
     Prepares folders for new dataset
     Args:
......
     """
 
     # create folder for crawled data
-    path = CRAWLED_DATA_PATH+dataset_name
+    path = CRAWLED_DATA_PATH + dataset_name
     try:
         os.mkdir(path)
     except os.error as e:
......
     try:
         os.mkdir(path)
     except OSError:
-        print("Creation of the directory %s failed" % path)
+        print("Nelze vytvořit adresář %s" % path)
 
     create_default_crawler(dataset_name)
     create_default_processor(dataset_name)
     create_default_config_file(dataset_name)
 
 
-print("Zadejte jméno nového datasetu:\n")
+def main() -> None:
+    print("Zadejte jméno nového datasetu:\n")
+    dataset_name = input().upper()
+
+    if dataset_name.isalpha():
+        prepare_dataset_structure(dataset_name)
+        print("Architektura vytvořena \n")
+    else:
+        print("Jméno musí obsahovat pouze písmena z abecedy (bez mezer)\n")
 
-dataset_name = input().upper()
 
-if dataset_name.isalpha():
-    prepare_dataset_structure(dataset_name)
-    print("Architektura vytvořena \n")
-else:
-    print("Jméno musí obsahovat pouze písmena z abecedy (bez mezer)\n")
+if __name__ == "__main__":
+    main()
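
For reference, joining the file.write strings above reproduces the stub that the refactored create_default_processor now writes for a new dataset. It reads roughly as follows; Utilities.CSV and shared_types are internal modules of this project, so the stub only imports inside the project tree:

from Utilities.CSV import csv_data_line
from shared_types import DateDict


def process_file(filename: str) -> DateDict:
    """
    Method that takes the path to crawled file and outputs date dictionary:
    Date dictionary is a dictionary where keys are dates in format YYYY-mm-dd-hh (2018-04-08-15)
    and value is dictionary where keys are devices (specified in configuration file)
    and value is CSVDataLine.csv_data_line with device,date and occurrence

    Args:
    filename: name of the processed file

    Returns:
    None if not implemented
    date_dict when implemented
    """
    date_dict: DateDict = {}

    #with open(filename, "r") as file:
    print("You must implement the process_file method first!")
    return date_dict

The stub deliberately returns an empty date_dict until process_file is implemented for the concrete dataset.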
