Revision af7609b5
Added by Tomáš Ballák over 3 years ago
modules/crawler/prepare_new_dataset.py

@@ -1,5 +1,4 @@
 import os
-
 # Path to crawled data
 CRAWLED_DATA_PATH = "CrawledData/"
 # Path to processed data
@@ -14,7 +13,7 @@
 DEFAULT_COLOR = "#000000"
 
 
-def create_default_config_file(dataset_name: str):
+def create_default_config_file(dataset_name: str) -> None:
     """
     Creates default config file
 
@@ -22,64 +21,79 @@
         dataset_name: Name of newly created dataset
     """
     with open(CONFIG_FILES_PATH + "/" + dataset_name + ".yaml", "w") as file:
-        file.write("# jmeno datasetu, pod kterym bude zobrazen v aplikaci\n")
+        file.write("# Name of the dataset inside the application\n")
         file.write("display-name: " + dataset_name + "\n")
         file.write(
-            "# jednoslovný název datasetu, pod kterym bude reprezentovana v architektuře\n")
-        file.write("display-color: " + DEFAULT_COLOR + "\n")
+            "# Color for the dataset in a hex value (default value #000000)\n")
+        file.write(f'display-color: \'{DEFAULT_COLOR}\' \n')
         file.write(
-            "# barva pro tento dataset v hexadecimální hodnotě (#000000)\n")
+            "# One word dataset name (structure of all modules will be affected by this)\n"
+        )
         file.write("dataset-name: " + dataset_name + "\n")
-        file.write("# root slozka, ktera obsahuje odkazy na dataset\n")
-        file.write("url: ZDE VLOZTE URL\n")
+        file.write("# Url for the source of this dataset\n")
+        file.write("url: ENTER URL HERE\n")
+        file.write(
+            "# Optional parameter which specifies a pattern of the datasets name\n"
+        )
+        file.write(
+            "# Example: DATASET_NAME_[0-9][0-9]_[0-9][0-9][0-9][0-9].zip\n")
+        file.write(
+            "# - DATASET_NAME_01_2020.zip where '01_2020' specifies date in this dataset\n"
+        )
+        file.write("regex: ENTER REGEX HERE\n")
         file.write(
-            "# volitelny parameter, ktery specifikuje vzor jmrna datasetu, ktera se budou stahovat\n")
-        file.write("regex: ZDE VLOZTE REGEX\n")
-        file.write("# volitelny parametr, ktery udava jak casto se budou hledat nove datasety, pokud prazdne, "
-                   "tak defaultni hodnota (dny)\n")
-        file.write("update-period: ZDE VLOZTE HODNOTU\n")
-        file.write("# pozice jednotlivych zarizeni, ktera jsou v datasetu\n")
+            "# Optional parameter which specifies the way of searching new datasets (if empty the period is set to every day)\n"
+        )
+        file.write("update-period: ENTER UPDATE PERIOD HERE\n")
+        file.write("# Coordinates of every datasets device (entinty)\n")
         file.write("devices:\n")
 
 
-def create_default_processor(dataset_name):
+def create_default_processor(dataset_name: str) -> None:
     """
     Creates default processor for dataset
 
     Args:
         dataset_name: Name of newly created dataset
     """
-    with open(PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "_processor.py", "w") as file:
-        file.write("from Utilities.CSV import csv_data_line")
+    with open(PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "_processor.py",
+              "w") as file:
+        file.write("from Utilities.CSV import csv_data_line\n")
+        file.write("from shared_types import DateDict")
         file.write("\n")
         file.write("\n")
-        file.write("def process_file(filename):\n")
+        file.write("def process_file(filename: str) -> DateDict:\n")
        file.write(" \"\"\"\n")
         file.write(
-            " Method that take path to crawled file and outputs date dictionary:\n")
+            " Method that takes the path to crawled file and outputs date dictionary:\n"
+        )
         file.write(
-            " Date dictionary is a dictionary where keys are dates in format YYYY-mm-dd-hh (2018-04-08-15)\n")
+            " Date dictionary is a dictionary where keys are dates in format YYYY-mm-dd-hh (2018-04-08-15)\n"
+        )
         file.write(
-            " and value is dictionary where keys are devices (specified in configuration file)\n")
+            " and value is dictionary where keys are devices (specified in configuration file)\n"
+        )
         file.write(
-            " and value is CSVDataLine.csv_data_line with device,date and occurrence\n")
+            " and value is CSVDataLine.csv_data_line with device,date and occurrence\n"
+        )
         file.write("\n")
         file.write(" Args:\n")
-        file.write(" filename: name of processed file\n")
+        file.write(" filename: name of the processed file\n")
         file.write("\n")
         file.write(" Returns:\n")
         file.write(" None if not implemented\n")
         file.write(" date_dict when implemented\n")
         file.write(" \"\"\"\n")
-        file.write(" date_dict = dict()\n")
+        file.write(" date_dict: DateDict = {}\n")
         file.write("\n")
         file.write(" #with open(filename, \"r\") as file:\n")
         file.write(
-            " print(\"You must implements process_file method first!\")\n")
-        file.write(" return None\n")
+            " print(\"You must implement the process_file method first!\")\n"
+        )
+        file.write(" return date_dict\n")
 
 
-def create_default_crawler(dataset_name):
+def create_default_crawler(dataset_name: str) -> None:
     """
     Creates default crawler for dataset
 
@@ -87,19 +101,23 @@
         dataset_name: Name of newly created dataset
     """
 
-    with open(CRAWLER_PROGRAM_PATH + "/" + dataset_name + "_crawler.py", "w") as file:
+    with open(CRAWLER_PROGRAM_PATH + "/" + dataset_name + "_crawler.py",
+              "w") as file:
+        file.write("from shared_types import ConfigType\n")
         file.write("# Path to crawled data\n")
-        file.write("CRAWLED_DATA_PATH = \"CrawledData/\"\n")
+        file.write(f'CRAWLED_DATA_PATH = "{CRAWLED_DATA_PATH}" \n')
         file.write("\n")
         file.write("\n")
-        file.write("def crawl(config):\n")
+        file.write("def crawl(config: ConfigType):\n")
         file.write(" \"\"\"\n")
         file.write(
-            " Implement crawl method that downloads new data to path_for_files\n")
+            " Implementation the crawl method which downloads new data to the path_for_files\n"
+        )
         file.write(" For keeping the project structure\n")
         file.write(" url , regex, and dataset_name from config\n")
         file.write(
-            " You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py\n")
+            " You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py\n"
+        )
         file.write("\n")
         file.write(" Args:\n")
         file.write(" config: loaded configuration file of dataset\n")
@@ -109,10 +127,11 @@
         file.write(" regex = config['regex']\n")
         file.write(
             " path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'\n")
-        file.write(" print(\"You must implements Crawl method first!\")\n")
+        file.write(
+            " print(\"Není implementován crawler pro získávání dat!\")\n")
 
 
-def prepare_dataset_structure(dataset_name):
+def prepare_dataset_structure(dataset_name: str) -> None:
     """
     Prepares folders for new dataset
     Args:
@@ -120,7 +139,7 @@
     """
 
     # create folder for crawled data
-    path = CRAWLED_DATA_PATH+dataset_name
+    path = CRAWLED_DATA_PATH + dataset_name
     try:
         os.mkdir(path)
     except os.error as e:
@@ -132,19 +151,23 @@
     try:
         os.mkdir(path)
     except OSError:
-        print("Creation of the directory %s failed" % path)
+        print("Nelze vytvořit adresář %s" % path)
 
     create_default_crawler(dataset_name)
     create_default_processor(dataset_name)
     create_default_config_file(dataset_name)
 
 
-print("Zadejte jméno nového datasetu:\n")
+def main() -> None:
+    print("Zadejte jméno nového datasetu:\n")
+    dataset_name = input().upper()
+
+    if dataset_name.isalpha():
+        prepare_dataset_structure(dataset_name)
+        print("Architektura vytvořena \n")
+    else:
+        print("Jméno musí obsahovat pouze písmena z abecedy (bez mezer)\n")
 
-dataset_name = input().upper()
 
-if dataset_name.isalpha():
-    prepare_dataset_structure(dataset_name)
-    print("Architektura vytvořena \n")
-else:
-    print("Jméno musí obsahovat pouze písmena z abecedy (bez mezer)\n")
+if __name__ == "__main__":
+    main()
Re #8193 - refactoring crawler
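
Note: the refactored generators import DateDict and ConfigType from a shared_types module that is not part of this diff. A minimal sketch of what that module might contain, assuming both are plain type aliases (the exact shapes are assumptions, not shown in this revision):

from typing import Any, Dict

# Assumed alias: keys are dates in the format YYYY-mm-dd-hh, values map device
# names to their csv_data_line records (typed loosely as Any here).
DateDict = Dict[str, Dict[str, Any]]

# Assumed alias: a loaded dataset configuration with keys such as
# 'url', 'regex' and 'dataset-name'.
ConfigType = Dict[str, Any]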