Revize 04a2b5a4
Přidáno uživatelem Petr Hlaváč před asi 4 roky
python-module/Scripts/PrepareNewDataset.py | ||
---|---|---|
1 | 1 |
import os

# Path to crawled data (raw files downloaded by the crawlers)
CRAWLED_DATA_PATH = "../CrawledData/"
# Path to processed data (output of the dataset processors)
PROCESSED_DATA_PATH = "../ProcessedData/"
# Path to crawler logs
CRAWLER_LOGS_PATH = "../CrawlerLogs/"
# Path for DatasetCrawlers implementations (one <name>Crawler.py per dataset)
CRAWLER_PROGRAM_PATH = "../DatasetCrawler"
# Path for DatasetProcessors implementations (one <name>Processor.py per dataset)
PROCESSOR_PROGRAM_PATH = "../DatasetProcessing"
# Path to dataset configuration files (<name>.yaml per dataset)
CONFIG_FILES_PATH = "../DatasetConfigs"
9 | 15 |
|
10 | 16 |
|
11 | 17 |
def create_default_config_file(dataset_name): |
18 |
""" |
|
19 |
Creates default config file |
|
12 | 20 |
|
21 |
Args: |
|
22 |
dataset_name: Name of newly created dataset |
|
23 |
""" |
|
13 | 24 |
with open(CONFIG_FILES_PATH + "/" + dataset_name + ".yaml", "w") as file: |
14 | 25 |
file.write("# jmeno datasetu, pod kterym bude zobrazen v aplikaci\n") |
15 | 26 |
file.write("dataset-name: " + dataset_name + "\n") |
16 | 27 |
file.write("# root slozka, ktera obsahuje odkazy na dataset\n") |
17 |
file.write("url: ZDE VLOZTE URL/\n")
|
|
28 |
file.write("url: ZDE VLOZTE URL\n") |
|
18 | 29 |
file.write("# volitelny parameter, ktery specifikuje vzor jmrna datasetu, ktera se budou stahovat\n") |
19 | 30 |
file.write("regex: ZDE VLOZTE REGEX\n") |
20 | 31 |
file.write("# volitelny parametr, ktery udava jak casto se budou hledat nove datasety, pokud prazdne, " |
... | ... | |
25 | 36 |
|
26 | 37 |
|
27 | 38 |
def create_default_processor(dataset_name):
    """
    Creates default processor for dataset.

    Generates <dataset_name>Processor.py inside PROCESSOR_PROGRAM_PATH
    containing a documented `process_file` stub that the developer must
    implement before the dataset can be processed.

    Args:
        dataset_name: Name of newly created dataset
    """
    # NOTE(review): indentation inside the generated string literals was
    # reconstructed from a rendered diff -- verify against the original file.
    with open(PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "Processor.py", "w") as file:
        # Fix: the import line was written without a trailing "\n", which
        # collapsed the intended two blank lines before "def" into one.
        file.write("from Utilities.CSV import CSVDataLine, CSVutils\n")
        file.write("\n")
        file.write("\n")
        file.write("def process_file(filename):\n")
        file.write("    \"\"\"\n")
        file.write("    Method that take path to crawled file and outputs date dictionary using method:\n")
        file.write("    CSVutils.export_data_to_csv(filename, date_dict)\n")
        file.write("    Date dictionary is a dictionary where keys are dates in format ddmmYYYYhh (0804201815)\n")
        file.write("    and value is dictionary where keys devices (specified in configuration file)\n")
        file.write("    and value is CSVDataLine.CSVDataLine with device,date and occurrence\n")
        file.write("\n")
        file.write("    Args:\n")
        file.write("        filename: name of processed file\n")
        file.write("\n")
        file.write("    Returns:\n")
        file.write("        False if not implemented\n")
        file.write("        True when implemented\n")
        file.write("    \"\"\"\n")
        # Grammar fix in the generated placeholder message
        # ("You must implements" -> "You must implement the").
        file.write("    print(\"You must implement the process_file method first!\")\n")
        file.write("    #CSVutils.export_data_to_csv(filename, date_dict)\n")
        file.write("    return False\n")
|
31 | 67 |
|
32 | 68 |
|
33 | 69 |
def create_default_crawler(dataset_name):
    """
    Creates default crawler for dataset.

    Generates <dataset_name>Crawler.py inside CRAWLER_PROGRAM_PATH containing
    a documented `crawl` stub that the developer must implement before new
    data for the dataset can be downloaded.

    Args:
        dataset_name: Name of newly created dataset
    """
    # NOTE(review): indentation inside the generated string literals was
    # reconstructed from a rendered diff -- verify against the original file.
    with open(CRAWLER_PROGRAM_PATH + "/" + dataset_name + "Crawler.py", "w") as file:
        file.write("# Path to crawled data\n")
        file.write("CRAWLED_DATA_PATH = \"CrawledData/\"\n")
        file.write("\n")
        file.write("\n")
        file.write("def crawl(config):\n")
        file.write("    \"\"\"\n")
        file.write("    Implement crawl method that downloads new data to path_for_files\n")
        file.write("    For keeping the project structure\n")
        file.write("    url , regex, and dataset_name from config\n")
        file.write("    You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py\n")
        file.write("\n")
        file.write("    Args:\n")
        file.write("        config: loaded configuration file of dataset\n")
        file.write("    \"\"\"\n")
        file.write("    dataset_name = config[\"dataset-name\"]\n")
        file.write("    url = config['url']\n")
        file.write("    regex = config['regex']\n")
        file.write("    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'\n")
        # Grammar fix in the generated placeholder message
        # ("You must implements Crawl" -> "You must implement the crawl").
        file.write("    print(\"You must implement the crawl method first!\")\n")
38 | 97 |
|
39 | 98 |
|
40 |
def create_ignore_file(path, text):
    """
    Creates ignore file.

    Writes ignore.txt into the given directory; when `text` is provided it
    becomes the first line of the file, otherwise an empty file is created.

    Args:
        path: path to directory for creating ignore.txt
        text: text that will be on first line of ignore.txt, can be None
    """
    target = path + "/ignore.txt"
    with open(target, "w") as ignore_file:
        if text is None:
            return
        ignore_file.write(text + "\n")
45 | 109 |
|
46 | 110 |
|
47 | 111 |
def prepare_dataset_structure(dataset_name): |
112 |
""" |
|
113 |
Prepares folders for new dataset |
|
114 |
Args: |
|
115 |
dataset_name: Name of newly created dataset |
|
116 |
""" |
|
48 | 117 |
jump_folder = "../" |
49 | 118 |
|
50 | 119 |
# create folder for crawled data |
51 | 120 |
try: |
52 | 121 |
path = CRAWLED_DATA_PATH+dataset_name |
53 | 122 |
os.mkdir(path) |
54 |
create_ignore_file(path,"ignore.txt") |
|
123 |
create_ignore_file(path, "ignore.txt")
|
|
55 | 124 |
except os.error as e: |
56 | 125 |
print(e) |
57 | 126 |
print("Creation of the directory %s failed" % path) |
... | ... | |
77 | 146 |
create_default_config_file(dataset_name) |
78 | 147 |
|
79 | 148 |
|
# Script entry point: scaffold directories, config file, crawler and
# processor stubs for a dataset named "TEST".
prepare_dataset_structure("TEST")
Také k dispozici: Unified diff
Re #7939
- pridana dokumentace metod a trid
- korekce chyb ve jmenech promennych
- pridani informaci pro vygenerovane skripty