Revision 728f8c5d
Added by Petr Hlaváč almost 5 years ago

hot fix

modules/crawler/CrawledData/JIS/ignore.txt

    ignore.txt
    OD_ZCU_JIS_10_2019.CSV
    OD_ZCU_JIS_03_2020.CSV
    OD_ZCU_JIS_02_2020.CSV
    OD_ZCU_JIS_00_2019.CSV
    OD_ZCU_JIS_08_2019.CSV
    OD_ZCU_JIS_12_2019.CSV
    OD_ZCU_JIS_09_2019.CSV
    OD_ZCU_JIS_01_2020.CSV
    OD_ZCU_JIS_06_2019.CSV
    OD_ZCU_JIS_11_2019.CSV
    OD_ZCU_JIS_07_2019.CSV

modules/crawler/CrawledData/KOLOBEZKY/ignore.txt

    ignore.txt
    OD_ZCU_KOLOBEZKY_00_2019.CSV
    OD_ZCU_KOLOBEZKY_06_2019.CSV
    OD_ZCU_KOLOBEZKY_07_2019.CSV
    OD_ZCU_KOLOBEZKY_08_2019.CSV

modules/crawler/CrawledData/WIFI/ignore.txt

    ignore.txt
    OD_ZCU_WIFI_07_2019.CSV
    OD_ZCU_WIFI_00_2019.CSV
    OD_ZCU_WIFI_06_2019.CSV
    OD_ZCU_WIFI_08_2019.CSV

modules/crawler/CrawlerLogs/JIS/ignore.txt

    https://openstore.zcu.cz/OD_ZCU_00_2019/OD_ZCU_JIS_00_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_01_2020/OD_ZCU_JIS_01_2020_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_02_2020/OD_ZCU_JIS_02_2020_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_03_2020/OD_ZCU_JIS_03_2020_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_06_2019/OD_ZCU_JIS_06_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_07_2019/OD_ZCU_JIS_07_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_08_2019/OD_ZCU_JIS_08_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_09_2019/OD_ZCU_JIS_09_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_10_2019/OD_ZCU_JIS_10_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_11_2019/OD_ZCU_JIS_11_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_12_2019/OD_ZCU_JIS_12_2019_CSV.zip

modules/crawler/CrawlerLogs/KOLOBEZKY/ignore.txt

    https://openstore.zcu.cz/OD_ZCU_00_2019/OD_ZCU_KOLOBEZKY_00_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_06_2019/OD_ZCU_KOLOBEZKY_06_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_07_2019/OD_ZCU_KOLOBEZKY_07_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_08_2019/OD_ZCU_KOLOBEZKY_08_2019_CSV.zip

modules/crawler/CrawlerLogs/WIFI/ignore.txt

    https://openstore.zcu.cz/OD_ZCU_00_2019/OD_ZCU_WIFI_00_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_06_2019/OD_ZCU_WIFI_06_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_07_2019/OD_ZCU_WIFI_07_2019_CSV.zip
    https://openstore.zcu.cz/OD_ZCU_08_2019/OD_ZCU_WIFI_08_2019_CSV.zip

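The CrawlerLogs ignore.txt files above record archive URLs that have already been downloaded. A minimal sketch of how a crawler could use such a file to skip previously fetched links; the helper name and its usage are assumptions, since the project's own helpers in Utilities/Crawler/BasicCrawlerFunctions.py are not shown in this revision:

    def filter_downloaded(urls, ignore_file_path):
        """Return only the URLs not yet listed in ignore.txt (hypothetical helper)."""
        with open(ignore_file_path, "r") as file:
            already_downloaded = {line.strip() for line in file if line.strip()}
        return [url for url in urls if url not in already_downloaded]
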
modules/crawler/DatasetConfigs/JIS.yaml

Unchanged context (lines 8-10 on both sides):

    update-period: 24
    # pozice jednotlivych zarizeni, ktera jsou v datasetu
    devices:

Old lines 11-194 are removed and new lines 11-194 are added. Both blocks contain the same 46 device entries, each of the form

    - <device name>:
        x: UNKNOWN!
        y: UNKNOWN!

Removed block, in its original order (note the two mis-encoded names "US 005 - z?vora vjezd" and "US 005 - m?? vjezd"): US 005 - z?vora vjezd, STUD_KL20, US 005 - m?? vjezd, Menza4-kasa3, NTIS-BUFET, Parkoviste-vjezd, Menza4-kasa1, Menza1-kasa-p, Menza4-kasa2, L2, EP-BUFET, KolaBory-vnitrni, Menza1-kasa-l, Zavora-FEL, Zavora-NTIS-vjezd, L-Posilovna, STUD_ST407, M16, Zavora-Kaplirova, Zavora-FDU, KolaBory-vnejsi, STUD_VC53, Menza4-kasa4, KL-Posilovna, VC-VYJEZD, EXT/kola-B, Menza4-kasa5, L1L2-vchod, VC-VJEZD, A3, STUD_UB113, STUD_CHEB, STUD_PRA1, L1, STUD_UB211, A1, EXT/kola, STUD_KL87, UV1-Bufet, M14, Zavora-NTIS-vyjezd, B3-kolarna, B3-LEVY, MenzaKL-vydej, A2-Hlavni vchod, Parkoviste-vyjezd.

Added block, in its new order, with the encoding of the two names repaired ("US 005 - závora vjezd", "US 005 - mříž vjezd"): Menza4-kasa5, Zavora-FDU, STUD_PRA1, B3-LEVY, KolaBory-vnejsi, VC-VYJEZD, L-Posilovna, A3, Menza4-kasa1, US 005 - závora vjezd, EP-BUFET, Zavora-FEL, US 005 - mříž vjezd, STUD_VC53, NTIS-BUFET, Zavora-NTIS-vjezd, EXT/kola, VC-VJEZD, Zavora-Kaplirova, M16, Menza1-kasa-p, STUD_CHEB, M14, L2, STUD_UB113, B3-kolarna, L1, Menza1-kasa-l, Zavora-NTIS-vyjezd, Menza4-kasa4, MenzaKL-vydej, Parkoviste-vjezd, KL-Posilovna, A1, KolaBory-vnitrni, EXT/kola-B, A2-Hlavni vchod, STUD_ST407, STUD_KL87, Menza4-kasa2, L1L2-vchod, Menza4-kasa3, Parkoviste-vyjezd, STUD_KL20, UV1-Bufet, STUD_UB211.
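
All coordinates in the regenerated config remain UNKNOWN! placeholders. As a sketch only, a filled-in entry would keep the same key layout; the numeric values below are hypothetical and not taken from this revision:

    devices:
        - Zavora-NTIS-vjezd:
            x: 49.72    # hypothetical coordinate, not in the revision
            y: 13.35    # hypothetical coordinate, not in the revision
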
modules/crawler/PrepareNewDataset.py

    import os

    # Path to crawled data
    CRAWLED_DATA_PATH = "../CrawledData/"
    # Path to processed data
    PROCESSED_DATA_PATH = "../ProcessedData/"
    # Path to crawler logs
    CRAWLER_LOGS_PATH = "../CrawlerLogs/"
    # Path for DatasetCrawlers implementations
    CRAWLER_PROGRAM_PATH = "../DatasetCrawler"
    # Path for DatasetProcessors implementations
    PROCESSOR_PROGRAM_PATH = "../DatasetProcessing"
    # Path to dataset configuration files
    CONFIG_FILES_PATH = "../DatasetConfigs"


    def create_default_config_file(dataset_name):
        """
        Creates default config file

        Args:
            dataset_name: Name of newly created dataset
        """
        with open(CONFIG_FILES_PATH + "/" + dataset_name + ".yaml", "w") as file:
            file.write("# jmeno datasetu, pod kterym bude zobrazen v aplikaci\n")
            file.write("dataset-name: " + dataset_name + "\n")
            file.write("# root slozka, ktera obsahuje odkazy na dataset\n")
            file.write("url: ZDE VLOZTE URL\n")
            file.write("# volitelny parameter, ktery specifikuje vzor jmrna datasetu, ktera se budou stahovat\n")
            file.write("regex: ZDE VLOZTE REGEX\n")
            file.write("# volitelny parametr, ktery udava jak casto se budou hledat nove datasety, pokud prazdne, "
                       "tak defaultni hodnota (dny)\n")
            file.write("update-period: ZDE VLOZTE HODNOTU\n")
            file.write("# pozice jednotlivych zarizeni, ktera jsou v datasetu\n")
            file.write("devices:\n")


    def create_default_processor(dataset_name):
        """
        Creates default processor for dataset

        Args:
            dataset_name: Name of newly created dataset
        """
        with open(PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "Processor.py", "w") as file:
            file.write("from Utilities.CSV import CSVDataLine, CSVutils")
            file.write("\n")
            file.write("\n")
            file.write("def process_file(filename):\n")
            file.write("    \"\"\"\n")
            file.write("    Method that take path to crawled file and outputs date dictionary using method:\n")
            file.write("    CSVutils.export_data_to_csv(filename, date_dict)\n")
            file.write("    Date dictionary is a dictionary where keys are dates in format ddmmYYYYhh (0804201815)\n")
            file.write("    and value is dictionary where keys devices (specified in configuration file)\n")
            file.write("    and value is CSVDataLine.CSVDataLine with device,date and occurrence\n")
            file.write("\n")
            file.write("    Args:\n")
            file.write("    filename: name of processed file\n")
            file.write("\n")
            file.write("    Returns:\n")
            file.write("    False if not implemented\n")
            file.write("    True when implemented\n")
            file.write("    \"\"\"\n")
            file.write("    print(\"You must implements process_file method first!\")\n")
            file.write("    #CSVutils.export_data_to_csv(filename, date_dict)\n")
            file.write("    return False\n")


    def create_default_crawler(dataset_name):
        """
        Creates default crawler for dataset

        Args:
            dataset_name: Name of newly created dataset
        """

        with open(CRAWLER_PROGRAM_PATH + "/" + dataset_name + "Crawler.py", "w") as file:
            file.write("# Path to crawled data\n")
            file.write("CRAWLED_DATA_PATH = \"CrawledData/\"\n")
            file.write("\n")
            file.write("\n")
            file.write("def crawl(config):\n")
            file.write("    \"\"\"\n")
            file.write("    Implement crawl method that downloads new data to path_for_files\n")
            file.write("    For keeping the project structure\n")
            file.write("    url , regex, and dataset_name from config\n")
            file.write("    You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py\n")
            file.write("\n")
            file.write("    Args:\n")
            file.write("    config: loaded configuration file of dataset\n")
            file.write("    \"\"\"\n")
            file.write("    dataset_name = config[\"dataset-name\"]\n")
            file.write("    url = config['url']\n")
            file.write("    regex = config['regex']\n")
            file.write("    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'\n")
            file.write("    print(\"You must implements Crawl method first!\")\n")


    def create_ignore_file(path, text):
        """
        Creates ignore file
        Args:
            path: path to directory for creating ignore.txt
            text: text that will be on first line of ignore.txt can be None
        """
        with open(path + "/ignore.txt", "w") as file:
            if text is not None:
                file.write(text + "\n")


    def prepare_dataset_structure(dataset_name):
        """
        Prepares folders for new dataset
        Args:
            dataset_name: Name of newly created dataset
        """
        jump_folder = "../"

        # create folder for crawled data
        try:
            path = CRAWLED_DATA_PATH + dataset_name
            os.mkdir(path)
            create_ignore_file(path, "ignore.txt")
        except os.error as e:
            print(e)
            print("Creation of the directory %s failed" % path)

        # create folder for processed data
        try:
            path = PROCESSED_DATA_PATH + dataset_name
            os.mkdir(path)
            create_ignore_file(path, "ignore.txt")
        except OSError:
            print("Creation of the directory %s failed" % path)

        # create folder for crawler logs
        try:
            path = CRAWLER_LOGS_PATH + dataset_name
            os.mkdir(path)
            create_ignore_file(path, None)
        except OSError:
            print("Creation of the directory %s failed" % path)

        create_default_crawler(dataset_name)
        create_default_processor(dataset_name)
        create_default_config_file(dataset_name)


    prepare_dataset_structure("TEST")

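The generated Processor stub above only documents a contract. A minimal sketch of a filled-in process_file under stated assumptions: the crawled CSV row layout (device;ddmmYYYYhh;...) and the CSVDataLine.CSVDataLine(device, date, occurrence) constructor with an occurrence attribute are both assumptions; only the ddmmYYYYhh date keys and the export call come from the stub's docstring.

    from Utilities.CSV import CSVDataLine, CSVutils


    def process_file(filename):
        date_dict = {}
        with open(filename, "r") as file:
            for line in file:
                # assumed row layout: device;ddmmYYYYhh;...
                device, date = line.strip().split(";")[:2]
                devices = date_dict.setdefault(date, {})
                if device not in devices:
                    # assumed constructor signature (device, date, occurrence)
                    devices[device] = CSVDataLine.CSVDataLine(device, date, 0)
                devices[device].occurrence += 1  # assumed attribute name
        CSVutils.export_data_to_csv(filename, date_dict)
        return True
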
modules/crawler/ProcessedData/KOLOBEZKY/ignore.txt

    ignore.txt
    OD_ZCU_KOLOBEZKY_08_2019.CSV
    OD_ZCU_KOLOBEZKY_00_2019.CSV
    OD_ZCU_KOLOBEZKY_07_2019.CSV
    OD_ZCU_KOLOBEZKY_06_2019.CSV

modules/crawler/RemoveDataset.py

    import os
    import shutil

    # Path to crawled data
    CRAWLED_DATA_PATH = "CrawledData/"
    # Path to processed data
    PROCESSED_DATA_PATH = "ProcessedData/"
    # Path to crawler logs
    CRAWLER_LOGS_PATH = "CrawlerLogs/"
    # Path to dataset configuration files
    CONFIG_FILES_PATH = "DatasetConfigs"
    # Path for DatasetCrawlers implementations
    CRAWLER_PROGRAM_PATH = "DatasetCrawler"
    # Path for DatasetProcessors implementations
    PROCESSOR_PROGRAM_PATH = "DatasetProcessing"


    def remove_dataset(dataset_name):
        """
        Remove dataset
        Args:
            dataset_name: name of dataset that has existing configuration file
        """
        shutil.rmtree(CRAWLED_DATA_PATH + dataset_name + "/")
        shutil.rmtree(PROCESSED_DATA_PATH + dataset_name + "/")
        shutil.rmtree(CRAWLER_LOGS_PATH + dataset_name + "/")

        os.remove(CONFIG_FILES_PATH + "/" + dataset_name + ".yaml")
        os.remove(CRAWLER_PROGRAM_PATH + "/" + dataset_name + "Crawler.py")
        os.remove(PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "Processor.py")

        print("Dataset: " + dataset_name + " removed")

    remove_dataset("TEST")

modules/crawler/RemoveDatasetDatabase.py

    from Utilities.Database import DatabaseLoader


    def remove_dataset_database(dataset_name):
        """
        Removes dataset entries from database
        Args:
            dataset_name: name of dataset that has existing configuration file
        """
        # Creating connection
        mydb = DatabaseLoader.create_database_connection()

        # collection where are specified aviable datasets
        collection_datasets = mydb[DatabaseLoader.MONGODB_DATASET_COLLECTION]

        collection_datasets.delete_one({"name": dataset_name})
        print("Removing record from DATASETS collection")


        # Retrieve list of all collections
        collections = mydb.list_collection_names()

        # Drop of all collections
        for name in collections:
            if name.startswith(dataset_name):
                mydb[name].drop()
                print("Dropping: " + name)

        print("Database Cleaned")


    remove_dataset_database("KOLOBEZKY")

modules/crawler/ResetDatabaseData.py

    from Utilities.Database import DatabaseLoader


    def clean_database():
        """
        Deletes all collections from database
        """
        # Create connection
        mydb = DatabaseLoader.create_database_connection()

        # Retrieve list of all collections
        collections = mydb.list_collection_names()

        # Drop of all collections
        for name in collections:
            mydb[name].drop()

        print("Database Cleaned")


    clean_database()

modules/crawler/ResetDataset.py

    import os
    from Utilities import FolderProcessor

    # Path to crawled data
    CRAWLED_DATA_PATH = "../CrawledData/"
    # Path to processed data
    PROCESSED_DATA_PATH = "../ProcessedData/"
    # Path to crawler logs
    CRAWLER_LOGS_PATH = "../CrawlerLogs/"
    # Path to dataset configuration files
    CONFIG_FILES_PATH = "../DatasetConfigs"


    def create_ignore_file(path, text):
        """
        Creates ignore file
        Args:
            path: path to directory for creating ignore.txt
            text: text that will be on first line of ignore.txt can be None
        """
        with open(path + "/ignore.txt", "w") as file:
            if text is not None:
                file.write(text + "\n")


    def reset_dataset(dataset_name):
        """
        Resets all saved data in dataset except config and implementation
        Args:
            dataset_name: name of dataset that has existing configuration file
        """
        path = CRAWLED_DATA_PATH + dataset_name + "/"
        FolderProcessor.clean_folder(path)
        create_ignore_file(path, "ignore.txt")

        path = PROCESSED_DATA_PATH + dataset_name + "/"
        FolderProcessor.clean_folder(path)
        create_ignore_file(path, "ignore.txt")

        path = CRAWLER_LOGS_PATH + dataset_name + "/"
        FolderProcessor.clean_folder(path)
        create_ignore_file(path, None)


    def reset_all_datasets():
        """
        Resets all saved data in all datasets with config file except configs and implementation
        """
        datasets = os.listdir(CONFIG_FILES_PATH)

        for dataset in datasets:
            reset_dataset(dataset.split('.')[0])


    reset_all_datasets()

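FolderProcessor.clean_folder is not part of this revision. A plausible implementation, consistent with how reset_dataset uses it (empty the directory, then recreate ignore.txt inside it), might look like the following sketch; this is an assumption about the project's own helper, not its actual code:

    import os
    import shutil


    def clean_folder(path):
        """Remove everything inside path, keeping the directory itself (hypothetical)."""
        for entry in os.listdir(path):
            full = os.path.join(path, entry)
            if os.path.isdir(full):
                shutil.rmtree(full)  # remove nested folders
            else:
                os.remove(full)      # remove files, including the old ignore.txt
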
modules/crawler/Scripts/PrepareNewDataset.py

    (identical to modules/crawler/PrepareNewDataset.py above)

modules/crawler/Scripts/RemoveDataset.py

    import os
    import shutil

    # Path to crawled data
    CRAWLED_DATA_PATH = "../CrawledData/"
    # Path to processed data
    PROCESSED_DATA_PATH = "../ProcessedData/"
    # Path to crawler logs
    CRAWLER_LOGS_PATH = "../CrawlerLogs/"
    # Path to dataset configuration files
    CONFIG_FILES_PATH = "../DatasetConfigs"
    # Path for DatasetCrawlers implementations
    CRAWLER_PROGRAM_PATH = "../DatasetCrawler"
    # Path for DatasetProcessors implementations
    PROCESSOR_PROGRAM_PATH = "../DatasetProcessing"


    def remove_dataset(dataset_name):
        """
        Remove dataset
        Args:
            dataset_name: name of dataset that has existing configuration file
        """
        shutil.rmtree(CRAWLED_DATA_PATH + dataset_name + "/")
        shutil.rmtree(PROCESSED_DATA_PATH + dataset_name + "/")
        shutil.rmtree(CRAWLER_LOGS_PATH + dataset_name + "/")

        os.remove(CONFIG_FILES_PATH + "/" + dataset_name + ".yaml")
        os.remove(CRAWLER_PROGRAM_PATH + "/" + dataset_name + "Crawler.py")
        os.remove(PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "Processor.py")

        print("Dataset: " + dataset_name + " removed");

    remove_dataset("TEST");

modules/crawler/Scripts/RemoveDatasetDatabase.py

    from Utilities.Database import DatabaseLoader


    def remove_dataset_database(dataset_name):
        """
        Removes dataset entries from database
        Args:
            dataset_name: name of dataset that has existing configuration file
        """
        # Creating connection
        mydb = DatabaseLoader.create_database_connection();

        # collection where are specified aviable datasets
        collection_datasets = mydb[DatabaseLoader.MONGODB_DATASET_COLLECTION]

        collection_datasets.delete_one({"name": dataset_name})
        print("Removing record from DATASETS collection")


        # Retrieve list of all collections
        collections = mydb.list_collection_names()

        # Drop of all collections
        for name in collections:
            if name.startswith(dataset_name):
                mydb[name].drop()
                print("Dropping: " + name)

        print("Database Cleaned")


    remove_dataset_database("KOLOBEZKY")

modules/crawler/Scripts/ResetDatabaseData.py

    from Utilities.Database import DatabaseLoader


    def clean_database():
        """
        Deletes all collections from database
        """
        # Create connection
        mydb = DatabaseLoader.create_database_connection()

        # Retrieve list of all collections
        collections = mydb.list_collection_names()

        # Drop of all collections
        for name in collections:
            mydb[name].drop()

        print("Database Cleaned")


    clean_database();

modules/crawler/Scripts/ResetDataset.py

    (identical to modules/crawler/ResetDataset.py above)

modules/crawler/Utilities/Database/DatabaseLoader.py

     import pymongo

     # specify mongodb connection
    -MONGODB_CONNECTION = "mongodb://localhost:27017/"
    +MONGODB_CONNECTION = "mongodb://root:root@database"
     # mongodb account name
     MONGODB_ACC_NAME = "root"
     # mongodb account password

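The connection string now points at a host named "database" with root credentials instead of a local MongoDB, which suggests a containerized setup. create_database_connection itself is not shown in this revision; a minimal sketch of how the constant is presumably consumed, with the database name as a hypothetical placeholder:

    import pymongo

    MONGODB_CONNECTION = "mongodb://root:root@database"


    def create_database_connection():
        # connect to the MongoDB service reachable as "database"
        # (e.g. a docker-compose service name)
        client = pymongo.MongoClient(MONGODB_CONNECTION)
        return client["open-data-db"]  # hypothetical database name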