Projekt

Obecné

Profil

« Předchozí | Další » 

Revize 728f8c5d

Přidáno uživatelem Petr Hlaváč před asi 4 roky(ů)

hot fix

Zobrazit rozdíly:

modules/crawler/CrawledData/JIS/ignore.txt
1 1
ignore.txt
2
OD_ZCU_JIS_10_2019.CSV
3
OD_ZCU_JIS_03_2020.CSV
4
OD_ZCU_JIS_02_2020.CSV
5
OD_ZCU_JIS_00_2019.CSV
6
OD_ZCU_JIS_08_2019.CSV
7
OD_ZCU_JIS_12_2019.CSV
8
OD_ZCU_JIS_09_2019.CSV
9
OD_ZCU_JIS_01_2020.CSV
10
OD_ZCU_JIS_06_2019.CSV
11
OD_ZCU_JIS_11_2019.CSV
12
OD_ZCU_JIS_07_2019.CSV
modules/crawler/CrawledData/KOLOBEZKY/ignore.txt
1 1
ignore.txt
2
OD_ZCU_KOLOBEZKY_00_2019.CSV
3
OD_ZCU_KOLOBEZKY_06_2019.CSV
4
OD_ZCU_KOLOBEZKY_07_2019.CSV
5
OD_ZCU_KOLOBEZKY_08_2019.CSV
modules/crawler/CrawledData/WIFI/ignore.txt
1 1
ignore.txt
2
OD_ZCU_WIFI_07_2019.CSV
3
OD_ZCU_WIFI_00_2019.CSV
4
OD_ZCU_WIFI_06_2019.CSV
5
OD_ZCU_WIFI_08_2019.CSV
modules/crawler/CrawlerLogs/JIS/ignore.txt
1
https://openstore.zcu.cz/OD_ZCU_00_2019/OD_ZCU_JIS_00_2019_CSV.zip
2
https://openstore.zcu.cz/OD_ZCU_01_2020/OD_ZCU_JIS_01_2020_CSV.zip
3
https://openstore.zcu.cz/OD_ZCU_02_2020/OD_ZCU_JIS_02_2020_CSV.zip
4
https://openstore.zcu.cz/OD_ZCU_03_2020/OD_ZCU_JIS_03_2020_CSV.zip
5
https://openstore.zcu.cz/OD_ZCU_06_2019/OD_ZCU_JIS_06_2019_CSV.zip
6
https://openstore.zcu.cz/OD_ZCU_07_2019/OD_ZCU_JIS_07_2019_CSV.zip
7
https://openstore.zcu.cz/OD_ZCU_08_2019/OD_ZCU_JIS_08_2019_CSV.zip
8
https://openstore.zcu.cz/OD_ZCU_09_2019/OD_ZCU_JIS_09_2019_CSV.zip
9
https://openstore.zcu.cz/OD_ZCU_10_2019/OD_ZCU_JIS_10_2019_CSV.zip
10
https://openstore.zcu.cz/OD_ZCU_11_2019/OD_ZCU_JIS_11_2019_CSV.zip
11
https://openstore.zcu.cz/OD_ZCU_12_2019/OD_ZCU_JIS_12_2019_CSV.zip
modules/crawler/CrawlerLogs/KOLOBEZKY/ignore.txt
1
https://openstore.zcu.cz/OD_ZCU_00_2019/OD_ZCU_KOLOBEZKY_00_2019_CSV.zip
2
https://openstore.zcu.cz/OD_ZCU_06_2019/OD_ZCU_KOLOBEZKY_06_2019_CSV.zip
3
https://openstore.zcu.cz/OD_ZCU_07_2019/OD_ZCU_KOLOBEZKY_07_2019_CSV.zip
4
https://openstore.zcu.cz/OD_ZCU_08_2019/OD_ZCU_KOLOBEZKY_08_2019_CSV.zip
modules/crawler/CrawlerLogs/WIFI/ignore.txt
1
https://openstore.zcu.cz/OD_ZCU_00_2019/OD_ZCU_WIFI_00_2019_CSV.zip
2
https://openstore.zcu.cz/OD_ZCU_06_2019/OD_ZCU_WIFI_06_2019_CSV.zip
3
https://openstore.zcu.cz/OD_ZCU_07_2019/OD_ZCU_WIFI_07_2019_CSV.zip
4
https://openstore.zcu.cz/OD_ZCU_08_2019/OD_ZCU_WIFI_08_2019_CSV.zip
modules/crawler/DatasetConfigs/JIS.yaml
8 8
update-period: 24
9 9
# pozice jednotlivych zarizeni, ktera jsou v datasetu
10 10
devices:
11
  - US 005 - z?vora vjezd:
12
      x: UNKNOWN!
13
      y: UNKNOWN!
14

  
15
  - STUD_KL20:
16
      x: UNKNOWN!
17
      y: UNKNOWN!
18

  
19
  - US 005 - m?? vjezd:
20
      x: UNKNOWN!
21
      y: UNKNOWN!
22

  
23
  - Menza4-kasa3:
24
      x: UNKNOWN!
25
      y: UNKNOWN!
26

  
27
  - NTIS-BUFET:
28
      x: UNKNOWN!
29
      y: UNKNOWN!
30

  
31
  - Parkoviste-vjezd:
32
      x: UNKNOWN!
33
      y: UNKNOWN!
34

  
35
  - Menza4-kasa1:
36
      x: UNKNOWN!
37
      y: UNKNOWN!
38

  
39
  - Menza1-kasa-p:
40
      x: UNKNOWN!
41
      y: UNKNOWN!
42

  
43
  - Menza4-kasa2:
44
      x: UNKNOWN!
45
      y: UNKNOWN!
46

  
47
  - L2:
48
      x: UNKNOWN!
49
      y: UNKNOWN!
50

  
51
  - EP-BUFET:
52
      x: UNKNOWN!
53
      y: UNKNOWN!
54

  
55
  - KolaBory-vnitrni:
56
      x: UNKNOWN!
57
      y: UNKNOWN!
58

  
59
  - Menza1-kasa-l:
60
      x: UNKNOWN!
61
      y: UNKNOWN!
62

  
63
  - Zavora-FEL:
64
      x: UNKNOWN!
65
      y: UNKNOWN!
66

  
67
  - Zavora-NTIS-vjezd:
68
      x: UNKNOWN!
69
      y: UNKNOWN!
70

  
71
  - L-Posilovna:
72
      x: UNKNOWN!
73
      y: UNKNOWN!
74

  
75
  - STUD_ST407:
76
      x: UNKNOWN!
77
      y: UNKNOWN!
78

  
79
  - M16:
80
      x: UNKNOWN!
81
      y: UNKNOWN!
82

  
83
  - Zavora-Kaplirova:
84
      x: UNKNOWN!
85
      y: UNKNOWN!
86

  
87
  - Zavora-FDU:
88
      x: UNKNOWN!
89
      y: UNKNOWN!
90

  
91
  - KolaBory-vnejsi:
92
      x: UNKNOWN!
93
      y: UNKNOWN!
94

  
95
  - STUD_VC53:
96
      x: UNKNOWN!
97
      y: UNKNOWN!
98

  
99
  - Menza4-kasa4:
100
      x: UNKNOWN!
101
      y: UNKNOWN!
102

  
103
  - KL-Posilovna:
104
      x: UNKNOWN!
105
      y: UNKNOWN!
106

  
107
  - VC-VYJEZD:
108
      x: UNKNOWN!
109
      y: UNKNOWN!
110

  
111
  - EXT/kola-B:
112
      x: UNKNOWN!
113
      y: UNKNOWN!
114

  
115
  - Menza4-kasa5:
116
      x: UNKNOWN!
117
      y: UNKNOWN!
118

  
119
  - L1L2-vchod:
120
      x: UNKNOWN!
121
      y: UNKNOWN!
122

  
123
  - VC-VJEZD:
124
      x: UNKNOWN!
125
      y: UNKNOWN!
126

  
127
  - A3:
128
      x: UNKNOWN!
129
      y: UNKNOWN!
130

  
131
  - STUD_UB113:
132
      x: UNKNOWN!
133
      y: UNKNOWN!
134

  
135
  - STUD_CHEB:
136
      x: UNKNOWN!
137
      y: UNKNOWN!
138

  
139
  - STUD_PRA1:
140
      x: UNKNOWN!
141
      y: UNKNOWN!
142

  
143
  - L1:
144
      x: UNKNOWN!
145
      y: UNKNOWN!
146

  
147
  - STUD_UB211:
148
      x: UNKNOWN!
149
      y: UNKNOWN!
150

  
151
  - A1:
152
      x: UNKNOWN!
153
      y: UNKNOWN!
154

  
155
  - EXT/kola:
156
      x: UNKNOWN!
157
      y: UNKNOWN!
158

  
159
  - STUD_KL87:
160
      x: UNKNOWN!
161
      y: UNKNOWN!
162

  
163
  - UV1-Bufet:
164
      x: UNKNOWN!
165
      y: UNKNOWN!
166

  
167
  - M14:
168
      x: UNKNOWN!
169
      y: UNKNOWN!
170

  
171
  - Zavora-NTIS-vyjezd:
172
      x: UNKNOWN!
173
      y: UNKNOWN!
174

  
175
  - B3-kolarna:
176
      x: UNKNOWN!
177
      y: UNKNOWN!
178

  
179
  - B3-LEVY:
180
      x: UNKNOWN!
181
      y: UNKNOWN!
182

  
183
  - MenzaKL-vydej:
184
      x: UNKNOWN!
185
      y: UNKNOWN!
186

  
187
  - A2-Hlavni vchod:
188
      x: UNKNOWN!
189
      y: UNKNOWN!
190

  
191
  - Parkoviste-vyjezd:
192
      x: UNKNOWN!
193
      y: UNKNOWN!
194

  
11
  - Menza4-kasa5:
12
      x: UNKNOWN!
13
      y: UNKNOWN!
14

  
15
  - Zavora-FDU:
16
      x: UNKNOWN!
17
      y: UNKNOWN!
18

  
19
  - STUD_PRA1:
20
      x: UNKNOWN!
21
      y: UNKNOWN!
22

  
23
  - B3-LEVY:
24
      x: UNKNOWN!
25
      y: UNKNOWN!
26

  
27
  - KolaBory-vnejsi:
28
      x: UNKNOWN!
29
      y: UNKNOWN!
30

  
31
  - VC-VYJEZD:
32
      x: UNKNOWN!
33
      y: UNKNOWN!
34

  
35
  - L-Posilovna:
36
      x: UNKNOWN!
37
      y: UNKNOWN!
38

  
39
  - A3:
40
      x: UNKNOWN!
41
      y: UNKNOWN!
42

  
43
  - Menza4-kasa1:
44
      x: UNKNOWN!
45
      y: UNKNOWN!
46

  
47
  - US 005 - závora vjezd:
48
      x: UNKNOWN!
49
      y: UNKNOWN!
50

  
51
  - EP-BUFET:
52
      x: UNKNOWN!
53
      y: UNKNOWN!
54

  
55
  - Zavora-FEL:
56
      x: UNKNOWN!
57
      y: UNKNOWN!
58

  
59
  - US 005 - mříž vjezd:
60
      x: UNKNOWN!
61
      y: UNKNOWN!
62

  
63
  - STUD_VC53:
64
      x: UNKNOWN!
65
      y: UNKNOWN!
66

  
67
  - NTIS-BUFET:
68
      x: UNKNOWN!
69
      y: UNKNOWN!
70

  
71
  - Zavora-NTIS-vjezd:
72
      x: UNKNOWN!
73
      y: UNKNOWN!
74

  
75
  - EXT/kola:
76
      x: UNKNOWN!
77
      y: UNKNOWN!
78

  
79
  - VC-VJEZD:
80
      x: UNKNOWN!
81
      y: UNKNOWN!
82

  
83
  - Zavora-Kaplirova:
84
      x: UNKNOWN!
85
      y: UNKNOWN!
86

  
87
  - M16:
88
      x: UNKNOWN!
89
      y: UNKNOWN!
90

  
91
  - Menza1-kasa-p:
92
      x: UNKNOWN!
93
      y: UNKNOWN!
94

  
95
  - STUD_CHEB:
96
      x: UNKNOWN!
97
      y: UNKNOWN!
98

  
99
  - M14:
100
      x: UNKNOWN!
101
      y: UNKNOWN!
102

  
103
  - L2:
104
      x: UNKNOWN!
105
      y: UNKNOWN!
106

  
107
  - STUD_UB113:
108
      x: UNKNOWN!
109
      y: UNKNOWN!
110

  
111
  - B3-kolarna:
112
      x: UNKNOWN!
113
      y: UNKNOWN!
114

  
115
  - L1:
116
      x: UNKNOWN!
117
      y: UNKNOWN!
118

  
119
  - Menza1-kasa-l:
120
      x: UNKNOWN!
121
      y: UNKNOWN!
122

  
123
  - Zavora-NTIS-vyjezd:
124
      x: UNKNOWN!
125
      y: UNKNOWN!
126

  
127
  - Menza4-kasa4:
128
      x: UNKNOWN!
129
      y: UNKNOWN!
130

  
131
  - MenzaKL-vydej:
132
      x: UNKNOWN!
133
      y: UNKNOWN!
134

  
135
  - Parkoviste-vjezd:
136
      x: UNKNOWN!
137
      y: UNKNOWN!
138

  
139
  - KL-Posilovna:
140
      x: UNKNOWN!
141
      y: UNKNOWN!
142

  
143
  - A1:
144
      x: UNKNOWN!
145
      y: UNKNOWN!
146

  
147
  - KolaBory-vnitrni:
148
      x: UNKNOWN!
149
      y: UNKNOWN!
150

  
151
  - EXT/kola-B:
152
      x: UNKNOWN!
153
      y: UNKNOWN!
154

  
155
  - A2-Hlavni vchod:
156
      x: UNKNOWN!
157
      y: UNKNOWN!
158

  
159
  - STUD_ST407:
160
      x: UNKNOWN!
161
      y: UNKNOWN!
162

  
163
  - STUD_KL87:
164
      x: UNKNOWN!
165
      y: UNKNOWN!
166

  
167
  - Menza4-kasa2:
168
      x: UNKNOWN!
169
      y: UNKNOWN!
170

  
171
  - L1L2-vchod:
172
      x: UNKNOWN!
173
      y: UNKNOWN!
174

  
175
  - Menza4-kasa3:
176
      x: UNKNOWN!
177
      y: UNKNOWN!
178

  
179
  - Parkoviste-vyjezd:
180
      x: UNKNOWN!
181
      y: UNKNOWN!
182

  
183
  - STUD_KL20:
184
      x: UNKNOWN!
185
      y: UNKNOWN!
186

  
187
  - UV1-Bufet:
188
      x: UNKNOWN!
189
      y: UNKNOWN!
190

  
191
  - STUD_UB211:
192
      x: UNKNOWN!
193
      y: UNKNOWN!
194

  
modules/crawler/PrepareNewDataset.py
1
import os
2

  
3
# Path to crawled data
4
CRAWLED_DATA_PATH = "../CrawledData/"
5
# Path to processed data
6
PROCESSED_DATA_PATH = "../ProcessedData/"
7
# Path to crawler logs
8
CRAWLER_LOGS_PATH = "../CrawlerLogs/"
9
# Path for DatasetCrawlers implementations
10
CRAWLER_PROGRAM_PATH = "../DatasetCrawler"
11
# Path for DatasetProcessors implementations
12
PROCESSOR_PROGRAM_PATH = "../DatasetProcessing"
13
# Path to dataset configuration files
14
CONFIG_FILES_PATH = "../DatasetConfigs"
15

  
16

  
17
def create_default_config_file(dataset_name):
    """
    Write a template YAML configuration for a new dataset.

    The file is created as CONFIG_FILES_PATH/<dataset_name>.yaml (an existing
    file is overwritten) and contains placeholder values for url, regex and
    update-period plus an empty devices section.

    Args:
        dataset_name: Name of newly created dataset
    """
    template_lines = (
        "# jmeno datasetu, pod kterym bude zobrazen v aplikaci\n",
        "dataset-name: " + dataset_name + "\n",
        "# root slozka, ktera obsahuje odkazy na dataset\n",
        "url: ZDE VLOZTE URL\n",
        "# volitelny parameter, ktery specifikuje vzor jmrna datasetu, ktera se budou stahovat\n",
        "regex: ZDE VLOZTE REGEX\n",
        "# volitelny parametr, ktery udava jak casto se budou hledat nove datasety, pokud prazdne, "
        "tak defaultni hodnota (dny)\n",
        "update-period: ZDE VLOZTE HODNOTU\n",
        "# pozice jednotlivych zarizeni, ktera jsou v datasetu\n",
        "devices:\n",
    )

    config_path = CONFIG_FILES_PATH + "/" + dataset_name + ".yaml"
    with open(config_path, "w") as config_file:
        config_file.writelines(template_lines)
36

  
37

  
38
def create_default_processor(dataset_name):
    """
    Generate the skeleton processor module of a new dataset.

    The module is written to PROCESSOR_PROGRAM_PATH/<dataset_name>Processor.py
    (an existing file is overwritten). Its process_file function only prints
    a reminder and returns False until somebody implements it.

    Args:
        dataset_name: Name of newly created dataset
    """
    # The first entry deliberately carries no line break of its own - the two
    # separate "\n" entries that follow supply it.
    stub_source = (
        "from Utilities.CSV import CSVDataLine, CSVutils",
        "\n",
        "\n",
        "def process_file(filename):\n",
        '    """\n',
        "    Method that take path to crawled file and outputs date dictionary using method:\n",
        "    CSVutils.export_data_to_csv(filename, date_dict)\n",
        "    Date dictionary is a dictionary where keys are dates in format ddmmYYYYhh (0804201815)\n",
        "    and value is dictionary where keys devices (specified in configuration file)\n",
        "    and value is CSVDataLine.CSVDataLine with device,date and occurrence\n",
        "\n",
        "    Args:\n",
        "    filename: name of processed file\n",
        "\n",
        "    Returns:\n",
        "    False if not implemented\n",
        "    True when implemented\n",
        '    """\n',
        '    print("You must implements process_file method first!")\n',
        "    #CSVutils.export_data_to_csv(filename, date_dict)\n",
        "    return False\n",
    )

    processor_path = PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "Processor.py"
    with open(processor_path, "w") as processor_file:
        processor_file.writelines(stub_source)
67

  
68

  
69
def create_default_crawler(dataset_name):
    """
    Generate the skeleton crawler module of a new dataset.

    The module is written to CRAWLER_PROGRAM_PATH/<dataset_name>Crawler.py
    (an existing file is overwritten). Its crawl function reads dataset-name,
    url and regex from the config and prints a reminder to implement it.

    Args:
        dataset_name: Name of newly created dataset
    """
    stub_source = (
        "# Path to crawled data\n",
        'CRAWLED_DATA_PATH = "CrawledData/"\n',
        "\n",
        "\n",
        "def crawl(config):\n",
        '    """\n',
        "    Implement crawl method that downloads new data to path_for_files\n",
        "    For keeping the project structure\n",
        "    url , regex, and dataset_name from config\n",
        "    You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py\n",
        "\n",
        "    Args:\n",
        "        config: loaded configuration file of dataset\n",
        '    """\n',
        '    dataset_name = config["dataset-name"]\n',
        "    url = config['url']\n",
        "    regex = config['regex']\n",
        "    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'\n",
        '    print("You must implements Crawl method first!")\n',
    )

    crawler_path = CRAWLER_PROGRAM_PATH + "/" + dataset_name + "Crawler.py"
    with open(crawler_path, "w") as crawler_file:
        crawler_file.writelines(stub_source)
97

  
98

  
99
def create_ignore_file(path, text):
    """
    Create (or overwrite) ignore.txt inside the given directory.

    Args:
        path: path to directory for creating ignore.txt
        text: text for the first line of ignore.txt; with None the file
            is left empty
    """
    with open(path + "/ignore.txt", "w") as ignore_file:
        # an empty list writes nothing, which leaves the file empty
        content = [] if text is None else [text + "\n"]
        ignore_file.writelines(content)
109

  
110

  
111
def prepare_dataset_structure(dataset_name):
    """
    Prepares folders and default source files for new dataset

    Creates the dataset's sub-folder under CRAWLED_DATA_PATH,
    PROCESSED_DATA_PATH and CRAWLER_LOGS_PATH, puts an ignore.txt into each
    of them and then generates the default crawler, processor and
    configuration file.

    A folder that cannot be created (for instance because it already exists)
    is reported on standard output and skipped; the remaining steps still run.

    Args:
        dataset_name: Name of newly created dataset
    """
    # (root folder, first line of ignore.txt); the crawler logs folder
    # starts with an empty ignore.txt
    folders = (
        (CRAWLED_DATA_PATH, "ignore.txt"),
        (PROCESSED_DATA_PATH, "ignore.txt"),
        (CRAWLER_LOGS_PATH, None),
    )

    for root, ignore_text in folders:
        path = root + dataset_name
        try:
            os.mkdir(path)
            create_ignore_file(path, ignore_text)
        except OSError as e:
            # print the underlying error so the reason of the failure is visible
            print(e)
            print("Creation of the directory %s failed" % path)

    create_default_crawler(dataset_name)
    create_default_processor(dataset_name)
    create_default_config_file(dataset_name)
147

  
148

  
149
prepare_dataset_structure("TEST")
modules/crawler/ProcessedData/KOLOBEZKY/ignore.txt
1 1
ignore.txt
2
OD_ZCU_KOLOBEZKY_08_2019.CSV
3
OD_ZCU_KOLOBEZKY_00_2019.CSV
4
OD_ZCU_KOLOBEZKY_07_2019.CSV
5
OD_ZCU_KOLOBEZKY_06_2019.CSV
modules/crawler/RemoveDataset.py
1
import os
2
import shutil
3

  
4
# Path to crawled data
5
CRAWLED_DATA_PATH = "CrawledData/"
6
# Path to processed data
7
PROCESSED_DATA_PATH = "ProcessedData/"
8
# Path to crawler logs
9
CRAWLER_LOGS_PATH = "CrawlerLogs/"
10
# Path to dataset configuration files
11
CONFIG_FILES_PATH = "DatasetConfigs"
12
# Path for DatasetCrawlers implementations
13
CRAWLER_PROGRAM_PATH = "DatasetCrawler"
14
# Path for DatasetProcessors implementations
15
PROCESSOR_PROGRAM_PATH = "DatasetProcessing"
16

  
17

  
18
def remove_dataset(dataset_name):
    """
    Delete every folder and generated file that belongs to a dataset.

    Removes the dataset's crawled data, processed data and crawler logs
    folders, then its configuration file, crawler and processor. Nothing is
    caught here: the first path that cannot be removed raises and stops the
    removal.

    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    for data_root in (CRAWLED_DATA_PATH, PROCESSED_DATA_PATH, CRAWLER_LOGS_PATH):
        shutil.rmtree(data_root + dataset_name + "/")

    generated_files = (
        CONFIG_FILES_PATH + "/" + dataset_name + ".yaml",
        CRAWLER_PROGRAM_PATH + "/" + dataset_name + "Crawler.py",
        PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "Processor.py",
    )
    for file_path in generated_files:
        os.remove(file_path)

    print("Dataset: " + dataset_name + " removed")
33

  
34
remove_dataset("TEST")
modules/crawler/RemoveDatasetDatabase.py
1
from Utilities.Database import DatabaseLoader
2

  
3

  
4
def remove_dataset_database(dataset_name):
    """
    Removes dataset entries from database

    Deletes the dataset's record from the collection that lists the
    available datasets and drops every collection whose name starts with
    dataset_name.

    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    database = DatabaseLoader.create_database_connection()

    # collection in which the available datasets are specified
    datasets = database[DatabaseLoader.MONGODB_DATASET_COLLECTION]
    datasets.delete_one({"name": dataset_name})
    print("Removing record from DATASETS collection")

    # pick the collections carrying the dataset name as prefix, then drop them
    matching = [
        collection_name
        for collection_name in database.list_collection_names()
        if collection_name.startswith(dataset_name)
    ]
    for collection_name in matching:
        database[collection_name].drop()
        print("Dropping: " + collection_name)

    print("Database Cleaned")
30

  
31

  
32
remove_dataset_database("KOLOBEZKY")
modules/crawler/ResetDatabaseData.py
1
from Utilities.Database import DatabaseLoader
2

  
3

  
4
def clean_database():
    """
    Drop every collection of the database
    """
    database = DatabaseLoader.create_database_connection()

    # walk over the names of all existing collections and drop each one
    for collection_name in database.list_collection_names():
        database[collection_name].drop()

    print("Database Cleaned")
19

  
20

  
21
clean_database()
modules/crawler/ResetDataset.py
1
import os
2
from Utilities import FolderProcessor
3

  
4
# Path to crawled data
5
CRAWLED_DATA_PATH = "../CrawledData/"
6
# Path to processed data
7
PROCESSED_DATA_PATH = "../ProcessedData/"
8
# Path to crawler logs
9
CRAWLER_LOGS_PATH = "../CrawlerLogs/"
10
# Path to dataset configuration files
11
CONFIG_FILES_PATH = "../DatasetConfigs"
12

  
13

  
14
def create_ignore_file(path, text):
    """
    Create (or overwrite) ignore.txt inside the given directory.

    Args:
        path: path to directory for creating ignore.txt
        text: text for the first line of ignore.txt; with None the file
            is left empty
    """
    with open(path + "/ignore.txt", "w") as ignore_file:
        # an empty list writes nothing, which leaves the file empty
        content = [] if text is None else [text + "\n"]
        ignore_file.writelines(content)
24

  
25

  
26
def reset_dataset(dataset_name):
    """
    Resets all saved data in dataset except config and implementation

    For the dataset's crawled data, processed data and crawler logs folder
    (in that order) the folder is cleaned and a fresh ignore.txt is created.

    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    # (root folder, first line of the new ignore.txt); the crawler logs
    # folder gets an empty ignore.txt
    targets = (
        (CRAWLED_DATA_PATH, "ignore.txt"),
        (PROCESSED_DATA_PATH, "ignore.txt"),
        (CRAWLER_LOGS_PATH, None),
    )

    for root, first_line in targets:
        folder = root + dataset_name + "/"
        FolderProcessor.clean_folder(folder)
        create_ignore_file(folder, first_line)
43

  
44

  
45
def reset_all_datasets():
    """
    Resets all saved data in all datasets with config file except configs and implementation

    Dataset names are derived from the <name>.yaml files found in
    CONFIG_FILES_PATH; every other directory entry is ignored.
    """
    for config_file in os.listdir(CONFIG_FILES_PATH):
        # splitext keeps dots inside the dataset name ("a.b.yaml" -> "a.b")
        dataset_name, extension = os.path.splitext(config_file)

        # Only <name>.yaml entries describe a dataset. Hidden files such as
        # ".gitignore" would otherwise produce an empty dataset name, and
        # reset_dataset("") would operate on the root data folders themselves.
        if extension != ".yaml" or not dataset_name:
            continue

        reset_dataset(dataset_name)
53

  
54

  
55
reset_all_datasets()
modules/crawler/Scripts/PrepareNewDataset.py
1
import os
2

  
3
# Path to crawled data
4
CRAWLED_DATA_PATH = "../CrawledData/"
5
# Path to processed data
6
PROCESSED_DATA_PATH = "../ProcessedData/"
7
# Path to crawler logs
8
CRAWLER_LOGS_PATH = "../CrawlerLogs/"
9
# Path for DatasetCrawlers implementations
10
CRAWLER_PROGRAM_PATH = "../DatasetCrawler"
11
# Path for DatasetProcessors implementations
12
PROCESSOR_PROGRAM_PATH = "../DatasetProcessing"
13
# Path to dataset configuration files
14
CONFIG_FILES_PATH = "../DatasetConfigs"
15

  
16

  
17
def create_default_config_file(dataset_name):
    """
    Write a template YAML configuration for a new dataset.

    The file is created as CONFIG_FILES_PATH/<dataset_name>.yaml (an existing
    file is overwritten) and contains placeholder values for url, regex and
    update-period plus an empty devices section.

    Args:
        dataset_name: Name of newly created dataset
    """
    template = "".join([
        "# jmeno datasetu, pod kterym bude zobrazen v aplikaci\n",
        "dataset-name: " + dataset_name + "\n",
        "# root slozka, ktera obsahuje odkazy na dataset\n",
        "url: ZDE VLOZTE URL\n",
        "# volitelny parameter, ktery specifikuje vzor jmrna datasetu, ktera se budou stahovat\n",
        "regex: ZDE VLOZTE REGEX\n",
        "# volitelny parametr, ktery udava jak casto se budou hledat nove datasety, pokud prazdne, "
        "tak defaultni hodnota (dny)\n",
        "update-period: ZDE VLOZTE HODNOTU\n",
        "# pozice jednotlivych zarizeni, ktera jsou v datasetu\n",
        "devices:\n",
    ])

    with open(CONFIG_FILES_PATH + "/" + dataset_name + ".yaml", "w") as config_file:
        config_file.write(template)
36

  
37

  
38
def create_default_processor(dataset_name):
    """
    Generate the skeleton processor module of a new dataset.

    The module is written to PROCESSOR_PROGRAM_PATH/<dataset_name>Processor.py
    (an existing file is overwritten). Its process_file function only prints
    a reminder and returns False until somebody implements it.

    Args:
        dataset_name: Name of newly created dataset
    """
    # The import line has no line break of its own; the two "\n" pieces after
    # it end that line and add one blank line before the function.
    stub_source = "".join([
        "from Utilities.CSV import CSVDataLine, CSVutils",
        "\n",
        "\n",
        "def process_file(filename):\n",
        '    """\n',
        "    Method that take path to crawled file and outputs date dictionary using method:\n",
        "    CSVutils.export_data_to_csv(filename, date_dict)\n",
        "    Date dictionary is a dictionary where keys are dates in format ddmmYYYYhh (0804201815)\n",
        "    and value is dictionary where keys devices (specified in configuration file)\n",
        "    and value is CSVDataLine.CSVDataLine with device,date and occurrence\n",
        "\n",
        "    Args:\n",
        "    filename: name of processed file\n",
        "\n",
        "    Returns:\n",
        "    False if not implemented\n",
        "    True when implemented\n",
        '    """\n',
        '    print("You must implements process_file method first!")\n',
        "    #CSVutils.export_data_to_csv(filename, date_dict)\n",
        "    return False\n",
    ])

    with open(PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "Processor.py", "w") as processor_file:
        processor_file.write(stub_source)
67

  
68

  
69
def create_default_crawler(dataset_name):
    """
    Generate the skeleton crawler module of a new dataset.

    The module is written to CRAWLER_PROGRAM_PATH/<dataset_name>Crawler.py
    (an existing file is overwritten). Its crawl function reads dataset-name,
    url and regex from the config and prints a reminder to implement it.

    Args:
        dataset_name: Name of newly created dataset
    """
    stub_source = "".join([
        "# Path to crawled data\n",
        'CRAWLED_DATA_PATH = "CrawledData/"\n',
        "\n",
        "\n",
        "def crawl(config):\n",
        '    """\n',
        "    Implement crawl method that downloads new data to path_for_files\n",
        "    For keeping the project structure\n",
        "    url , regex, and dataset_name from config\n",
        "    You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py\n",
        "\n",
        "    Args:\n",
        "        config: loaded configuration file of dataset\n",
        '    """\n',
        '    dataset_name = config["dataset-name"]\n',
        "    url = config['url']\n",
        "    regex = config['regex']\n",
        "    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'\n",
        '    print("You must implements Crawl method first!")\n',
    ])

    with open(CRAWLER_PROGRAM_PATH + "/" + dataset_name + "Crawler.py", "w") as crawler_file:
        crawler_file.write(stub_source)
97

  
98

  
99
def create_ignore_file(path, text):
    """
    Create (or overwrite) ignore.txt inside the given directory.

    Args:
        path: path to directory for creating ignore.txt
        text: text for the first line of ignore.txt; with None the file
            is left empty
    """
    with open(path + "/ignore.txt", "w") as ignore_file:
        if text is None:
            # opening in "w" mode already left an empty file behind
            return
        ignore_file.write(text + "\n")
109

  
110

  
111
def prepare_dataset_structure(dataset_name):
    """
    Prepares folders and default source files for new dataset

    Creates the dataset's sub-folder under CRAWLED_DATA_PATH,
    PROCESSED_DATA_PATH and CRAWLER_LOGS_PATH, puts an ignore.txt into each
    of them and then generates the default crawler, processor and
    configuration file.

    A folder that cannot be created (for instance because it already exists)
    is reported on standard output and skipped; the remaining steps still run.

    Args:
        dataset_name: Name of newly created dataset
    """
    # (root folder, first line of ignore.txt); the crawler logs folder
    # starts with an empty ignore.txt
    folders = (
        (CRAWLED_DATA_PATH, "ignore.txt"),
        (PROCESSED_DATA_PATH, "ignore.txt"),
        (CRAWLER_LOGS_PATH, None),
    )

    for root, ignore_text in folders:
        path = root + dataset_name
        try:
            os.mkdir(path)
            create_ignore_file(path, ignore_text)
        except OSError as e:
            # print the underlying error so the reason of the failure is visible
            print(e)
            print("Creation of the directory %s failed" % path)

    create_default_crawler(dataset_name)
    create_default_processor(dataset_name)
    create_default_config_file(dataset_name)
147

  
148

  
149
prepare_dataset_structure("TEST")
modules/crawler/Scripts/RemoveDataset.py
1
import os
2
import shutil
3

  
4
# Path to crawled data
5
CRAWLED_DATA_PATH = "../CrawledData/"
6
# Path to processed data
7
PROCESSED_DATA_PATH = "../ProcessedData/"
8
# Path to crawler logs
9
CRAWLER_LOGS_PATH = "../CrawlerLogs/"
10
# Path to dataset configuration files
11
CONFIG_FILES_PATH = "../DatasetConfigs"
12
# Path for DatasetCrawlers implementations
13
CRAWLER_PROGRAM_PATH = "../DatasetCrawler"
14
# Path for DatasetProcessors implementations
15
PROCESSOR_PROGRAM_PATH = "../DatasetProcessing"
16

  
17

  
18
def remove_dataset(dataset_name):
    """
    Remove dataset

    Deletes the dataset's crawled data, processed data and crawler logs
    folders, then its configuration file, crawler and processor. Nothing is
    caught here: the first path that cannot be removed raises and stops the
    removal.

    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    # data folders of the dataset
    shutil.rmtree(CRAWLED_DATA_PATH + dataset_name + "/")
    shutil.rmtree(PROCESSED_DATA_PATH + dataset_name + "/")
    shutil.rmtree(CRAWLER_LOGS_PATH + dataset_name + "/")

    # configuration and implementation files of the dataset
    os.remove(CONFIG_FILES_PATH + "/" + dataset_name + ".yaml")
    os.remove(CRAWLER_PROGRAM_PATH + "/" + dataset_name + "Crawler.py")
    os.remove(PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "Processor.py")

    print("Dataset: " + dataset_name + " removed")
33

  
34
# Script entry: removes the hard-coded "TEST" dataset whenever this file is executed
remove_dataset("TEST")
modules/crawler/Scripts/RemoveDatasetDatabase.py
1
from Utilities.Database import DatabaseLoader
2

  
3

  
4
def remove_dataset_database(dataset_name):
    """
    Removes dataset entries from database

    Deletes the dataset's record from the collection that lists the
    available datasets and drops every collection whose name starts with
    dataset_name.

    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    # Creating connection
    mydb = DatabaseLoader.create_database_connection()

    # collection in which the available datasets are specified
    collection_datasets = mydb[DatabaseLoader.MONGODB_DATASET_COLLECTION]

    collection_datasets.delete_one({"name": dataset_name})
    print("Removing record from DATASETS collection")

    # Retrieve list of all collections
    collections = mydb.list_collection_names()

    # Drop all collections that belong to the dataset
    # NOTE(review): this is a plain prefix match, so a dataset whose name is a
    # prefix of another dataset's name would match that one's collections too
    # - confirm the collection naming scheme rules this out.
    for name in collections:
        if name.startswith(dataset_name):
            mydb[name].drop()
            print("Dropping: " + name)

    print("Database Cleaned")
30

  
31

  
32
remove_dataset_database("KOLOBEZKY")
modules/crawler/Scripts/ResetDatabaseData.py
1
from Utilities.Database import DatabaseLoader
2

  
3

  
4
def clean_database():
    """
    Drop every collection of the database
    """
    database = DatabaseLoader.create_database_connection()

    # walk over the names of all existing collections and drop each one
    for collection_name in database.list_collection_names():
        database[collection_name].drop()

    print("Database Cleaned")
19

  
20

  
21
# Script entry: drops all database collections whenever this file is executed
clean_database()
modules/crawler/Scripts/ResetDataset.py
1
import os
2
from Utilities import FolderProcessor
3

  
4
# Path to crawled data
5
CRAWLED_DATA_PATH = "../CrawledData/"
6
# Path to processed data
7
PROCESSED_DATA_PATH = "../ProcessedData/"
8
# Path to crawler logs
9
CRAWLER_LOGS_PATH = "../CrawlerLogs/"
10
# Path to dataset configuration files
11
CONFIG_FILES_PATH = "../DatasetConfigs"
12

  
13

  
14
def create_ignore_file(path, text):
    """
    Create (or overwrite) ignore.txt inside the given directory.

    Args:
        path: path to directory for creating ignore.txt
        text: text for the first line of ignore.txt; with None the file
            is left empty
    """
    with open(path + "/ignore.txt", "w") as ignore_file:
        if text is None:
            # opening in "w" mode already left an empty file behind
            return
        ignore_file.write(text + "\n")
24

  
25

  
26
def reset_dataset(dataset_name):
    """
    Resets all saved data in dataset except config and implementation

    For the dataset's crawled data, processed data and crawler logs folder
    (in that order) the folder is cleaned and a fresh ignore.txt is created.

    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    # (root folder, first line of the new ignore.txt); the crawler logs
    # folder gets an empty ignore.txt
    targets = (
        (CRAWLED_DATA_PATH, "ignore.txt"),
        (PROCESSED_DATA_PATH, "ignore.txt"),
        (CRAWLER_LOGS_PATH, None),
    )

    for root, first_line in targets:
        folder = root + dataset_name + "/"
        FolderProcessor.clean_folder(folder)
        create_ignore_file(folder, first_line)
43

  
44

  
45
def reset_all_datasets():
    """
    Resets all saved data in all datasets with config file except configs and implementation

    Dataset names are derived from the <name>.yaml files found in
    CONFIG_FILES_PATH; every other directory entry is ignored.
    """
    for config_file in os.listdir(CONFIG_FILES_PATH):
        # splitext keeps dots inside the dataset name ("a.b.yaml" -> "a.b")
        dataset_name, extension = os.path.splitext(config_file)

        # Only <name>.yaml entries describe a dataset. Hidden files such as
        # ".gitignore" would otherwise produce an empty dataset name, and
        # reset_dataset("") would operate on the root data folders themselves.
        if extension != ".yaml" or not dataset_name:
            continue

        reset_dataset(dataset_name)
53

  
54

  
55
reset_all_datasets()
modules/crawler/Utilities/Database/DatabaseLoader.py
2 2
import pymongo
3 3

  
4 4
# specify mongodb connection
5
MONGODB_CONNECTION = "mongodb://localhost:27017/"
5
MONGODB_CONNECTION = "mongodb://root:root@database"
6 6
# mongodb account name
7 7
MONGODB_ACC_NAME = "root"
8 8
# mongodb account password

Také k dispozici: Unified diff