Revision d6ca840d
Added by Petr Hlaváč almost 5 years ago
.gitignore | ||
---|---|---|
1 | 1 | logs
2 | database
3 | 2 | website/vendor/
4 | 3 | /website/var
5 | 4 | cache
docker-compose-prod.yml | ||
---|---|---|
9 | 9 | volumes:
10 | 10 | - /etc/certificate:/certificate
11 | 11 | - ./docker/nginx/sites-dev:/etc/nginx/sites-available
12 | - /acme-challenge:/var/www/symfony/public/.well-known/acme-challenge/
12 | - /acme-challenge:/var/www/symfony/public/.well-known/acme-challenge/
13 | crawler:
14 | volumes:
15 | - /logs/crawler:/src/CrawlerLogs
16 | - /data/crawler:/src/CrawledData
docker-compose.yml | ||
---|---|---|
11 | 11 | - backend
12 | 12 | volumes:
13 | 13 | - ./modules/crawler/:/src
14 | - ./logs/crawler:/log/
14 | - ./logs/crawler:/src/CrawlerLogs
15 | 15 | container_name: "heatmap_crawler"
16 | 16 | environment:
17 | 17 | - TZ=Europe/Prague
modules/crawler/.gitignore | ||
---|---|---|
1 | 1 | *__pycache__*
2 | *.CSV
2 | *.CSV
3 | CrawlerLogs
4 | CrawledData
5 | ProcessedData
modules/crawler/CrawledData/JIS/ignore.txt | ||
---|---|---|
1 | ignore.txt
modules/crawler/CrawledData/KOLOBEZKY/ignore.txt | ||
---|---|---|
1 | ignore.txt
modules/crawler/CrawledData/WIFI/ignore.txt | ||
---|---|---|
1 | ignore.txt
modules/crawler/CrawlerLogs/CommonRecords/.gitignore | ||
---|---|---|
1 | # Ignore everything in this directory
2 | *
3 | # Except this file
4 | !.gitignore
modules/crawler/CrawlerLogs/JIS/updated.txt | ||
---|---|---|
1 | 0
modules/crawler/CrawlerLogs/KOLOBEZKY/updated.txt | ||
---|---|---|
1 | 0
modules/crawler/CrawlerLogs/WIFI/updated.txt | ||
---|---|---|
1 | 0
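
Each updated.txt above holds a single integer: the number of days since that dataset was last updated. Pipeline.check_last_update() reads this counter and either resets it to 0 (update due) or increments it by one. A minimal sketch of that contract, using a hypothetical helper name and paths relative to modules/crawler/:

```python
# Illustrative sketch only, not part of this revision.
def days_since_last_update(dataset_name):
    # updated.txt contains one integer written by PrepareNewDataset/ResetDataset
    with open("CrawlerLogs/" + dataset_name + "/updated.txt", "r") as file:
        return int(file.read())

# e.g. days_since_last_update("JIS") == 0 right after the dataset is initialised
```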
modules/crawler/CroneUpdateScript.py | ||
---|---|---|
1 |
import Pipeline |
|
2 |
import os |
|
3 |
|
|
4 |
# Path to configuration files |
|
5 |
CONFIG_FILES_PATH = "DatasetConfigs/" |
|
6 |
|
|
7 |
|
|
8 |
def run_pipeline_for_all_datasets(): |
|
9 |
""" |
|
10 |
Runs whole DataScript pipeline for every dataset that has existing configuration file |
|
11 |
""" |
|
12 |
files_in_dir = os.listdir(CONFIG_FILES_PATH) |
|
13 |
|
|
14 |
for file in files_in_dir: |
|
15 |
name = file.split('.') |
|
16 |
Pipeline.run_full_pipeline_crone(name[0]) |
|
17 |
|
|
18 |
|
|
19 |
run_pipeline_for_all_datasets() |
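
For orientation, a short sketch (not part of this commit) of what the loop above does for each configuration file; the file names below are examples, and the cron entry in the comment is only an assumption about how the script might be scheduled:

```python
# Illustrative only: DatasetConfigs/ holds one YAML file per dataset, and the
# bare file name (without extension) is handed to the pipeline, e.g.
#   "JIS.yaml"  -> Pipeline.run_full_pipeline_crone("JIS")
#   "WIFI.yaml" -> Pipeline.run_full_pipeline_crone("WIFI")
# A daily cron entry could then drive the script (assumed, not defined here):
#   0 3 * * *  cd /src && python3 CroneUpdateScript.py
for config_file in ["JIS.yaml", "KOLOBEZKY.yaml", "WIFI.yaml"]:
    print(config_file.split('.')[0])   # the value passed to run_full_pipeline_crone
```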
modules/crawler/DatasetConfigs/KOLOBEZKY.yaml | ||
---|---|---|
33 | 33 | - stojan-borska:
34 | 34 | x: 49.734518
35 | 35 | y: 13.359475
36 |
36 |
modules/crawler/DatasetCrawler/JISCrawler.py | ||
---|---|---|
1 |
from Utilities import FolderProcessor |
|
2 |
from Utilities.Crawler import BasicCrawlerFunctions |
|
3 |
|
|
4 |
# Path to crawled data |
|
5 |
CRAWLED_DATA_PATH = "CrawledData/" |
|
6 |
|
|
7 |
|
|
8 |
def crawl(config): |
|
9 |
""" |
|
10 |
Implement crawl method that downloads new data to path_for_files |
|
11 |
For keeping the project structure |
|
12 |
url , regex, and dataset_name from config |
|
13 |
You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py |
|
14 |
|
|
15 |
Args: |
|
16 |
config: loaded configuration file of dataset |
|
17 |
""" |
|
18 |
dataset_name = config["dataset-name"] |
|
19 |
url = config['url'] |
|
20 |
regex = config['regex'] |
|
21 |
path_for_files = CRAWLED_DATA_PATH + dataset_name + '/' |
|
22 |
|
|
23 |
first_level_links = BasicCrawlerFunctions.get_all_links(url) |
|
24 |
filtered_first_level_links = BasicCrawlerFunctions.filter_links(first_level_links, "^OD_ZCU") |
|
25 |
absolute_first_level_links = BasicCrawlerFunctions.create_absolute_links(filtered_first_level_links, url) |
|
26 |
|
|
27 |
files = [] |
|
28 |
|
|
29 |
for link in absolute_first_level_links: |
|
30 |
second_level_links = BasicCrawlerFunctions.get_all_links(link) |
|
31 |
filtered_second_level_links = BasicCrawlerFunctions.filter_links(second_level_links, regex) |
|
32 |
absolute_second_level_links = BasicCrawlerFunctions.create_absolute_links(filtered_second_level_links, link) |
|
33 |
final_links = BasicCrawlerFunctions.remove_downloaded_links(absolute_second_level_links, dataset_name) |
|
34 |
|
|
35 |
for file_link in final_links: |
|
36 |
files.append(file_link) |
|
37 |
|
|
38 |
for file in files: |
|
39 |
BasicCrawlerFunctions.download_file_from_url(file, dataset_name) |
|
40 |
|
|
41 |
FolderProcessor.unzip_all_csv_zip_files_in_folder(path_for_files) |
modules/crawler/DatasetCrawler/JIS_crawler.py | ||
---|---|---|
1 |
from Utilities import folder_processor |
|
2 |
from Utilities.Crawler import basic_crawler_functions |
|
3 |
|
|
4 |
# Path to crawled data |
|
5 |
CRAWLED_DATA_PATH = "CrawledData/" |
|
6 |
|
|
7 |
|
|
8 |
def crawl(config): |
|
9 |
""" |
|
10 |
Implement crawl method that downloads new data to path_for_files |
|
11 |
For keeping the project structure |
|
12 |
url , regex, and dataset_name from config |
|
13 |
You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py |
|
14 |
|
|
15 |
Args: |
|
16 |
config: loaded configuration file of dataset |
|
17 |
""" |
|
18 |
dataset_name = config["dataset-name"] |
|
19 |
url = config['url'] |
|
20 |
regex = config['regex'] |
|
21 |
path_for_files = CRAWLED_DATA_PATH + dataset_name + '/' |
|
22 |
|
|
23 |
first_level_links = basic_crawler_functions.get_all_links(url) |
|
24 |
filtered_first_level_links = basic_crawler_functions.filter_links(first_level_links, "^OD_ZCU") |
|
25 |
absolute_first_level_links = basic_crawler_functions.create_absolute_links(filtered_first_level_links, url) |
|
26 |
|
|
27 |
files = [] |
|
28 |
|
|
29 |
for link in absolute_first_level_links: |
|
30 |
second_level_links = basic_crawler_functions.get_all_links(link) |
|
31 |
filtered_second_level_links = basic_crawler_functions.filter_links(second_level_links, regex) |
|
32 |
absolute_second_level_links = basic_crawler_functions.create_absolute_links(filtered_second_level_links, link) |
|
33 |
final_links = basic_crawler_functions.remove_downloaded_links(absolute_second_level_links, dataset_name) |
|
34 |
|
|
35 |
for file_link in final_links: |
|
36 |
files.append(file_link) |
|
37 |
|
|
38 |
for file in files: |
|
39 |
basic_crawler_functions.download_file_from_url(file, dataset_name) |
|
40 |
|
|
41 |
folder_processor.unzip_all_csv_zip_files_in_folder(path_for_files) |
modules/crawler/DatasetCrawler/KOLOBEZKYCrawler.py | ||
---|---|---|
1 |
from Utilities import FolderProcessor |
|
2 |
from Utilities.Crawler import BasicCrawlerFunctions |
|
3 |
|
|
4 |
# Path to crawled data |
|
5 |
CRAWLED_DATA_PATH = "CrawledData/" |
|
6 |
|
|
7 |
|
|
8 |
def crawl(config): |
|
9 |
""" |
|
10 |
Implement crawl method that downloads new data to path_for_files |
|
11 |
For keeping the project structure |
|
12 |
url , regex, and dataset_name from config |
|
13 |
You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py |
|
14 |
|
|
15 |
Args: |
|
16 |
config: loaded configuration file of dataset |
|
17 |
""" |
|
18 |
dataset_name = config["dataset-name"] |
|
19 |
url = config['url'] |
|
20 |
regex = config['regex'] |
|
21 |
path_for_files = CRAWLED_DATA_PATH + dataset_name + '/' |
|
22 |
|
|
23 |
first_level_links = BasicCrawlerFunctions.get_all_links(url) |
|
24 |
filtered_first_level_links = BasicCrawlerFunctions.filter_links(first_level_links, "^OD_ZCU") |
|
25 |
absolute_first_level_links = BasicCrawlerFunctions.create_absolute_links(filtered_first_level_links, url) |
|
26 |
|
|
27 |
files = [] |
|
28 |
|
|
29 |
for link in absolute_first_level_links: |
|
30 |
second_level_links = BasicCrawlerFunctions.get_all_links(link) |
|
31 |
filtered_second_level_links = BasicCrawlerFunctions.filter_links(second_level_links, regex) |
|
32 |
absolute_second_level_links = BasicCrawlerFunctions.create_absolute_links(filtered_second_level_links, link) |
|
33 |
final_links = BasicCrawlerFunctions.remove_downloaded_links(absolute_second_level_links, dataset_name) |
|
34 |
|
|
35 |
for file_link in final_links: |
|
36 |
files.append(file_link) |
|
37 |
|
|
38 |
for file in files: |
|
39 |
BasicCrawlerFunctions.download_file_from_url(file, dataset_name) |
|
40 |
|
|
41 |
FolderProcessor.unzip_all_csv_zip_files_in_folder(path_for_files) |
modules/crawler/DatasetCrawler/KOLOBEZKY_crawler.py | ||
---|---|---|
1 |
from Utilities import folder_processor |
|
2 |
from Utilities.Crawler import basic_crawler_functions |
|
3 |
|
|
4 |
# Path to crawled data |
|
5 |
CRAWLED_DATA_PATH = "CrawledData/" |
|
6 |
|
|
7 |
|
|
8 |
def crawl(config): |
|
9 |
""" |
|
10 |
Implement crawl method that downloads new data to path_for_files |
|
11 |
For keeping the project structure |
|
12 |
url , regex, and dataset_name from config |
|
13 |
You can use already implemented functions from Utilities/Crawler/basic_crawler_functions.py |
|
14 |
|
|
15 |
Args: |
|
16 |
config: loaded configuration file of dataset |
|
17 |
""" |
|
18 |
dataset_name = config["dataset-name"] |
|
19 |
url = config['url'] |
|
20 |
regex = config['regex'] |
|
21 |
path_for_files = CRAWLED_DATA_PATH + dataset_name + '/' |
|
22 |
|
|
23 |
first_level_links = basic_crawler_functions.get_all_links(url) |
|
24 |
filtered_first_level_links = basic_crawler_functions.filter_links(first_level_links, "^OD_ZCU") |
|
25 |
absolute_first_level_links = basic_crawler_functions.create_absolute_links(filtered_first_level_links, url) |
|
26 |
|
|
27 |
files = [] |
|
28 |
|
|
29 |
for link in absolute_first_level_links: |
|
30 |
second_level_links = basic_crawler_functions.get_all_links(link) |
|
31 |
filtered_second_level_links = basic_crawler_functions.filter_links(second_level_links, regex) |
|
32 |
absolute_second_level_links = basic_crawler_functions.create_absolute_links(filtered_second_level_links, link) |
|
33 |
final_links = basic_crawler_functions.remove_downloaded_links(absolute_second_level_links, dataset_name) |
|
34 |
|
|
35 |
for file_link in final_links: |
|
36 |
files.append(file_link) |
|
37 |
|
|
38 |
for file in files: |
|
39 |
basic_crawler_functions.download_file_from_url(file, dataset_name) |
|
40 |
|
|
41 |
folder_processor.unzip_all_csv_zip_files_in_folder(path_for_files) |
modules/crawler/DatasetCrawler/WIFICrawler.py | ||
---|---|---|
1 |
from Utilities import FolderProcessor |
|
2 |
from Utilities.Crawler import BasicCrawlerFunctions |
|
3 |
|
|
4 |
# Path to crawled data |
|
5 |
CRAWLED_DATA_PATH = "CrawledData/" |
|
6 |
|
|
7 |
|
|
8 |
def crawl(config): |
|
9 |
""" |
|
10 |
Implement crawl method that downloads new data to path_for_files |
|
11 |
For keeping the project structure |
|
12 |
url , regex, and dataset_name from config |
|
13 |
You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py |
|
14 |
|
|
15 |
Args: |
|
16 |
config: loaded configuration file of dataset |
|
17 |
""" |
|
18 |
dataset_name = config["dataset-name"] |
|
19 |
url = config['url'] |
|
20 |
regex = config['regex'] |
|
21 |
path_for_files = CRAWLED_DATA_PATH + dataset_name + '/' |
|
22 |
|
|
23 |
first_level_links = BasicCrawlerFunctions.get_all_links(url) |
|
24 |
filtered_first_level_links = BasicCrawlerFunctions.filter_links(first_level_links, "^OD_ZCU") |
|
25 |
absolute_first_level_links = BasicCrawlerFunctions.create_absolute_links(filtered_first_level_links, url) |
|
26 |
|
|
27 |
files = [] |
|
28 |
|
|
29 |
for link in absolute_first_level_links: |
|
30 |
second_level_links = BasicCrawlerFunctions.get_all_links(link) |
|
31 |
filtered_second_level_links = BasicCrawlerFunctions.filter_links(second_level_links, regex) |
|
32 |
absolute_second_level_links = BasicCrawlerFunctions.create_absolute_links(filtered_second_level_links, link) |
|
33 |
final_links = BasicCrawlerFunctions.remove_downloaded_links(absolute_second_level_links, dataset_name) |
|
34 |
|
|
35 |
for file_link in final_links: |
|
36 |
files.append(file_link) |
|
37 |
|
|
38 |
for file in files: |
|
39 |
BasicCrawlerFunctions.download_file_from_url(file, dataset_name) |
|
40 |
|
|
41 |
FolderProcessor.unzip_all_csv_zip_files_in_folder(path_for_files) |
modules/crawler/DatasetCrawler/WIFI_crawler.py | ||
---|---|---|
1 |
from Utilities import folder_processor |
|
2 |
from Utilities.Crawler import basic_crawler_functions |
|
3 |
|
|
4 |
# Path to crawled data |
|
5 |
CRAWLED_DATA_PATH = "CrawledData/" |
|
6 |
|
|
7 |
|
|
8 |
def crawl(config): |
|
9 |
""" |
|
10 |
Implement crawl method that downloads new data to path_for_files |
|
11 |
For keeping the project structure |
|
12 |
url , regex, and dataset_name from config |
|
13 |
You can use already implemented functions from Utilities/Crawler/basic_crawler_functions.py |
|
14 |
|
|
15 |
Args: |
|
16 |
config: loaded configuration file of dataset |
|
17 |
""" |
|
18 |
dataset_name = config["dataset-name"] |
|
19 |
url = config['url'] |
|
20 |
regex = config['regex'] |
|
21 |
path_for_files = CRAWLED_DATA_PATH + dataset_name + '/' |
|
22 |
|
|
23 |
first_level_links = basic_crawler_functions.get_all_links(url) |
|
24 |
filtered_first_level_links = basic_crawler_functions.filter_links(first_level_links, "^OD_ZCU") |
|
25 |
absolute_first_level_links = basic_crawler_functions.create_absolute_links(filtered_first_level_links, url) |
|
26 |
|
|
27 |
files = [] |
|
28 |
|
|
29 |
for link in absolute_first_level_links: |
|
30 |
second_level_links = basic_crawler_functions.get_all_links(link) |
|
31 |
filtered_second_level_links = basic_crawler_functions.filter_links(second_level_links, regex) |
|
32 |
absolute_second_level_links = basic_crawler_functions.create_absolute_links(filtered_second_level_links, link) |
|
33 |
final_links = basic_crawler_functions.remove_downloaded_links(absolute_second_level_links, dataset_name) |
|
34 |
|
|
35 |
for file_link in final_links: |
|
36 |
files.append(file_link) |
|
37 |
|
|
38 |
for file in files: |
|
39 |
basic_crawler_functions.download_file_from_url(file, dataset_name) |
|
40 |
|
|
41 |
folder_processor.unzip_all_csv_zip_files_in_folder(path_for_files) |
modules/crawler/DatasetProcessing/JISProcessor.py | ||
---|---|---|
1 |
from Utilities.CSV import CSVDataLine |
|
2 |
from Utilities import DateFormating |
|
3 |
|
|
4 |
|
|
5 |
def process_file(filename): |
|
6 |
""" |
|
7 |
Method that take path to crawled file and outputs date dictionary: |
|
8 |
Date dictionary is a dictionary where keys are dates in format ddmmYYYYhh (0804201815) |
|
9 |
and value is dictionary where keys devices (specified in configuration file) |
|
10 |
and value is CSVDataLine.CSVDataLine with device,date and occurrence |
|
11 |
|
|
12 |
Args: |
|
13 |
filename: name of processed file |
|
14 |
|
|
15 |
Returns: |
|
16 |
None if not implemented |
|
17 |
date_dict when implemented |
|
18 |
""" |
|
19 |
date_dict = dict() |
|
20 |
|
|
21 |
with open(filename, "r", encoding="utf-8") as file: |
|
22 |
|
|
23 |
for line in file: |
|
24 |
|
|
25 |
array = line.split(";") |
|
26 |
|
|
27 |
date = DateFormating.date_time_formatter(array[1][1:-1]) |
|
28 |
name = array[0][1:-1] |
|
29 |
occurrence = array[2][:-1] |
|
30 |
|
|
31 |
if date not in date_dict: |
|
32 |
date_dict[date] = dict() |
|
33 |
|
|
34 |
if name in date_dict[date]: |
|
35 |
date_dict[date][name].occurrence += int(occurrence) |
|
36 |
else: |
|
37 |
date_dict[date][name] = CSVDataLine.CSVDataLine(name, date, occurrence) |
|
38 |
|
|
39 |
return date_dict |
|
40 |
|
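To make the docstring above concrete, here is a minimal sketch of the nested date_dict shape that process_file() returns; the device names and counts are invented, and plain dicts stand in for the CSVDataLine instances the real code stores:

```python
# Illustrative only -- the real values are CSVDataLine objects with
# .name, .date and .occurrence attributes.
date_dict = {
    "0804201815": {                       # date key in ddmmYYYYhh format
        "device-a": {"name": "device-a", "date": "0804201815", "occurrence": 3},
        "device-b": {"name": "device-b", "date": "0804201815", "occurrence": 1},
    },
    "0804201816": {
        "device-a": {"name": "device-a", "date": "0804201816", "occurrence": 2},
    },
}
```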
modules/crawler/DatasetProcessing/JIS_processor.py | ||
---|---|---|
1 |
from Utilities.CSV import csv_data_line |
|
2 |
from Utilities import date_formating |
|
3 |
|
|
4 |
|
|
5 |
def process_file(filename): |
|
6 |
""" |
|
7 |
Method that take path to crawled file and outputs date dictionary: |
|
8 |
Date dictionary is a dictionary where keys are dates in format ddmmYYYYhh (0804201815) |
|
9 |
and value is dictionary where keys are devices (specified in configuration file) |
|
10 |
and value is CSVDataLine.csv_data_line with device,date and occurrence |
|
11 |
|
|
12 |
Args: |
|
13 |
filename: name of processed file |
|
14 |
|
|
15 |
Returns: |
|
16 |
None if not implemented |
|
17 |
date_dict when implemented |
|
18 |
""" |
|
19 |
date_dict = dict() |
|
20 |
|
|
21 |
with open(filename, "r", encoding="utf-8") as file: |
|
22 |
|
|
23 |
for line in file: |
|
24 |
|
|
25 |
array = line.split(";") |
|
26 |
|
|
27 |
date = date_formating.date_time_formatter(array[1][1:-1]) |
|
28 |
name = array[0][1:-1] |
|
29 |
occurrence = array[2][:-1] |
|
30 |
|
|
31 |
if date not in date_dict: |
|
32 |
date_dict[date] = dict() |
|
33 |
|
|
34 |
if name in date_dict[date]: |
|
35 |
date_dict[date][name].occurrence += int(occurrence) |
|
36 |
else: |
|
37 |
date_dict[date][name] = csv_data_line.CSVDataLine(name, date, occurrence) |
|
38 |
|
|
39 |
return date_dict |
|
40 |
|
modules/crawler/DatasetProcessing/KOLOBEZKYProcessor.py | ||
---|---|---|
1 |
from Utilities.CSV import CSVDataLine |
|
2 |
from Utilities import DateFormating |
|
3 |
|
|
4 |
|
|
5 |
def process_file(filename): |
|
6 |
""" |
|
7 |
Method that take path to crawled file and outputs date dictionary: |
|
8 |
Date dictionary is a dictionary where keys are dates in format ddmmYYYYhh (0804201815) |
|
9 |
and value is dictionary where keys devices (specified in configuration file) |
|
10 |
and value is CSVDataLine.CSVDataLine with device,date and occurrence |
|
11 |
|
|
12 |
Args: |
|
13 |
filename: name of processed file |
|
14 |
|
|
15 |
Returns: |
|
16 |
None if not implemented |
|
17 |
date_dict when implemented |
|
18 |
""" |
|
19 |
date_dict = dict() |
|
20 |
|
|
21 |
with open(filename, "r") as file: |
|
22 |
|
|
23 |
for line in file: |
|
24 |
|
|
25 |
array = line.split(";") |
|
26 |
|
|
27 |
date = DateFormating.date_time_formatter(array[0][1:-1]) |
|
28 |
name = array[1][1:-1] |
|
29 |
|
|
30 |
if date not in date_dict: |
|
31 |
date_dict[date] = dict() |
|
32 |
|
|
33 |
if name in date_dict[date]: |
|
34 |
date_dict[date][name].occurrence += 1 |
|
35 |
else: |
|
36 |
date_dict[date][name] = CSVDataLine.CSVDataLine(name, date, 1) |
|
37 |
|
|
38 |
return date_dict |
|
39 |
|
modules/crawler/DatasetProcessing/KOLOBEZKY_processor.py | ||
---|---|---|
1 |
from Utilities.CSV import csv_data_line |
|
2 |
from Utilities import date_formating |
|
3 |
|
|
4 |
|
|
5 |
def process_file(filename): |
|
6 |
""" |
|
7 |
Method that take path to crawled file and outputs date dictionary: |
|
8 |
Date dictionary is a dictionary where keys are dates in format ddmmYYYYhh (0804201815) |
|
9 |
and value is dictionary where keys are devices (specified in configuration file) |
|
10 |
and value is CSVDataLine.csv_data_line with device,date and occurrence |
|
11 |
|
|
12 |
Args: |
|
13 |
filename: name of processed file |
|
14 |
|
|
15 |
Returns: |
|
16 |
None if not implemented |
|
17 |
date_dict when implemented |
|
18 |
""" |
|
19 |
date_dict = dict() |
|
20 |
|
|
21 |
with open(filename, "r") as file: |
|
22 |
|
|
23 |
for line in file: |
|
24 |
|
|
25 |
array = line.split(";") |
|
26 |
|
|
27 |
date = date_formating.date_time_formatter(array[0][1:-1]) |
|
28 |
name = array[1][1:-1] |
|
29 |
|
|
30 |
if date not in date_dict: |
|
31 |
date_dict[date] = dict() |
|
32 |
|
|
33 |
if name in date_dict[date]: |
|
34 |
date_dict[date][name].occurrence += 1 |
|
35 |
else: |
|
36 |
date_dict[date][name] = csv_data_line.CSVDataLine(name, date, 1) |
|
37 |
|
|
38 |
return date_dict |
|
39 |
|
modules/crawler/DatasetProcessing/WIFIProcessor.py | ||
---|---|---|
1 |
from Utilities.CSV import CSVDataLine |
|
2 |
from Utilities import DateFormating |
|
3 |
|
|
4 |
|
|
5 |
def process_file(filename): |
|
6 |
""" |
|
7 |
Method that take path to crawled file and outputs date dictionary: |
|
8 |
Date dictionary is a dictionary where keys are dates in format ddmmYYYYhh (0804201815) |
|
9 |
and value is dictionary where keys devices (specified in configuration file) |
|
10 |
and value is CSVDataLine.CSVDataLine with device,date and occurrence |
|
11 |
|
|
12 |
Args: |
|
13 |
filename: name of processed file |
|
14 |
|
|
15 |
Returns: |
|
16 |
None if not implemented |
|
17 |
date_dict when implemented |
|
18 |
""" |
|
19 |
date_dict = dict() |
|
20 |
|
|
21 |
with open(filename, "r", encoding="utf-8") as file: |
|
22 |
|
|
23 |
for line in file: |
|
24 |
|
|
25 |
array = line.split(";") |
|
26 |
|
|
27 |
date = DateFormating.date_time_formatter(array[4][1:-2]) |
|
28 |
name = array[1][1:-1] |
|
29 |
occurrence = array[0] |
|
30 |
|
|
31 |
if date not in date_dict: |
|
32 |
date_dict[date] = dict() |
|
33 |
|
|
34 |
if name in date_dict[date]: |
|
35 |
date_dict[date][name].occurrence += max(date_dict[date][name].occurrence,int(occurrence)) |
|
36 |
else: |
|
37 |
date_dict[date][name] = CSVDataLine.CSVDataLine(name, date, int(occurrence)) |
|
38 |
|
|
39 |
return date_dict |
|
40 |
|
modules/crawler/DatasetProcessing/WIFI_processor.py | ||
---|---|---|
1 |
from Utilities.CSV import csv_data_line |
|
2 |
from Utilities import date_formating |
|
3 |
|
|
4 |
|
|
5 |
def process_file(filename): |
|
6 |
""" |
|
7 |
Method that take path to crawled file and outputs date dictionary: |
|
8 |
Date dictionary is a dictionary where keys are dates in format ddmmYYYYhh (0804201815) |
|
9 |
and value is dictionary where keys are devices (specified in configuration file) |
|
10 |
and value is CSVDataLine.csv_data_line with device,date and occurrence |
|
11 |
|
|
12 |
Args: |
|
13 |
filename: name of processed file |
|
14 |
|
|
15 |
Returns: |
|
16 |
None if not implemented |
|
17 |
date_dict when implemented |
|
18 |
""" |
|
19 |
date_dict = dict() |
|
20 |
|
|
21 |
with open(filename, "r", encoding="utf-8") as file: |
|
22 |
|
|
23 |
for line in file: |
|
24 |
|
|
25 |
array = line.split(";") |
|
26 |
|
|
27 |
date = date_formating.date_time_formatter(array[4][1:-2]) |
|
28 |
name = array[1][1:-1] |
|
29 |
occurrence = array[0] |
|
30 |
|
|
31 |
if date not in date_dict: |
|
32 |
date_dict[date] = dict() |
|
33 |
|
|
34 |
if name in date_dict[date]: |
|
35 |
date_dict[date][name].occurrence += max(date_dict[date][name].occurrence,int(occurrence)) |
|
36 |
else: |
|
37 |
date_dict[date][name] = csv_data_line.CSVDataLine(name, date, int(occurrence)) |
|
38 |
|
|
39 |
return date_dict |
|
40 |
|
modules/crawler/ForceUpdateDataset.py | ||
---|---|---|
1 | import Pipeline
2 | import os
3 |
4 | print("Zadejte jméno Datasetu který chcete upadtovat:\n")
5 | Pipeline.run_full_pipeline(input())
modules/crawler/ForceUpdateDatasets.py | ||
---|---|---|
1 |
import Pipeline |
|
2 |
import os |
|
3 |
|
|
4 |
# Path to configuration files |
|
5 |
CONFIG_FILES_PATH = "DatasetConfigs/" |
|
6 |
|
|
7 |
|
|
8 |
def run_pipeline_for_all_datasets(): |
|
9 |
""" |
|
10 |
Runs whole DataScript pipeline for every dataset that has existing configuration file |
|
11 |
""" |
|
12 |
files_in_dir = os.listdir(CONFIG_FILES_PATH) |
|
13 |
|
|
14 |
for file in files_in_dir: |
|
15 |
name = file.split('.') |
|
16 |
Pipeline.run_full_pipeline(name[0]) |
|
17 |
|
|
18 |
|
|
19 |
run_pipeline_for_all_datasets() |
modules/crawler/Pipeline.py | ||
---|---|---|
1 |
from Utilities import FolderProcessor, ConfigureFunctions |
|
2 |
from Utilities.Database import DatabaseLoader |
|
3 |
from Utilities.CSV import CSVutils |
|
4 |
|
|
5 |
import logging |
|
6 |
from datetime import date |
|
7 |
|
|
8 |
|
|
9 |
# Path to crawled data |
|
10 |
CRAWLED_DATA_PATH = "CrawledData/" |
|
11 |
# Path to processed data |
|
12 |
PROCESSED_DATA_PATH = "ProcessedData/" |
|
13 |
# Path to crawler logs |
|
14 |
CRAWLER_LOGS_PATH = "CrawlerLogs/" |
|
15 |
# Path to dataset crawler implementations |
|
16 |
CRAWLER_LIB_PATH = "DatasetCrawler." |
|
17 |
# Path to dataset processor implementations |
|
18 |
PROCESSOR_LIB_PATH = "DatasetProcessing." |
|
19 |
|
|
20 |
|
|
21 |
#logger |
|
22 |
logging.basicConfig(filename=CRAWLER_LOGS_PATH + "CommonRecords/" + 'Applicationlog-' + date.today().strftime("%b-%Y") + '.log', |
|
23 |
level=logging.INFO, |
|
24 |
format='%(asctime)s %(message)s' |
|
25 |
) |
|
26 |
|
|
27 |
|
|
28 |
def check_last_update(config): |
|
29 |
""" |
|
30 |
Loads integer from updated.txt in CrawlerLogs/"dataset_name" |
|
31 |
representing number of days from last update if number equals |
|
32 |
number in confing update period updates it and reset number of |
|
33 |
days to zero else increment the number |
|
34 |
|
|
35 |
Arguments: |
|
36 |
config loaded configuration file of dataset |
|
37 |
|
|
38 |
Returns: |
|
39 |
True if updating |
|
40 |
Else if incementing days from last update |
|
41 |
""" |
|
42 |
dataset_name = config["dataset-name"] |
|
43 |
|
|
44 |
with open(CRAWLER_LOGS_PATH + dataset_name + "/" + "updated.txt", "r+") as file: |
|
45 |
last_update = file.read() |
|
46 |
last_update = int(last_update) |
|
47 |
file.seek(0) |
|
48 |
|
|
49 |
confing_update_period = int(config["update-period"]) |
|
50 |
|
|
51 |
if config["update-period"] <= last_update: |
|
52 |
logging.info("Dataset " + dataset_name + " is being updated today") |
|
53 |
file.write("0") |
|
54 |
file.truncate() |
|
55 |
return True |
|
56 |
else: |
|
57 |
last_update_days = last_update + 1 |
|
58 |
logging.info("Dataset " + dataset_name + " will be updated in " + str(confing_update_period - last_update_days) + "days") |
|
59 |
file.write(str(last_update_days)) |
|
60 |
file.truncate() |
|
61 |
return False |
|
62 |
|
|
63 |
|
|
64 |
|
|
65 |
def crawl_data(config): |
|
66 |
""" |
|
67 |
Imports dataset crawler in DatasetCrawler/"dataset_name"Crawler.py |
|
68 |
runs crawler. |
|
69 |
|
|
70 |
Args: |
|
71 |
config: loaded configuration file of dataset |
|
72 |
""" |
|
73 |
dataset_name = config["dataset-name"] |
|
74 |
|
|
75 |
crawl_func = __import__(CRAWLER_LIB_PATH + dataset_name + "Crawler", globals(), locals(), ['crawl']).crawl |
|
76 |
crawl_func(config) |
|
77 |
|
|
78 |
dataset_name += '/' |
|
79 |
|
|
80 |
|
|
81 |
def process_data(dataset_name): |
|
82 |
""" |
|
83 |
Goes trough every not processed file(not contained in CrawledData/"dataset_name"/ignore.txt) |
|
84 |
Imports dataset processor in DatasetProcessing/"dataset_name"Processor.py |
|
85 |
Runs processor on every file |
|
86 |
After successful processing updates ignore.txt |
|
87 |
|
|
88 |
Args: |
|
89 |
dataset_name: name of dataset that has existing configuration file |
|
90 |
""" |
|
91 |
dataset_path = dataset_name + '/' |
|
92 |
|
|
93 |
process_file_func = __import__(PROCESSOR_LIB_PATH + dataset_name + "Processor", globals(), locals(), |
|
94 |
['process_file']).process_file |
|
95 |
|
|
96 |
not_processed_files = FolderProcessor.list_of_all_files(CRAWLED_DATA_PATH + dataset_path) |
|
97 |
logging.info(dataset_name + " has downloaded " + str(len(not_processed_files)) + " new files") |
|
98 |
|
|
99 |
for not_processed_file in not_processed_files: |
|
100 |
path = CRAWLED_DATA_PATH + dataset_path + not_processed_file |
|
101 |
date_dic = process_file_func(path) |
|
102 |
CSVutils.export_data_to_csv(path, date_dic) |
|
103 |
FolderProcessor.update_ignore_set(CRAWLED_DATA_PATH + dataset_path, not_processed_file) |
|
104 |
|
|
105 |
logging.info(dataset_name + " has processed " + str(len(not_processed_files)) + " newly crawled files") |
|
106 |
|
|
107 |
|
|
108 |
def validate_process_data(config): |
|
109 |
""" |
|
110 |
Function goes through newly processed data and checks theirs status |
|
111 |
|
|
112 |
Args: |
|
113 |
config: loaded configuration file of dataset |
|
114 |
|
|
115 |
Returns: |
|
116 |
boolean variable TRUE/FALSE. |
|
117 |
Data processed correctly - TRUE |
|
118 |
Wrong format or NEW unknown devices - FALSE |
|
119 |
""" |
|
120 |
processed_devices_set = FolderProcessor.get_devices_set(PROCESSED_DATA_PATH + config["dataset-name"] + '/') |
|
121 |
unknown_devices_set = FolderProcessor.get_unknown_devices_set(config, processed_devices_set) |
|
122 |
unknown_devices_size = len(unknown_devices_set) |
|
123 |
|
|
124 |
if unknown_devices_size != 0: |
|
125 |
logging.info("There is " + str(unknown_devices_size) + " unknown devices") |
|
126 |
logging.info("Adding devices to " + config["dataset-name"] + " config file") |
|
127 |
ConfigureFunctions.update_configuration(config["dataset-name"], unknown_devices_set) |
|
128 |
return False |
|
129 |
|
|
130 |
for device in config["devices"]: |
|
131 |
device = config["devices"][device] |
|
132 |
if device["x"] == "UNKNOWN!" or device["y"] == "UNKNOWN!": |
|
133 |
logging.info(config["dataset-name"] + " config file contains devices with UNKOWN! values please update them!!") |
|
134 |
return False |
|
135 |
|
|
136 |
return True |
|
137 |
|
|
138 |
|
|
139 |
def load_data_to_database(config): |
|
140 |
""" |
|
141 |
Goes trough every not loaded file(not contained in ProcessedData/ignore.txt) |
|
142 |
loads data appends coordination from configurations |
|
143 |
and exports it into the database |
|
144 |
After successful exporting updates ignore.txt |
|
145 |
|
|
146 |
Args: |
|
147 |
config: loaded configuration file of dataset |
|
148 |
""" |
|
149 |
dataset_name = config["dataset-name"] |
|
150 |
dataset_path = dataset_name + '/' |
|
151 |
|
|
152 |
# get all unprocessed files from dataset |
|
153 |
not_loaded_files = FolderProcessor.list_of_all_files(PROCESSED_DATA_PATH + dataset_path) |
|
154 |
|
|
155 |
database_connection = DatabaseLoader.create_database_connection() |
|
156 |
|
|
157 |
DatabaseLoader.check_or_update_datasets_collection(database_connection,config) |
|
158 |
|
|
159 |
DatabaseLoader.update_devices_collection(database_connection,config) |
|
160 |
|
|
161 |
|
|
162 |
# load every file |
|
163 |
for not_loaded_file in not_loaded_files: |
|
164 |
#check if file is not in database already if it is skip |
|
165 |
test = DatabaseLoader.check_if_database_doesnt_contain_file(database_connection,dataset_name,not_loaded_file) |
|
166 |
if test == False: |
|
167 |
logging.info(dataset_name + " could not load " + not_loaded_file + " to database because he is already there.") |
|
168 |
FolderProcessor.update_ignore_set(PROCESSED_DATA_PATH + dataset_path, not_loaded_file) |
|
169 |
continue |
|
170 |
# load processed data |
|
171 |
processed_data = DatabaseLoader.get_data_from_file(not_loaded_file, config) |
|
172 |
# load processed data to database |
|
173 |
DatabaseLoader.load_data_to_database(database_connection, dataset_name, processed_data, not_loaded_file) |
|
174 |
FolderProcessor.update_ignore_set(PROCESSED_DATA_PATH + dataset_path, not_loaded_file) |
|
175 |
|
|
176 |
logging.info(dataset_name + " has loaded to database " + str(len(not_loaded_files)) + " newly processed files.") |
|
177 |
|
|
178 |
|
|
179 |
def load_data_to_database_crone(config): |
|
180 |
""" |
|
181 |
Goes trough every not loaded file(not contained in ProcessedData/ignore.txt) |
|
182 |
loads data appends coordination from configurations |
|
183 |
and exports it into the database |
|
184 |
After successful exporting updates ignore.txt |
|
185 |
|
|
186 |
Args: |
|
187 |
config: loaded configuration file of dataset |
|
188 |
""" |
|
189 |
dataset_name = config["dataset-name"] |
|
190 |
dataset_path = dataset_name + '/' |
|
191 |
|
|
192 |
# get all unprocessed files from dataset |
|
193 |
not_loaded_files = FolderProcessor.list_of_all_files(PROCESSED_DATA_PATH + dataset_path) |
|
194 |
|
|
195 |
database_connection = DatabaseLoader.create_database_connection() |
|
196 |
|
|
197 |
# load every file |
|
198 |
for not_loaded_file in not_loaded_files: |
|
199 |
# load processed data |
|
200 |
processed_data = DatabaseLoader.get_data_from_file(not_loaded_file, config) |
|
201 |
# load processed data to database |
|
202 |
DatabaseLoader.load_data_to_database(database_connection, dataset_name, processed_data, not_loaded_file) |
|
203 |
FolderProcessor.update_ignore_set(PROCESSED_DATA_PATH + dataset_path, not_loaded_file) |
|
204 |
|
|
205 |
logging.info(dataset_name + " has loaded to database " + str(len(not_loaded_files)) + " newly processed files.") |
|
206 |
|
|
207 |
|
|
208 |
def run_full_pipeline(dataset_name): |
|
209 |
""" |
|
210 |
Loads config file and starts full pipeline |
|
211 |
-crawl data |
|
212 |
-process data |
|
213 |
-load data to database |
|
214 |
|
|
215 |
Args: |
|
216 |
dataset_name: name of dataset that has existing configuration file |
|
217 |
""" |
|
218 |
logging.info("Starting pipeline for dataset " + dataset_name) |
|
219 |
|
|
220 |
config = ConfigureFunctions.load_configuration(dataset_name) |
|
221 |
crawl_data(config) |
|
222 |
process_data(config["dataset-name"]) |
|
223 |
|
|
224 |
validation_test = validate_process_data(config) |
|
225 |
|
|
226 |
if validation_test: |
|
227 |
load_data_to_database(config) |
|
228 |
|
|
229 |
|
|
230 |
def run_full_pipeline_crone(dataset_name): |
|
231 |
""" |
|
232 |
Loads config file and starts full pipeline |
|
233 |
-crawl data |
|
234 |
-process data |
|
235 |
-load data to database |
|
236 |
|
|
237 |
Args: |
|
238 |
dataset_name: name of dataset that has existing configuration file |
|
239 |
""" |
|
240 |
logging.info("Starting pipeline for dataset " + dataset_name) |
|
241 |
|
|
242 |
config = ConfigureFunctions.load_configuration(dataset_name) |
|
243 |
update_test = check_last_update(config) |
|
244 |
if update_test: |
|
245 |
crawl_data(config) |
|
246 |
process_data(config["dataset-name"]) |
|
247 |
|
|
248 |
validation_test = validate_process_data(config) |
|
249 |
|
|
250 |
if validation_test: |
|
251 |
load_data_to_database_crone(config) |
|
252 |
|
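For orientation, a minimal usage sketch of the two entry points defined above; it assumes the code runs from modules/crawler/ with the folder layout listed at the top of Pipeline.py and an existing DatasetConfigs/<name>.yaml, and the dataset name is only an example:

```python
# Illustrative only -- mirrors what ForceUpdateDataset.py and CroneUpdateScript.py do.
import Pipeline

# One-off run: crawl -> process -> validate -> load to database.
Pipeline.run_full_pipeline("KOLOBEZKY")

# Scheduled run: first consults CrawlerLogs/KOLOBEZKY/updated.txt and only
# crawls/loads when the configured update-period has elapsed.
Pipeline.run_full_pipeline_crone("KOLOBEZKY")
```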
modules/crawler/PrepareNewDataset.py | ||
---|---|---|
1 |
import os |
|
2 |
|
|
3 |
# Path to crawled data |
|
4 |
CRAWLED_DATA_PATH = "CrawledData/" |
|
5 |
# Path to processed data |
|
6 |
PROCESSED_DATA_PATH = "ProcessedData/" |
|
7 |
# Path to crawler logs |
|
8 |
CRAWLER_LOGS_PATH = "CrawlerLogs/" |
|
9 |
# Path for DatasetCrawlers implementations |
|
10 |
CRAWLER_PROGRAM_PATH = "DatasetCrawler" |
|
11 |
# Path for DatasetProcessors implementations |
|
12 |
PROCESSOR_PROGRAM_PATH = "DatasetProcessing" |
|
13 |
# Path to dataset configuration files |
|
14 |
CONFIG_FILES_PATH = "DatasetConfigs" |
|
15 |
|
|
16 |
|
|
17 |
def create_default_config_file(dataset_name): |
|
18 |
""" |
|
19 |
Creates default config file |
|
20 |
|
|
21 |
Args: |
|
22 |
dataset_name: Name of newly created dataset |
|
23 |
""" |
|
24 |
with open(CONFIG_FILES_PATH + "/" + dataset_name + ".yaml", "w") as file: |
|
25 |
file.write("# jmeno datasetu, pod kterym bude zobrazen v aplikaci\n") |
|
26 |
file.write("dataset-name: " + dataset_name + "\n") |
|
27 |
file.write("# jednoslovný název datasetu, pod kterym bude reprezentovana v architektuře\n") |
|
28 |
file.write("dataset-name: " + dataset_name + "\n") |
|
29 |
file.write("# root slozka, ktera obsahuje odkazy na dataset\n") |
|
30 |
file.write("url: ZDE VLOZTE URL\n") |
|
31 |
file.write("# volitelny parameter, ktery specifikuje vzor jmrna datasetu, ktera se budou stahovat\n") |
|
32 |
file.write("regex: ZDE VLOZTE REGEX\n") |
|
33 |
file.write("# volitelny parametr, ktery udava jak casto se budou hledat nove datasety, pokud prazdne, " |
|
34 |
"tak defaultni hodnota (dny)\n") |
|
35 |
file.write("update-period: ZDE VLOZTE HODNOTU\n") |
|
36 |
file.write("# pozice jednotlivych zarizeni, ktera jsou v datasetu\n") |
|
37 |
file.write("devices:\n") |
|
38 |
|
|
39 |
|
|
40 |
def create_default_processor(dataset_name): |
|
41 |
""" |
|
42 |
Creates default processor for dataset |
|
43 |
|
|
44 |
Args: |
|
45 |
dataset_name: Name of newly created dataset |
|
46 |
""" |
|
47 |
with open(PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "Processor.py", "w") as file: |
|
48 |
file.write("from Utilities.CSV import CSVDataLine") |
|
49 |
file.write("\n") |
|
50 |
file.write("\n") |
|
51 |
file.write("def process_file(filename):\n") |
|
52 |
file.write(" \"\"\"\n") |
|
53 |
file.write(" Method that take path to crawled file and outputs date dictionary:\n") |
|
54 |
file.write(" Date dictionary is a dictionary where keys are dates in format ddmmYYYYhh (0804201815)\n") |
|
55 |
file.write(" and value is dictionary where keys devices (specified in configuration file)\n") |
|
56 |
file.write(" and value is CSVDataLine.CSVDataLine with device,date and occurrence\n") |
|
57 |
file.write("\n") |
|
58 |
file.write(" Args:\n") |
|
59 |
file.write(" filename: name of processed file\n") |
|
60 |
file.write("\n") |
|
61 |
file.write(" Returns:\n") |
|
62 |
file.write(" None if not implemented\n") |
|
63 |
file.write(" date_dict when implemented\n") |
|
64 |
file.write(" \"\"\"\n") |
|
65 |
file.write(" date_dict = dict()\n") |
|
66 |
file.write("\n") |
|
67 |
file.write(" #with open(filename, \"r\") as file:\n") |
|
68 |
file.write(" print(\"You must implements process_file method first!\")\n") |
|
69 |
file.write(" return None\n") |
|
70 |
|
|
71 |
|
|
72 |
def create_default_crawler(dataset_name): |
|
73 |
""" |
|
74 |
Creates default crawler for dataset |
|
75 |
|
|
76 |
Args: |
|
77 |
dataset_name: Name of newly created dataset |
|
78 |
""" |
|
79 |
|
|
80 |
with open(CRAWLER_PROGRAM_PATH + "/" + dataset_name + "Crawler.py", "w") as file: |
|
81 |
file.write("# Path to crawled data\n") |
|
82 |
file.write("CRAWLED_DATA_PATH = \"CrawledData/\"\n") |
|
83 |
file.write("\n") |
|
84 |
file.write("\n") |
|
85 |
file.write("def crawl(config):\n") |
|
86 |
file.write(" \"\"\"\n") |
|
87 |
file.write(" Implement crawl method that downloads new data to path_for_files\n") |
|
88 |
file.write(" For keeping the project structure\n") |
|
89 |
file.write(" url , regex, and dataset_name from config\n") |
|
90 |
file.write(" You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py\n") |
|
91 |
file.write("\n") |
|
92 |
file.write(" Args:\n") |
|
93 |
file.write(" config: loaded configuration file of dataset\n") |
|
94 |
file.write(" \"\"\"\n") |
|
95 |
file.write(" dataset_name = config[\"dataset-name\"]\n") |
|
96 |
file.write(" url = config['url']\n") |
|
97 |
file.write(" regex = config['regex']\n") |
|
98 |
file.write(" path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'\n") |
|
99 |
file.write(" print(\"You must implements Crawl method first!\")\n") |
|
100 |
|
|
101 |
|
|
102 |
def create_ignore_file(path, text): |
|
103 |
""" |
|
104 |
Creates ignore file |
|
105 |
Args: |
|
106 |
path: path to directory for creating ignore.txt |
|
107 |
text: text that will be on first line of ignore.txt can be None |
|
108 |
""" |
|
109 |
with open(path + "/ignore.txt", "w") as file: |
|
110 |
if text is not None: |
|
111 |
file.write(text + "\n") |
|
112 |
|
|
113 |
def create_updated_file(path): |
|
114 |
""" |
|
115 |
Creates updated file |
|
116 |
Args: |
|
117 |
path: path to directory for creating updated.txt |
|
118 |
""" |
|
119 |
with open(path + "/updated.txt", "w") as file: |
|
120 |
file.write(str(0) + "\n") |
|
121 |
|
|
122 |
|
|
123 |
def prepare_dataset_structure(dataset_name): |
|
124 |
""" |
|
125 |
Prepares folders for new dataset |
|
126 |
Args: |
|
127 |
dataset_name: Name of newly created dataset |
|
128 |
""" |
|
129 |
|
|
130 |
# create folder for crawled data |
|
131 |
try: |
|
132 |
path = CRAWLED_DATA_PATH+dataset_name |
|
133 |
os.mkdir(path) |
|
134 |
create_ignore_file(path, "ignore.txt") |
|
135 |
except os.error as e: |
|
136 |
print(e) |
|
137 |
print("Creation of the directory %s failed" % path) |
|
138 |
|
|
139 |
# create folder for processed data |
|
140 |
try: |
|
141 |
path = PROCESSED_DATA_PATH + dataset_name |
|
142 |
os.mkdir(path) |
|
143 |
create_ignore_file(path, "ignore.txt") |
|
144 |
except OSError: |
|
145 |
print("Creation of the directory %s failed" % path) |
|
146 |
|
|
147 |
# create folder for crawler logs |
|
148 |
try: |
|
149 |
path = CRAWLER_LOGS_PATH + dataset_name |
|
150 |
os.mkdir(path) |
|
151 |
create_ignore_file(path, None) |
|
152 |
create_updated_file(path) |
|
153 |
except OSError: |
|
154 |
print("Creation of the directory %s failed" % path) |
|
155 |
|
|
156 |
create_default_crawler(dataset_name) |
|
157 |
create_default_processor(dataset_name) |
|
158 |
create_default_config_file(dataset_name) |
|
159 |
|
|
160 |
print("Zadejte jméno nového datasetu:\n") |
|
161 |
prepare_dataset_structure(input()) |
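
As a quick reference, a sketch of what the script above generates for a new dataset; the name DEMO is hypothetical and the direct call shown is equivalent to typing the name at the prompt:

```python
# Illustrative only. prepare_dataset_structure("DEMO") creates:
#   CrawledData/DEMO/ignore.txt       ProcessedData/DEMO/ignore.txt
#   CrawlerLogs/DEMO/ignore.txt       CrawlerLogs/DEMO/updated.txt   (contains "0")
#   DatasetCrawler/DEMOCrawler.py     DatasetProcessing/DEMOProcessor.py
#   DatasetConfigs/DEMO.yaml          (template with url, regex, update-period, devices)
prepare_dataset_structure("DEMO")
```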
modules/crawler/ProcessedData/JIS/ignore.txt | ||
---|---|---|
1 | ignore.txt
modules/crawler/ProcessedData/KOLOBEZKY/ignore.txt | ||
---|---|---|
1 | ignore.txt
modules/crawler/ProcessedData/WIFI/ignore.txt | ||
---|---|---|
1 | ignore.txt
modules/crawler/RemoveDataset.py | ||
---|---|---|
1 |
import os |
|
2 |
import shutil |
|
3 |
|
|
4 |
# Path to crawled data |
|
5 |
CRAWLED_DATA_PATH = "CrawledData/" |
|
6 |
# Path to processed data |
|
7 |
PROCESSED_DATA_PATH = "ProcessedData/" |
|
8 |
# Path to crawler logs |
|
9 |
CRAWLER_LOGS_PATH = "CrawlerLogs/" |
|
10 |
# Path to dataset configuration files |
|
11 |
CONFIG_FILES_PATH = "DatasetConfigs" |
|
12 |
# Path for DatasetCrawlers implementations |
|
13 |
CRAWLER_PROGRAM_PATH = "DatasetCrawler" |
|
14 |
# Path for DatasetProcessors implementations |
|
15 |
PROCESSOR_PROGRAM_PATH = "DatasetProcessing" |
|
16 |
|
|
17 |
|
|
18 |
def remove_dataset(dataset_name): |
|
19 |
""" |
|
20 |
Remove dataset |
|
21 |
Args: |
|
22 |
dataset_name: name of dataset that has existing configuration file |
|
23 |
""" |
|
24 |
shutil.rmtree(CRAWLED_DATA_PATH + dataset_name + "/") |
|
25 |
shutil.rmtree(PROCESSED_DATA_PATH + dataset_name + "/") |
|
26 |
shutil.rmtree(CRAWLER_LOGS_PATH + dataset_name + "/") |
|
27 |
|
|
28 |
os.remove(CONFIG_FILES_PATH + "/" + dataset_name + ".yaml") |
|
29 |
os.remove(CRAWLER_PROGRAM_PATH + "/" + dataset_name + "Crawler.py") |
|
30 |
os.remove(PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "Processor.py") |
|
31 |
|
|
32 |
print("Dataset " + dataset_name + " removed") |
|
33 |
|
|
34 |
print("Zadejte jméno Datasetu který chcete odstranit:\n") |
|
35 |
remove_dataset(input()) |
modules/crawler/RemoveDatasetDatabase.py | ||
---|---|---|
1 |
from Utilities.Database import DatabaseLoader |
|
2 |
|
|
3 |
|
|
4 |
def remove_dataset_database(dataset_name): |
|
5 |
""" |
|
6 |
Removes dataset entries from database |
|
7 |
Args: |
|
8 |
dataset_name: name of dataset that has existing configuration file |
|
9 |
""" |
|
10 |
# Creating connection |
|
11 |
mydb = DatabaseLoader.create_database_connection() |
|
12 |
|
|
13 |
# collection where are specified aviable datasets |
|
14 |
collection_datasets = mydb[DatabaseLoader.MONGODB_DATASET_COLLECTION] |
|
15 |
|
|
16 |
collection_datasets.delete_one({"name": dataset_name}) |
|
17 |
print("Removing record from DATASETS collection") |
|
18 |
|
|
19 |
|
|
20 |
# Retrieve list of all collections |
|
21 |
collections = mydb.list_collection_names() |
|
22 |
|
|
23 |
# Drop of all collections |
|
24 |
for name in collections: |
|
25 |
if name.startswith(dataset_name): |
|
26 |
mydb[name].drop() |
|
27 |
print("Dropping: " + name) |
|
28 |
|
|
29 |
|
|
30 |
print("Zadejte jméno Datasetu který chcete odstranit z databáze:\n") |
|
31 |
remove_dataset_database(input()) |
modules/crawler/ResetDatabaseData.py | ||
---|---|---|
1 |
from Utilities.Database import DatabaseLoader |
|
2 |
|
|
3 |
|
|
4 |
def clean_database(): |
|
5 |
""" |
|
6 |
Deletes all collections from database |
|
7 |
""" |
|
8 |
# Create connection |
|
9 |
mydb = DatabaseLoader.create_database_connection() |
|
10 |
|
|
11 |
# Retrieve list of all collections |
|
12 |
collections = mydb.list_collection_names() |
|
13 |
|
|
14 |
# Drop of all collections |
|
15 |
for name in collections: |
|
16 |
mydb[name].drop() |
|
17 |
|
|
18 |
print("Database Cleaned") |
|
19 |
|
|
20 |
|
|
21 |
clean_database() |
modules/crawler/ResetDataset.py | ||
---|---|---|
1 |
import os |
|
2 |
from Utilities import FolderProcessor |
|
3 |
|
|
4 |
# Path to crawled data |
|
5 |
CRAWLED_DATA_PATH = "CrawledData/" |
|
6 |
# Path to processed data |
|
7 |
PROCESSED_DATA_PATH = "ProcessedData/" |
|
8 |
# Path to crawler logs |
|
9 |
CRAWLER_LOGS_PATH = "CrawlerLogs/" |
|
10 |
# Path to dataset configuration files |
|
11 |
CONFIG_FILES_PATH = "DatasetConfigs" |
|
12 |
|
|
13 |
|
|
14 |
def create_ignore_file(path, text): |
|
15 |
""" |
|
16 |
Creates ignore file |
|
17 |
Args: |
|
18 |
path: path to directory for creating ignore.txt |
|
19 |
text: text that will be on first line of ignore.txt can be None |
|
20 |
""" |
|
21 |
with open(path + "/ignore.txt", "w") as file: |
|
22 |
if text is not None: |
|
23 |
file.write(text + "\n") |
|
24 |
|
|
25 |
|
|
26 |
def create_updated_file(path): |
|
27 |
""" |
|
28 |
Creates updated file |
|
29 |
Args: |
|
30 |
path: path to directory for creating updated.txt |
|
31 |
""" |
|
32 |
with open(path + "/updated.txt", "w") as file: |
|
33 |
file.write(str(0) + "\n") |
|
34 |
|
|
35 |
|
|
36 |
def reset_dataset(dataset_name): |
|
37 |
""" |
|
38 |
Resets all saved data in dataset except config and implementation |
|
39 |
Args: |
|
40 |
dataset_name: name of dataset that has existing configuration file |
|
41 |
""" |
|
42 |
path = CRAWLED_DATA_PATH + dataset_name + "/" |
|
43 |
FolderProcessor.clean_folder(path) |
|
44 |
create_ignore_file(path, "ignore.txt") |
|
45 |
|
|
46 |
path = PROCESSED_DATA_PATH + dataset_name + "/" |
|
47 |
FolderProcessor.clean_folder(path) |
|
48 |
create_ignore_file(path, "ignore.txt") |
|
49 |
|
|
50 |
path = CRAWLER_LOGS_PATH + dataset_name + "/" |
|
51 |
FolderProcessor.clean_folder(path) |
|
52 |
create_ignore_file(path, None) |
|
53 |
create_updated_file(path) |
|
54 |
|
|
55 |
print("Zadejte jméno Datasetu který chcete resetovat:\n") |
|
56 |
reset_dataset(input()) |
modules/crawler/ResetDatasets.py | ||
---|---|---|
1 |
import os |
|
2 |
from Utilities import FolderProcessor |
|
3 |
|
|
4 |
# Path to crawled data |
|
5 |
CRAWLED_DATA_PATH = "CrawledData/" |
|
6 |
# Path to processed data |
|
7 |
PROCESSED_DATA_PATH = "ProcessedData/" |
|
8 |
# Path to crawler logs |
|
9 |
CRAWLER_LOGS_PATH = "CrawlerLogs/" |
|
10 |
# Path to dataset configuration files |
|
11 |
CONFIG_FILES_PATH = "DatasetConfigs" |
|
12 |
|
|
13 |
|
|
14 |
def create_ignore_file(path, text): |
|
15 |
""" |
|
16 |
Creates ignore file |
|
17 |
Args: |
|
18 |
path: path to directory for creating ignore.txt |
|
19 |
text: text that will be on first line of ignore.txt can be None |
|
20 |
""" |
|
21 |
with open(path + "/ignore.txt", "w") as file: |
|
22 |
if text is not None: |
|
23 |
file.write(text + "\n") |
|
24 |
|
|
25 |
|
|
26 |
def create_updated_file(path): |
|
27 |
""" |
|
28 |
Creates updated file |
|
29 |
Args: |
|
30 |
path: path to directory for creating updated.txt |
|
31 |
""" |
|
32 |
with open(path + "/updated.txt", "w") as file: |
|
33 |
file.write(str(0) + "\n") |
|
34 |
|
|
35 |
|
|
36 |
def reset_dataset(dataset_name): |
|
37 |
""" |
|
38 |
Resets all saved data in dataset except config and implementation |
|
39 |
Args: |
|
40 |
dataset_name: name of dataset that has existing configuration file |
|
41 |
""" |
|
42 |
path = CRAWLED_DATA_PATH + dataset_name + "/" |
|
43 |
FolderProcessor.clean_folder(path) |
|
44 |
create_ignore_file(path, "ignore.txt") |
|
45 |
|
|
46 |
path = PROCESSED_DATA_PATH + dataset_name + "/" |
|
47 |
FolderProcessor.clean_folder(path) |
|
48 |
create_ignore_file(path, "ignore.txt") |
|
49 |
|
|
50 |
path = CRAWLER_LOGS_PATH + dataset_name + "/" |
|
51 |
FolderProcessor.clean_folder(path) |
|
52 |
create_ignore_file(path, None) |
|
53 |
create_updated_file(path) |
|
54 |
|
|
55 |
|
|
56 |
def reset_all_datasets(): |
|
57 |
""" |
|
58 |
Resets all saved data in all datasets with config file except configs and implementation |
|
59 |
""" |
|
60 |
datasets = os.listdir(CONFIG_FILES_PATH) |
|
61 |
|
|
62 |
for dataset in datasets: |
|
63 |
reset_dataset(dataset.split('.')[0]) |
|
64 |
|
|
65 |
|
|
66 |
reset_all_datasets() |
modules/crawler/Utilities/CSV/CSVDataLine.py | ||
---|---|---|
1 |
class CSVDataLine: |
|
2 |
""" |
|
3 |
Class that specifies the look of data line in processed csv file |
|
4 |
prepared for database |
|
5 |
""" |
|
6 |
|
|
7 |
def __init__(self, name, date, occurrence): |
|
8 |
try: |
|
9 |
test_val = int(occurrence) |
|
10 |
except ValueError: |
|
11 |
print("Occurence should be and integer value!") |
|
12 |
|
|
13 |
if len(date) != 13: |
|
14 |
raise ValueError("Invalid date format YYYY-dd-mm-hh expected!") |
|
15 |
|
|
16 |
self.name = name |
|
17 |
self.date = date |
|
18 |
self.occurrence = test_val |
|
19 |
|
|
20 |
def to_csv(self): |
|
21 |
return self.name + ";" + str(self.occurrence) + ";" + self.date |
|
22 |
|
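A minimal usage sketch of the class above; the device name is invented and the 13-character date satisfies the length check in __init__:

```python
# Illustrative only.
line = CSVDataLine("device-a", "2018-04-08-15", 3)
print(line.to_csv())   # -> "device-a;3;2018-04-08-15"
```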
modules/crawler/Utilities/CSV/CSVutils.py | ||
---|---|---|
1 |
import inspect |
|
2 |
from Utilities.CSV import CSVDataLine |
|
3 |
|
|
4 |
# Path to processed data |
|
5 |
PROCESSED_DATA_PATH = "ProcessedData/" |
|
6 |
|
|
7 |
|
|
8 |
def get_unique_names_from_file(filename, column_number): |
|
9 |
""" |
|
10 |
Extract set of unique names from file |
|
11 |
Args: |
|
12 |
filename: path to processed file |
|
13 |
column_number: unique names are expected in csv file on column_number |
|
14 |
|
|
15 |
Returns: |
|
16 |
set of unique names |
|
17 |
""" |
|
18 |
# create set of unique names |
|
19 |
name_set = set() |
|
20 |
|
|
21 |
with open(filename, "r") as file: |
|
22 |
# go through every line of line |
|
23 |
for x in file: |
|
24 |
# split by csv splitter ; |
|
25 |
array = x.split(";") |
|
26 |
# add string from chosen column to set |
|
27 |
name_set.add(array[column_number]) |
|
28 |
|
|
29 |
return name_set |
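
And a usage sketch for the helper above; the file path is hypothetical and the column index matches the name;occurrence;date layout written by CSVDataLine.to_csv():

```python
# Illustrative only: collect unique device names from column 0 of a processed file.
devices = get_unique_names_from_file("ProcessedData/JIS/example.csv", 0)
print(devices)   # e.g. {"device-a", "device-b"}
```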
Re #8116
reworked data storage
renamed files to follow Python naming conventions