Revision d6ca840d

Added by Petr Hlaváč about 4 years ago

Re #8116
Reworked saving.
Renamed files to follow the Python naming convention.

View differences:
.gitignore

 logs
-database
 website/vendor/
 /website/var
 cache

docker-compose-prod.yml

              volumes: 
                     - /etc/certificate:/certificate
                     - ./docker/nginx/sites-dev:/etc/nginx/sites-available
-                    - /acme-challenge:/var/www/symfony/public/.well-known/acme-challenge/
+                    - /acme-challenge:/var/www/symfony/public/.well-known/acme-challenge/
+       crawler:
+              volumes: 
+                     - /logs/crawler:/src/CrawlerLogs
+                     - /data/crawler:/src/CrawledData

docker-compose.yml

                         - backend
                 volumes:
                         - ./modules/crawler/:/src
-                        - ./logs/crawler:/log/
+                        - ./logs/crawler:/src/CrawlerLogs
                 container_name: "heatmap_crawler"
                 environment:
                         - TZ=Europe/Prague

modules/crawler/.gitignore

 *__pycache__*
-*.CSV
+*.CSV
+CrawlerLogs
+CrawledData
+ProcessedData

modules/crawler/CrawledData/JIS/ignore.txt

ignore.txt

modules/crawler/CrawledData/KOLOBEZKY/ignore.txt

ignore.txt

modules/crawler/CrawledData/WIFI/ignore.txt

ignore.txt

modules/crawler/CrawlerLogs/CommonRecords/.gitignore

# Ignore everything in this directory
*
# Except this file
!.gitignore

modules/crawler/CrawlerLogs/JIS/updated.txt

0

modules/crawler/CrawlerLogs/KOLOBEZKY/updated.txt

0

modules/crawler/CrawlerLogs/WIFI/updated.txt

0

modules/crawler/CroneUpdateScript.py

import Pipeline
import os

# Path to configuration files
CONFIG_FILES_PATH = "DatasetConfigs/"


def run_pipeline_for_all_datasets():
    """
    Runs whole DataScript pipeline for every dataset that has existing configuration file
    """
    files_in_dir = os.listdir(CONFIG_FILES_PATH)

    for file in files_in_dir:
        name = file.split('.')
        Pipeline.run_full_pipeline_crone(name[0])


run_pipeline_for_all_datasets()

modules/crawler/DatasetConfigs/KOLOBEZKY.yaml

   - stojan-borska:
       x: 49.734518
       y: 13.359475
 (whitespace-only change on the trailing blank line)

modules/crawler/DatasetCrawler/JISCrawler.py

from Utilities import FolderProcessor
from Utilities.Crawler import BasicCrawlerFunctions

# Path to crawled data
CRAWLED_DATA_PATH = "CrawledData/"


def crawl(config):
    """
    Implement crawl method that downloads new data to path_for_files
    For keeping the project structure
    url , regex, and dataset_name from config
    You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py

    Args:
        config: loaded configuration file of dataset
    """
    dataset_name = config["dataset-name"]
    url = config['url']
    regex = config['regex']
    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'

    first_level_links = BasicCrawlerFunctions.get_all_links(url)
    filtered_first_level_links = BasicCrawlerFunctions.filter_links(first_level_links, "^OD_ZCU")
    absolute_first_level_links = BasicCrawlerFunctions.create_absolute_links(filtered_first_level_links, url)

    files = []

    for link in absolute_first_level_links:
        second_level_links = BasicCrawlerFunctions.get_all_links(link)
        filtered_second_level_links = BasicCrawlerFunctions.filter_links(second_level_links, regex)
        absolute_second_level_links = BasicCrawlerFunctions.create_absolute_links(filtered_second_level_links, link)
        final_links = BasicCrawlerFunctions.remove_downloaded_links(absolute_second_level_links, dataset_name)

        for file_link in final_links:
            files.append(file_link)

    for file in files:
        BasicCrawlerFunctions.download_file_from_url(file, dataset_name)

    FolderProcessor.unzip_all_csv_zip_files_in_folder(path_for_files)

modules/crawler/DatasetCrawler/JIS_crawler.py

from Utilities import folder_processor
from Utilities.Crawler import basic_crawler_functions

# Path to crawled data
CRAWLED_DATA_PATH = "CrawledData/"


def crawl(config):
    """
    Implement crawl method that downloads new data to path_for_files
    For keeping the project structure
    url , regex, and dataset_name from config
    You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py

    Args:
        config: loaded configuration file of dataset
    """
    dataset_name = config["dataset-name"]
    url = config['url']
    regex = config['regex']
    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'

    first_level_links = basic_crawler_functions.get_all_links(url)
    filtered_first_level_links = basic_crawler_functions.filter_links(first_level_links, "^OD_ZCU")
    absolute_first_level_links = basic_crawler_functions.create_absolute_links(filtered_first_level_links, url)

    files = []

    for link in absolute_first_level_links:
        second_level_links = basic_crawler_functions.get_all_links(link)
        filtered_second_level_links = basic_crawler_functions.filter_links(second_level_links, regex)
        absolute_second_level_links = basic_crawler_functions.create_absolute_links(filtered_second_level_links, link)
        final_links = basic_crawler_functions.remove_downloaded_links(absolute_second_level_links, dataset_name)

        for file_link in final_links:
            files.append(file_link)

    for file in files:
        basic_crawler_functions.download_file_from_url(file, dataset_name)

    folder_processor.unzip_all_csv_zip_files_in_folder(path_for_files)

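For orientation only: the basic_crawler_functions module is not included in this diff, so the following is a rough, assumed sketch of what a link filter such as filter_links(links, "^OD_ZCU") might do; the real helper may be implemented differently, and the file name used below is invented for illustration.

# Assumed illustration only - basic_crawler_functions is not part of this diff.
import re

def filter_links(links, pattern):
    # Keep only the links whose text matches the given regular expression.
    return [link for link in links if re.search(pattern, link)]

print(filter_links(["OD_ZCU_JIS_00_2019.zip", "index.html"], "^OD_ZCU"))
# ['OD_ZCU_JIS_00_2019.zip']
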
modules/crawler/DatasetCrawler/KOLOBEZKYCrawler.py

from Utilities import FolderProcessor
from Utilities.Crawler import BasicCrawlerFunctions

# Path to crawled data
CRAWLED_DATA_PATH = "CrawledData/"


def crawl(config):
    """
    Implement crawl method that downloads new data to path_for_files
    For keeping the project structure
    url , regex, and dataset_name from config
    You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py

    Args:
        config: loaded configuration file of dataset
    """
    dataset_name = config["dataset-name"]
    url = config['url']
    regex = config['regex']
    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'

    first_level_links = BasicCrawlerFunctions.get_all_links(url)
    filtered_first_level_links = BasicCrawlerFunctions.filter_links(first_level_links, "^OD_ZCU")
    absolute_first_level_links = BasicCrawlerFunctions.create_absolute_links(filtered_first_level_links, url)

    files = []

    for link in absolute_first_level_links:
        second_level_links = BasicCrawlerFunctions.get_all_links(link)
        filtered_second_level_links = BasicCrawlerFunctions.filter_links(second_level_links, regex)
        absolute_second_level_links = BasicCrawlerFunctions.create_absolute_links(filtered_second_level_links, link)
        final_links = BasicCrawlerFunctions.remove_downloaded_links(absolute_second_level_links, dataset_name)

        for file_link in final_links:
            files.append(file_link)

    for file in files:
        BasicCrawlerFunctions.download_file_from_url(file, dataset_name)

    FolderProcessor.unzip_all_csv_zip_files_in_folder(path_for_files)

modules/crawler/DatasetCrawler/KOLOBEZKY_crawler.py

from Utilities import folder_processor
from Utilities.Crawler import basic_crawler_functions

# Path to crawled data
CRAWLED_DATA_PATH = "CrawledData/"


def crawl(config):
    """
    Implement crawl method that downloads new data to path_for_files
    For keeping the project structure
    url , regex, and dataset_name from config
    You can use already implemented functions from Utilities/Crawler/basic_crawler_functions.py

    Args:
        config: loaded configuration file of dataset
    """
    dataset_name = config["dataset-name"]
    url = config['url']
    regex = config['regex']
    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'

    first_level_links = basic_crawler_functions.get_all_links(url)
    filtered_first_level_links = basic_crawler_functions.filter_links(first_level_links, "^OD_ZCU")
    absolute_first_level_links = basic_crawler_functions.create_absolute_links(filtered_first_level_links, url)

    files = []

    for link in absolute_first_level_links:
        second_level_links = basic_crawler_functions.get_all_links(link)
        filtered_second_level_links = basic_crawler_functions.filter_links(second_level_links, regex)
        absolute_second_level_links = basic_crawler_functions.create_absolute_links(filtered_second_level_links, link)
        final_links = basic_crawler_functions.remove_downloaded_links(absolute_second_level_links, dataset_name)

        for file_link in final_links:
            files.append(file_link)

    for file in files:
        basic_crawler_functions.download_file_from_url(file, dataset_name)

    folder_processor.unzip_all_csv_zip_files_in_folder(path_for_files)

modules/crawler/DatasetCrawler/WIFICrawler.py

from Utilities import FolderProcessor
from Utilities.Crawler import BasicCrawlerFunctions

# Path to crawled data
CRAWLED_DATA_PATH = "CrawledData/"


def crawl(config):
    """
    Implement crawl method that downloads new data to path_for_files
    For keeping the project structure
    url , regex, and dataset_name from config
    You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py

    Args:
        config: loaded configuration file of dataset
    """
    dataset_name = config["dataset-name"]
    url = config['url']
    regex = config['regex']
    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'

    first_level_links = BasicCrawlerFunctions.get_all_links(url)
    filtered_first_level_links = BasicCrawlerFunctions.filter_links(first_level_links, "^OD_ZCU")
    absolute_first_level_links = BasicCrawlerFunctions.create_absolute_links(filtered_first_level_links, url)

    files = []

    for link in absolute_first_level_links:
        second_level_links = BasicCrawlerFunctions.get_all_links(link)
        filtered_second_level_links = BasicCrawlerFunctions.filter_links(second_level_links, regex)
        absolute_second_level_links = BasicCrawlerFunctions.create_absolute_links(filtered_second_level_links, link)
        final_links = BasicCrawlerFunctions.remove_downloaded_links(absolute_second_level_links, dataset_name)

        for file_link in final_links:
            files.append(file_link)

    for file in files:
        BasicCrawlerFunctions.download_file_from_url(file, dataset_name)

    FolderProcessor.unzip_all_csv_zip_files_in_folder(path_for_files)

modules/crawler/DatasetCrawler/WIFI_crawler.py

from Utilities import folder_processor
from Utilities.Crawler import basic_crawler_functions

# Path to crawled data
CRAWLED_DATA_PATH = "CrawledData/"


def crawl(config):
    """
    Implement crawl method that downloads new data to path_for_files
    For keeping the project structure
    url , regex, and dataset_name from config
    You can use already implemented functions from Utilities/Crawler/basic_crawler_functions.py

    Args:
        config: loaded configuration file of dataset
    """
    dataset_name = config["dataset-name"]
    url = config['url']
    regex = config['regex']
    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'

    first_level_links = basic_crawler_functions.get_all_links(url)
    filtered_first_level_links = basic_crawler_functions.filter_links(first_level_links, "^OD_ZCU")
    absolute_first_level_links = basic_crawler_functions.create_absolute_links(filtered_first_level_links, url)

    files = []

    for link in absolute_first_level_links:
        second_level_links = basic_crawler_functions.get_all_links(link)
        filtered_second_level_links = basic_crawler_functions.filter_links(second_level_links, regex)
        absolute_second_level_links = basic_crawler_functions.create_absolute_links(filtered_second_level_links, link)
        final_links = basic_crawler_functions.remove_downloaded_links(absolute_second_level_links, dataset_name)

        for file_link in final_links:
            files.append(file_link)

    for file in files:
        basic_crawler_functions.download_file_from_url(file, dataset_name)

    folder_processor.unzip_all_csv_zip_files_in_folder(path_for_files)

modules/crawler/DatasetProcessing/JISProcessor.py

from Utilities.CSV import CSVDataLine
from Utilities import DateFormating


def process_file(filename):
    """
    Method that take path to crawled file and outputs date dictionary:
    Date dictionary is a dictionary where keys are dates in format ddmmYYYYhh (0804201815)
    and value is dictionary where keys devices (specified in configuration file)
    and value is CSVDataLine.CSVDataLine with device,date and occurrence

    Args:
    filename: name of processed file

    Returns:
    None if not implemented
    date_dict when implemented
    """
    date_dict = dict()

    with open(filename, "r", encoding="utf-8") as file:

        for line in file:

            array = line.split(";")

            date = DateFormating.date_time_formatter(array[1][1:-1])
            name = array[0][1:-1]
            occurrence = array[2][:-1]

            if date not in date_dict:
                date_dict[date] = dict()

            if name in date_dict[date]:
                date_dict[date][name].occurrence += int(occurrence)
            else:
                date_dict[date][name] = CSVDataLine.CSVDataLine(name, date, occurrence)

    return date_dict

modules/crawler/DatasetProcessing/JIS_processor.py

from Utilities.CSV import csv_data_line
from Utilities import date_formating


def process_file(filename):
    """
    Method that take path to crawled file and outputs date dictionary:
    Date dictionary is a dictionary where keys are dates in format ddmmYYYYhh (0804201815)
    and value is dictionary where keys are devices (specified in configuration file)
    and value is CSVDataLine.csv_data_line with device,date and occurrence

    Args:
    filename: name of processed file

    Returns:
    None if not implemented
    date_dict when implemented
    """
    date_dict = dict()

    with open(filename, "r", encoding="utf-8") as file:

        for line in file:

            array = line.split(";")

            date = date_formating.date_time_formatter(array[1][1:-1])
            name = array[0][1:-1]
            occurrence = array[2][:-1]

            if date not in date_dict:
                date_dict[date] = dict()

            if name in date_dict[date]:
                date_dict[date][name].occurrence += int(occurrence)
            else:
                date_dict[date][name] = csv_data_line.CSVDataLine(name, date, occurrence)

    return date_dict

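To make the structure described in the docstring above concrete, here is a small illustrative sketch (not part of the commit) of the nested date_dict that process_file returns and how a CSVDataLine row serializes. The real date keys are produced by date_formating.date_time_formatter, which is not shown in this diff, so the 13-character key and the device names below are only assumptions.

# Illustration only - not part of the commit; device names and the date key are assumed.
from Utilities.CSV import csv_data_line

date_dict = {
    "2018-04-08-15": {  # one hourly bucket
        "device-A": csv_data_line.CSVDataLine("device-A", "2018-04-08-15", 12),
        "device-B": csv_data_line.CSVDataLine("device-B", "2018-04-08-15", 3),
    }
}

for hour, devices in date_dict.items():
    for data_line in devices.values():
        print(data_line.to_csv())  # e.g. "device-A;12;2018-04-08-15"
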
modules/crawler/DatasetProcessing/KOLOBEZKYProcessor.py

from Utilities.CSV import CSVDataLine
from Utilities import DateFormating


def process_file(filename):
    """
    Method that take path to crawled file and outputs date dictionary:
    Date dictionary is a dictionary where keys are dates in format ddmmYYYYhh (0804201815)
    and value is dictionary where keys devices (specified in configuration file)
    and value is CSVDataLine.CSVDataLine with device,date and occurrence

    Args:
    filename: name of processed file

    Returns:
    None if not implemented
    date_dict when implemented
    """
    date_dict = dict()

    with open(filename, "r") as file:

        for line in file:

            array = line.split(";")

            date = DateFormating.date_time_formatter(array[0][1:-1])
            name = array[1][1:-1]

            if date not in date_dict:
                date_dict[date] = dict()

            if name in date_dict[date]:
                date_dict[date][name].occurrence += 1
            else:
                date_dict[date][name] = CSVDataLine.CSVDataLine(name, date, 1)

    return date_dict

modules/crawler/DatasetProcessing/KOLOBEZKY_processor.py

from Utilities.CSV import csv_data_line
from Utilities import date_formating


def process_file(filename):
    """
    Method that take path to crawled file and outputs date dictionary:
    Date dictionary is a dictionary where keys are dates in format ddmmYYYYhh (0804201815)
    and value is dictionary where keys are devices (specified in configuration file)
    and value is CSVDataLine.csv_data_line with device,date and occurrence

    Args:
    filename: name of processed file

    Returns:
    None if not implemented
    date_dict when implemented
    """
    date_dict = dict()

    with open(filename, "r") as file:

        for line in file:

            array = line.split(";")

            date = date_formating.date_time_formatter(array[0][1:-1])
            name = array[1][1:-1]

            if date not in date_dict:
                date_dict[date] = dict()

            if name in date_dict[date]:
                date_dict[date][name].occurrence += 1
            else:
                date_dict[date][name] = csv_data_line.CSVDataLine(name, date, 1)

    return date_dict

modules/crawler/DatasetProcessing/WIFIProcessor.py

from Utilities.CSV import CSVDataLine
from Utilities import DateFormating


def process_file(filename):
    """
    Method that take path to crawled file and outputs date dictionary:
    Date dictionary is a dictionary where keys are dates in format ddmmYYYYhh (0804201815)
    and value is dictionary where keys devices (specified in configuration file)
    and value is CSVDataLine.CSVDataLine with device,date and occurrence

    Args:
    filename: name of processed file

    Returns:
    None if not implemented
    date_dict when implemented
    """
    date_dict = dict()

    with open(filename, "r", encoding="utf-8") as file:

        for line in file:

            array = line.split(";")

            date = DateFormating.date_time_formatter(array[4][1:-2])
            name = array[1][1:-1]
            occurrence = array[0]

            if date not in date_dict:
                date_dict[date] = dict()

            if name in date_dict[date]:
                date_dict[date][name].occurrence += max(date_dict[date][name].occurrence,int(occurrence))
            else:
                date_dict[date][name] = CSVDataLine.CSVDataLine(name, date, int(occurrence))

    return date_dict

modules/crawler/DatasetProcessing/WIFI_processor.py

from Utilities.CSV import csv_data_line
from Utilities import date_formating


def process_file(filename):
    """
    Method that take path to crawled file and outputs date dictionary:
    Date dictionary is a dictionary where keys are dates in format ddmmYYYYhh (0804201815)
    and value is dictionary where keys are devices (specified in configuration file)
    and value is CSVDataLine.csv_data_line with device,date and occurrence

    Args:
    filename: name of processed file

    Returns:
    None if not implemented
    date_dict when implemented
    """
    date_dict = dict()

    with open(filename, "r", encoding="utf-8") as file:

        for line in file:

            array = line.split(";")

            date = date_formating.date_time_formatter(array[4][1:-2])
            name = array[1][1:-1]
            occurrence = array[0]

            if date not in date_dict:
                date_dict[date] = dict()

            if name in date_dict[date]:
                date_dict[date][name].occurrence += max(date_dict[date][name].occurrence,int(occurrence))
            else:
                date_dict[date][name] = csv_data_line.CSVDataLine(name, date, int(occurrence))

    return date_dict

modules/crawler/ForceUpdateDataset.py

import Pipeline
import os

print("Zadejte jméno Datasetu který chcete upadtovat:\n")
Pipeline.run_full_pipeline(input())

modules/crawler/ForceUpdateDatasets.py

import Pipeline
import os

# Path to configuration files
CONFIG_FILES_PATH = "DatasetConfigs/"


def run_pipeline_for_all_datasets():
    """
    Runs whole DataScript pipeline for every dataset that has existing configuration file
    """
    files_in_dir = os.listdir(CONFIG_FILES_PATH)

    for file in files_in_dir:
        name = file.split('.')
        Pipeline.run_full_pipeline(name[0])


run_pipeline_for_all_datasets()

modules/crawler/Pipeline.py

from Utilities import FolderProcessor, ConfigureFunctions
from Utilities.Database import DatabaseLoader
from Utilities.CSV import CSVutils

import logging
from datetime import date


# Path to crawled data
CRAWLED_DATA_PATH = "CrawledData/"
# Path to processed data
PROCESSED_DATA_PATH = "ProcessedData/"
# Path to crawler logs
CRAWLER_LOGS_PATH = "CrawlerLogs/"
# Path to dataset crawler implementations
CRAWLER_LIB_PATH = "DatasetCrawler."
# Path to dataset processor implementations
PROCESSOR_LIB_PATH = "DatasetProcessing."


#logger
logging.basicConfig(filename=CRAWLER_LOGS_PATH + "CommonRecords/" + 'Applicationlog-' + date.today().strftime("%b-%Y") + '.log',
                    level=logging.INFO,
                    format='%(asctime)s %(message)s'
                    )


def check_last_update(config):
    """
    Loads integer from updated.txt in CrawlerLogs/"dataset_name"
    representing number of days from last update if number equals
    number in confing update period updates it and reset number of
    days to zero else increment the number

    Arguments:
        config loaded configuration file of dataset

    Returns:
       True if updating
       Else if incementing days from last update
    """
    dataset_name = config["dataset-name"]

    with open(CRAWLER_LOGS_PATH + dataset_name + "/" + "updated.txt", "r+") as file:
        last_update = file.read()
        last_update = int(last_update)
        file.seek(0)

        confing_update_period = int(config["update-period"])

        if config["update-period"] <= last_update:
            logging.info("Dataset " + dataset_name + " is being updated today")
            file.write("0")
            file.truncate()
            return True
        else:
            last_update_days = last_update + 1
            logging.info("Dataset " + dataset_name + " will be updated in " + str(confing_update_period - last_update_days) + "days")
            file.write(str(last_update_days))
            file.truncate()
            return False



def crawl_data(config):
    """
      Imports dataset crawler in DatasetCrawler/"dataset_name"Crawler.py
      runs crawler.

    Args:
        config: loaded configuration file of dataset
    """
    dataset_name = config["dataset-name"]

    crawl_func = __import__(CRAWLER_LIB_PATH + dataset_name + "Crawler", globals(), locals(), ['crawl']).crawl
    crawl_func(config)

    dataset_name += '/'


def process_data(dataset_name):
    """
    Goes trough every not processed file(not contained in CrawledData/"dataset_name"/ignore.txt)
    Imports dataset processor in DatasetProcessing/"dataset_name"Processor.py
    Runs processor on every file
    After successful processing updates ignore.txt

    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    dataset_path = dataset_name + '/'

    process_file_func = __import__(PROCESSOR_LIB_PATH + dataset_name + "Processor", globals(), locals(),
                                   ['process_file']).process_file

    not_processed_files = FolderProcessor.list_of_all_files(CRAWLED_DATA_PATH + dataset_path)
    logging.info(dataset_name + " has downloaded " + str(len(not_processed_files)) + " new files")

    for not_processed_file in not_processed_files:
        path = CRAWLED_DATA_PATH + dataset_path + not_processed_file
        date_dic = process_file_func(path)
        CSVutils.export_data_to_csv(path, date_dic)
        FolderProcessor.update_ignore_set(CRAWLED_DATA_PATH + dataset_path, not_processed_file)

    logging.info(dataset_name + " has processed " + str(len(not_processed_files)) + " newly crawled files")


def validate_process_data(config):
    """
    Function goes through newly processed data and checks theirs status

    Args:
        config: loaded configuration file of dataset

    Returns:
        boolean variable TRUE/FALSE.
        Data processed correctly - TRUE
        Wrong format or NEW unknown devices - FALSE
    """
    processed_devices_set = FolderProcessor.get_devices_set(PROCESSED_DATA_PATH + config["dataset-name"] + '/')
    unknown_devices_set = FolderProcessor.get_unknown_devices_set(config, processed_devices_set)
    unknown_devices_size = len(unknown_devices_set)

    if unknown_devices_size != 0:
        logging.info("There is " + str(unknown_devices_size) + " unknown devices")
        logging.info("Adding devices to " + config["dataset-name"] + " config file")
        ConfigureFunctions.update_configuration(config["dataset-name"], unknown_devices_set)
        return False

    for device in config["devices"]:
        device = config["devices"][device]
        if device["x"] == "UNKNOWN!" or device["y"] == "UNKNOWN!":
            logging.info(config["dataset-name"] + " config file contains devices with UNKOWN! values please update them!!")
            return False

    return True


def load_data_to_database(config):
    """
    Goes trough every not loaded file(not contained in ProcessedData/ignore.txt)
    loads data appends coordination from configurations
    and exports it into the database
    After successful exporting updates ignore.txt

    Args:
        config: loaded configuration file of dataset
    """
    dataset_name = config["dataset-name"]
    dataset_path = dataset_name + '/'

    # get all unprocessed files from dataset
    not_loaded_files = FolderProcessor.list_of_all_files(PROCESSED_DATA_PATH + dataset_path)

    database_connection = DatabaseLoader.create_database_connection()

    DatabaseLoader.check_or_update_datasets_collection(database_connection,config)

    DatabaseLoader.update_devices_collection(database_connection,config)


    # load every file
    for not_loaded_file in not_loaded_files:
        #check if file is not in database already if it is skip
        test = DatabaseLoader.check_if_database_doesnt_contain_file(database_connection,dataset_name,not_loaded_file)
        if test == False:
            logging.info(dataset_name + " could not load " + not_loaded_file + " to database because he is already there.")
            FolderProcessor.update_ignore_set(PROCESSED_DATA_PATH + dataset_path, not_loaded_file)
            continue
        # load processed data
        processed_data = DatabaseLoader.get_data_from_file(not_loaded_file, config)
        # load processed data to database
        DatabaseLoader.load_data_to_database(database_connection, dataset_name, processed_data, not_loaded_file)
        FolderProcessor.update_ignore_set(PROCESSED_DATA_PATH + dataset_path, not_loaded_file)

    logging.info(dataset_name + " has loaded to database " + str(len(not_loaded_files)) + " newly processed files.")


def load_data_to_database_crone(config):
    """
    Goes trough every not loaded file(not contained in ProcessedData/ignore.txt)
    loads data appends coordination from configurations
    and exports it into the database
    After successful exporting updates ignore.txt

    Args:
        config: loaded configuration file of dataset
    """
    dataset_name = config["dataset-name"]
    dataset_path = dataset_name + '/'

    # get all unprocessed files from dataset
    not_loaded_files = FolderProcessor.list_of_all_files(PROCESSED_DATA_PATH + dataset_path)

    database_connection = DatabaseLoader.create_database_connection()

    # load every file
    for not_loaded_file in not_loaded_files:
        # load processed data
        processed_data = DatabaseLoader.get_data_from_file(not_loaded_file, config)
        # load processed data to database
        DatabaseLoader.load_data_to_database(database_connection, dataset_name, processed_data, not_loaded_file)
        FolderProcessor.update_ignore_set(PROCESSED_DATA_PATH + dataset_path, not_loaded_file)

    logging.info(dataset_name + " has loaded to database " + str(len(not_loaded_files)) + " newly processed files.")


def run_full_pipeline(dataset_name):
    """
    Loads config file and starts full pipeline
    -crawl data
    -process data
    -load data to database

    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    logging.info("Starting pipeline for dataset " + dataset_name)

    config = ConfigureFunctions.load_configuration(dataset_name)
    crawl_data(config)
    process_data(config["dataset-name"])

    validation_test = validate_process_data(config)

    if validation_test:
        load_data_to_database(config)


def run_full_pipeline_crone(dataset_name):
    """
    Loads config file and starts full pipeline
    -crawl data
    -process data
    -load data to database

    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    logging.info("Starting pipeline for dataset " + dataset_name)

    config = ConfigureFunctions.load_configuration(dataset_name)
    update_test = check_last_update(config)
    if update_test:
        crawl_data(config)
        process_data(config["dataset-name"])

        validation_test = validate_process_data(config)

        if validation_test:
            load_data_to_database_crone(config)

modules/crawler/PrepareNewDataset.py

import os

# Path to crawled data
CRAWLED_DATA_PATH = "CrawledData/"
# Path to processed data
PROCESSED_DATA_PATH = "ProcessedData/"
# Path to crawler logs
CRAWLER_LOGS_PATH = "CrawlerLogs/"
# Path for DatasetCrawlers implementations
CRAWLER_PROGRAM_PATH = "DatasetCrawler"
# Path for DatasetProcessors implementations
PROCESSOR_PROGRAM_PATH = "DatasetProcessing"
# Path to dataset configuration files
CONFIG_FILES_PATH = "DatasetConfigs"


def create_default_config_file(dataset_name):
    """
    Creates default config file

    Args:
        dataset_name: Name of newly created dataset
    """
    with open(CONFIG_FILES_PATH + "/" + dataset_name + ".yaml", "w") as file:
        file.write("# jmeno datasetu, pod kterym bude zobrazen v aplikaci\n")
        file.write("dataset-name: " + dataset_name + "\n")
        file.write("# jednoslovný název datasetu, pod kterym bude reprezentovana v architektuře\n")
        file.write("dataset-name: " + dataset_name + "\n")
        file.write("# root slozka, ktera obsahuje odkazy na dataset\n")
        file.write("url: ZDE VLOZTE URL\n")
        file.write("# volitelny parameter, ktery specifikuje vzor jmrna datasetu, ktera se budou stahovat\n")
        file.write("regex: ZDE VLOZTE REGEX\n")
        file.write("# volitelny parametr, ktery udava jak casto se budou hledat nove datasety, pokud prazdne, "
                   "tak defaultni hodnota (dny)\n")
        file.write("update-period: ZDE VLOZTE HODNOTU\n")
        file.write("# pozice jednotlivych zarizeni, ktera jsou v datasetu\n")
        file.write("devices:\n")


def create_default_processor(dataset_name):
    """
    Creates default processor for dataset

    Args:
        dataset_name: Name of newly created dataset
    """
    with open(PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "Processor.py", "w") as file:
        file.write("from Utilities.CSV import CSVDataLine")
        file.write("\n")
        file.write("\n")
        file.write("def process_file(filename):\n")
        file.write("    \"\"\"\n")
        file.write("    Method that take path to crawled file and outputs date dictionary:\n")
        file.write("    Date dictionary is a dictionary where keys are dates in format ddmmYYYYhh (0804201815)\n")
        file.write("    and value is dictionary where keys devices (specified in configuration file)\n")
        file.write("    and value is CSVDataLine.CSVDataLine with device,date and occurrence\n")
        file.write("\n")
        file.write("    Args:\n")
        file.write("    filename: name of processed file\n")
        file.write("\n")
        file.write("    Returns:\n")
        file.write("    None if not implemented\n")
        file.write("    date_dict when implemented\n")
        file.write("    \"\"\"\n")
        file.write("    date_dict = dict()\n")
        file.write("\n")
        file.write("    #with open(filename, \"r\") as file:\n")
        file.write("    print(\"You must implements process_file method first!\")\n")
        file.write("    return None\n")


def create_default_crawler(dataset_name):
    """
    Creates default crawler for dataset

    Args:
        dataset_name: Name of newly created dataset
    """

    with open(CRAWLER_PROGRAM_PATH + "/" + dataset_name + "Crawler.py", "w") as file:
        file.write("# Path to crawled data\n")
        file.write("CRAWLED_DATA_PATH = \"CrawledData/\"\n")
        file.write("\n")
        file.write("\n")
        file.write("def crawl(config):\n")
        file.write("    \"\"\"\n")
        file.write("    Implement crawl method that downloads new data to path_for_files\n")
        file.write("    For keeping the project structure\n")
        file.write("    url , regex, and dataset_name from config\n")
        file.write("    You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py\n")
        file.write("\n")
        file.write("    Args:\n")
        file.write("        config: loaded configuration file of dataset\n")
        file.write("    \"\"\"\n")
        file.write("    dataset_name = config[\"dataset-name\"]\n")
        file.write("    url = config['url']\n")
        file.write("    regex = config['regex']\n")
        file.write("    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'\n")
        file.write("    print(\"You must implements Crawl method first!\")\n")


def create_ignore_file(path, text):
    """
    Creates ignore file
    Args:
        path: path to directory for creating ignore.txt
        text: text that will be on first line of ignore.txt can be None
    """
    with open(path + "/ignore.txt", "w") as file:
        if text is not None:
            file.write(text + "\n")

def create_updated_file(path):
    """
    Creates updated file
    Args:
        path: path to directory for creating updated.txt
    """
    with open(path + "/updated.txt", "w") as file:
            file.write(str(0) + "\n")


def prepare_dataset_structure(dataset_name):
    """
    Prepares folders for new dataset
    Args:
        dataset_name: Name of newly created dataset
    """

    # create folder for crawled data
    try:
        path = CRAWLED_DATA_PATH+dataset_name
        os.mkdir(path)
        create_ignore_file(path, "ignore.txt")
    except os.error as e:
        print(e)
        print("Creation of the directory %s failed" % path)

    # create folder for processed data
    try:
        path = PROCESSED_DATA_PATH + dataset_name
        os.mkdir(path)
        create_ignore_file(path, "ignore.txt")
    except OSError:
        print("Creation of the directory %s failed" % path)

    # create folder for crawler logs
    try:
        path = CRAWLER_LOGS_PATH + dataset_name
        os.mkdir(path)
        create_ignore_file(path, None)
        create_updated_file(path)
    except OSError:
        print("Creation of the directory %s failed" % path)

    create_default_crawler(dataset_name)
    create_default_processor(dataset_name)
    create_default_config_file(dataset_name)

print("Zadejte jméno nového datasetu:\n")
prepare_dataset_structure(input())

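The file names generated here follow the same convention that Pipeline.py relies on when it resolves a dataset's crawler and processor dynamically. A minimal sketch of that lookup, using importlib instead of the __import__ call in Pipeline.py (not part of the commit; "JIS" is just an example dataset name):

# Sketch of the naming convention only - not part of the commit.
import importlib

def load_crawl_function(dataset_name):
    # For dataset "JIS" this resolves DatasetCrawler/JISCrawler.py, the same kind of
    # module that create_default_crawler() above generates for a new dataset.
    module = importlib.import_module("DatasetCrawler." + dataset_name + "Crawler")
    return module.crawl

crawl_func = load_crawl_function("JIS")  # equivalent to the lookup in Pipeline.crawl_data
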
modules/crawler/ProcessedData/JIS/ignore.txt

ignore.txt

modules/crawler/ProcessedData/KOLOBEZKY/ignore.txt

ignore.txt

modules/crawler/ProcessedData/WIFI/ignore.txt

ignore.txt

modules/crawler/RemoveDataset.py

import os
import shutil

# Path to crawled data
CRAWLED_DATA_PATH = "CrawledData/"
# Path to processed data
PROCESSED_DATA_PATH = "ProcessedData/"
# Path to crawler logs
CRAWLER_LOGS_PATH = "CrawlerLogs/"
# Path to dataset configuration files
CONFIG_FILES_PATH = "DatasetConfigs"
# Path for DatasetCrawlers implementations
CRAWLER_PROGRAM_PATH = "DatasetCrawler"
# Path for DatasetProcessors implementations
PROCESSOR_PROGRAM_PATH = "DatasetProcessing"


def remove_dataset(dataset_name):
    """
    Remove dataset
    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    shutil.rmtree(CRAWLED_DATA_PATH + dataset_name + "/")
    shutil.rmtree(PROCESSED_DATA_PATH + dataset_name + "/")
    shutil.rmtree(CRAWLER_LOGS_PATH + dataset_name + "/")

    os.remove(CONFIG_FILES_PATH + "/" + dataset_name + ".yaml")
    os.remove(CRAWLER_PROGRAM_PATH + "/" + dataset_name + "Crawler.py")
    os.remove(PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "Processor.py")

    print("Dataset " + dataset_name + " removed")

print("Zadejte jméno Datasetu který chcete odstranit:\n")
remove_dataset(input())

modules/crawler/RemoveDatasetDatabase.py

from Utilities.Database import DatabaseLoader


def remove_dataset_database(dataset_name):
    """
    Removes dataset entries from database
    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    # Creating connection
    mydb = DatabaseLoader.create_database_connection()

    # collection where are specified aviable datasets
    collection_datasets = mydb[DatabaseLoader.MONGODB_DATASET_COLLECTION]

    collection_datasets.delete_one({"name": dataset_name})
    print("Removing record from DATASETS collection")


    # Retrieve list of all collections
    collections = mydb.list_collection_names()

    # Drop of all collections
    for name in collections:
        if name.startswith(dataset_name):
            mydb[name].drop()
            print("Dropping: " + name)


print("Zadejte jméno Datasetu který chcete odstranit z databáze:\n")
remove_dataset_database(input())

modules/crawler/ResetDatabaseData.py

from Utilities.Database import DatabaseLoader


def clean_database():
    """
    Deletes all collections from database
    """
    # Create connection
    mydb = DatabaseLoader.create_database_connection()

    # Retrieve list of all collections
    collections = mydb.list_collection_names()

    # Drop of all collections
    for name in collections:
        mydb[name].drop()

    print("Database Cleaned")


clean_database()

modules/crawler/ResetDataset.py

import os
from Utilities import FolderProcessor

# Path to crawled data
CRAWLED_DATA_PATH = "CrawledData/"
# Path to processed data
PROCESSED_DATA_PATH = "ProcessedData/"
# Path to crawler logs
CRAWLER_LOGS_PATH = "CrawlerLogs/"
# Path to dataset configuration files
CONFIG_FILES_PATH = "DatasetConfigs"


def create_ignore_file(path, text):
    """
    Creates ignore file
    Args:
        path: path to directory for creating ignore.txt
        text: text that will be on first line of ignore.txt can be None
    """
    with open(path + "/ignore.txt", "w") as file:
        if text is not None:
            file.write(text + "\n")


def create_updated_file(path):
    """
    Creates updated file
    Args:
        path: path to directory for creating updated.txt
    """
    with open(path + "/updated.txt", "w") as file:
            file.write(str(0) + "\n")


def reset_dataset(dataset_name):
    """
    Resets all saved data in dataset except config and implementation
    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    path = CRAWLED_DATA_PATH + dataset_name + "/"
    FolderProcessor.clean_folder(path)
    create_ignore_file(path, "ignore.txt")

    path = PROCESSED_DATA_PATH + dataset_name + "/"
    FolderProcessor.clean_folder(path)
    create_ignore_file(path, "ignore.txt")

    path = CRAWLER_LOGS_PATH + dataset_name + "/"
    FolderProcessor.clean_folder(path)
    create_ignore_file(path, None)
    create_updated_file(path)

print("Zadejte jméno Datasetu který chcete resetovat:\n")
reset_dataset(input())

modules/crawler/ResetDatasets.py

import os
from Utilities import FolderProcessor

# Path to crawled data
CRAWLED_DATA_PATH = "CrawledData/"
# Path to processed data
PROCESSED_DATA_PATH = "ProcessedData/"
# Path to crawler logs
CRAWLER_LOGS_PATH = "CrawlerLogs/"
# Path to dataset configuration files
CONFIG_FILES_PATH = "DatasetConfigs"


def create_ignore_file(path, text):
    """
    Creates ignore file
    Args:
        path: path to directory for creating ignore.txt
        text: text that will be on first line of ignore.txt can be None
    """
    with open(path + "/ignore.txt", "w") as file:
        if text is not None:
            file.write(text + "\n")


def create_updated_file(path):
    """
    Creates updated file
    Args:
        path: path to directory for creating updated.txt
    """
    with open(path + "/updated.txt", "w") as file:
            file.write(str(0) + "\n")


def reset_dataset(dataset_name):
    """
    Resets all saved data in dataset except config and implementation
    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    path = CRAWLED_DATA_PATH + dataset_name + "/"
    FolderProcessor.clean_folder(path)
    create_ignore_file(path, "ignore.txt")

    path = PROCESSED_DATA_PATH + dataset_name + "/"
    FolderProcessor.clean_folder(path)
    create_ignore_file(path, "ignore.txt")

    path = CRAWLER_LOGS_PATH + dataset_name + "/"
    FolderProcessor.clean_folder(path)
    create_ignore_file(path, None)
    create_updated_file(path)


def reset_all_datasets():
    """
    Resets all saved data in all datasets with config file except configs and implementation
    """
    datasets = os.listdir(CONFIG_FILES_PATH)

    for dataset in datasets:
        reset_dataset(dataset.split('.')[0])


reset_all_datasets()

modules/crawler/Utilities/CSV/CSVDataLine.py

class CSVDataLine:
    """
    Class that specifies the look of data line in processed csv file
    prepared for database
    """

    def __init__(self, name, date, occurrence):
        try:
            test_val = int(occurrence)
        except ValueError:
            print("Occurence should be and integer value!")

        if len(date) != 13:
            raise ValueError("Invalid date format YYYY-dd-mm-hh expected!")

        self.name = name
        self.date = date
        self.occurrence = test_val

    def to_csv(self):
        return self.name + ";" + str(self.occurrence) + ";" + self.date

modules/crawler/Utilities/CSV/CSVutils.py

import inspect
from Utilities.CSV import CSVDataLine

# Path to processed data
PROCESSED_DATA_PATH = "ProcessedData/"


def get_unique_names_from_file(filename, column_number):
    """
        Extract set of unique names from file
    Args:
        filename: path to processed file
        column_number: unique names are expected in csv file on column_number

    Returns:
        set of unique names
    """
    # create set of unique names
    name_set = set()

    with open(filename, "r") as file:
        # go through every line of line
        for x in file:
            # split by csv splitter ;
            array = x.split(";")
            # add string from chosen column to set
            name_set.add(array[column_number])

    return name_set
... The diff has been truncated because its length exceeds the maximum limit.

Also available: Unified diff