
Revision 04a2b5a4

Added by Petr Hlaváč about 4 years ago

Re #7939
- added documentation for methods and classes
- fixed errors in variable names
- added information for the generated scripts

View differences:

python-module/DatasetConfigs/JIS.yaml
192 192
      x: UNKNOWN!
193 193
      y: UNKNOWN!
194 194

  
195
  - US 005 - závora vjezd:
196
      x: UNKNOWN!
197
      y: UNKNOWN!
198

  
199
  - US 005 - m?? vjezd:
200
      x: UNKNOWN!
201
      y: UNKNOWN!
202

  
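These UNKNOWN! placeholders are what ConfigureFunctions.update_configuration (further down in this revision) appends for devices that appear in the data but are missing from the configuration; the x and y map coordinates are meant to be filled in by hand before the data is loaded into the database. As a rough sketch of the resulting structure, assuming the entries sit under a top-level devices key and with a shortened name and invented coordinates for illustration, such an entry parses with the same PyYAML call the loader uses:

import yaml

# Hypothetical completed device entry; the name is shortened and the
# coordinates are invented for illustration only.
EXAMPLE = """
devices:
  - US 005 - vjezd:
      x: 13.3731
      y: 49.7231
"""

config = yaml.load(EXAMPLE, Loader=yaml.FullLoader)
print(config["devices"][0])
# {'US 005 - vjezd': {'x': 13.3731, 'y': 49.7231}}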
python-module/DatasetCrawler/JISCrawler.py
1 1
from Utilities import FolderProcessor
2 2
from Utilities.Crawler import BasicCrawler
3 3

  
4
# Path to crawled data
5
CRAWLED_DATA_PATH = "CrawledData/"
4 6

  
5
def crawl(config):
6 7

  
8
def crawl(config):
9
    """
10
    Implement crawl method that downloads new data to path_for_files
11
    For keeping the project structure
12
    url , regex, and dataset_name from config
13
    You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py
14

  
15
    Args:
16
        config: loaded configuration file of dataset
17
    """
7 18
    dataset_name = config["dataset-name"]
8 19
    url = config['url']
9 20
    regex = config['regex']
21
    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'
10 22

  
11 23
    first_level_links = BasicCrawler.get_all_links(url)
12 24
    filtered_first_level_links = BasicCrawler.filter_links(first_level_links, "^OD_ZCU")
......
24 36
            files.append(file_link)
25 37

  
26 38
    for file in files:
27
        BasicCrawler.download_file_from_url(file, "CrawledData/" + dataset_name + "/", dataset_name)
39
        BasicCrawler.download_file_from_url(file, path_for_files, dataset_name)
28 40

  
29
    FolderProcessor.unzip_all_csv_zip_files_in_folder("CrawledData/" + dataset_name + "/")
41
    FolderProcessor.unzip_all_csv_zip_files_in_folder(path_for_files)
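For reference, the crawl routine in this and the two crawlers below chains the helper functions named in the new docstring: collect links from the archive page, keep the ones matching the dataset regex, make them absolute, skip what was already downloaded, download the rest and unzip it. The middle of the function is elided from the hunk above, so the following is only a sketch of that traversal; it mirrors the calls that are visible (including the BasicCrawler import, even though this revision moves the helpers to BasicCrawlerFunctions.py) and the "^OD_ZCU" first-level filter.

from Utilities import FolderProcessor
from Utilities.Crawler import BasicCrawler

# Path to crawled data
CRAWLED_DATA_PATH = "CrawledData/"


def crawl(config):
    dataset_name = config["dataset-name"]
    url = config['url']
    regex = config['regex']
    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'

    # First level: archive folders (OD_ZCU_...), second level: files matching
    # the dataset regex.  This traversal is an assumption; the committed code
    # in the elided part of the hunk may differ.
    first_level_links = BasicCrawler.get_all_links(url)
    filtered_first_level_links = BasicCrawler.filter_links(first_level_links, "^OD_ZCU")
    absolute_first_level_links = BasicCrawler.create_absolute_links(filtered_first_level_links, url)

    files = []
    for link in absolute_first_level_links:
        second_level_links = BasicCrawler.get_all_links(link)
        filtered_second_level_links = BasicCrawler.filter_links(second_level_links, regex)
        for file_link in BasicCrawler.create_absolute_links(filtered_second_level_links, link):
            files.append(file_link)

    # Drop links recorded in CrawlerLogs/<dataset>/ignore.txt by earlier runs.
    files = BasicCrawler.remove_downloaded_links(files, dataset_name)

    for file in files:
        BasicCrawler.download_file_from_url(file, path_for_files, dataset_name)

    FolderProcessor.unzip_all_csv_zip_files_in_folder(path_for_files)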
python-module/DatasetCrawler/KOLOBEZKYCrawler.py
1 1
from Utilities import FolderProcessor
2 2
from Utilities.Crawler import BasicCrawler
3 3

  
4
# Path to crawled data
5
CRAWLED_DATA_PATH = "CrawledData/"
4 6

  
5
def crawl(config):
6 7

  
8
def crawl(config):
9
    """
10
    Implement crawl method that downloads new data to path_for_files
11
    For keeping the project structure
12
    url , regex, and dataset_name from config
13
    You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py
14

  
15
    Args:
16
        config: loaded configuration file of dataset
17
    """
7 18
    dataset_name = config["dataset-name"]
8 19
    url = config['url']
9 20
    regex = config['regex']
21
    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'
10 22

  
11 23
    first_level_links = BasicCrawler.get_all_links(url)
12 24
    filtered_first_level_links = BasicCrawler.filter_links(first_level_links, "^OD_ZCU")
python-module/DatasetCrawler/WIFICrawler.py
1 1
from Utilities import FolderProcessor
2 2
from Utilities.Crawler import BasicCrawler
3 3

  
4
# Path to crawled data
5
CRAWLED_DATA_PATH = "CrawledData/"
4 6

  
5
def crawl(config):
6 7

  
8
def crawl(config):
9
    """
10
    Implement crawl method that downloads new data to path_for_files
11
    For keeping the project structure
12
    url , regex, and dataset_name from config
13
    You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py
14

  
15
    Args:
16
        config: loaded configuration file of dataset
17
    """
7 18
    dataset_name = config["dataset-name"]
8 19
    url = config['url']
9 20
    regex = config['regex']
21
    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'
10 22

  
11 23
    first_level_links = BasicCrawler.get_all_links(url)
12 24
    filtered_first_level_links = BasicCrawler.filter_links(first_level_links, "^OD_ZCU")
......
24 36
            files.append(file_link)
25 37

  
26 38
    for file in files:
27
        BasicCrawler.download_file_from_url(file, "CrawledData/" + dataset_name + "/", dataset_name)
39
        BasicCrawler.download_file_from_url(file, path_for_files, dataset_name)
28 40

  
29
    FolderProcessor.unzip_all_csv_zip_files_in_folder("CrawledData/" + dataset_name + "/")
41
    FolderProcessor.unzip_all_csv_zip_files_in_folder(path_for_files)
python-module/DatasetProcessing/JISProcessor.py
3 3

  
4 4

  
5 5
def process_file(filename):
6

  
6
    """
7
    Method that take path to crawled file and outputs date dictionary using method:
8
    CSVutils.export_data_to_csv(filename, date_dict)
9
    Date dictionary is a dictionary where keys are dates in format ddmmYYYYhh (0804201815)
10
    and value is dictionary where keys devices (specified in configuration file)
11
    and value is CSVDataLine.CSVDataLine with device,date and occurrence
12

  
13
    Args:
14
        filename: name of processed file
15

  
16
    Returns:
17
        False if not implemented
18
        True when implemented
19
    """
7 20
    with open(filename, "r", encoding="utf-8") as file:
8 21

  
9 22
        date_dict = dict()
......
12 25

  
13 26
            array = line.split(";")
14 27

  
15
            date = DateFormating.date_time_formater(array[1][1:-1])
28
            date = DateFormating.date_time_formatter(array[1][1:-1])
16 29
            name = array[0][1:-1]
17 30
            occurence = array[2][:-1]
18 31

  
......
24 37
            else:
25 38
                date_dict[date][name] = CSVDataLine.CSVDataLine(name, date, int(occurence))
26 39

  
27
        CSVutils.export_data_to_csv(filename, date_dict)
40
    CSVutils.export_data_to_csv(filename, date_dict)
41
    return True
42

  
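The date dictionary described in the new docstring (here and in the two processors below) is keyed by ddmmYYYYhh strings, and each value maps a device name to one CSVDataLine. A minimal sketch of the shape handed to CSVutils.export_data_to_csv, with a device name and counts invented for illustration and assuming the project packages are importable:

from Utilities.CSV import CSVDataLine

# date_dict[date][device] -> CSVDataLine(device, date, occurrence)
date_dict = {
    "0804201815": {
        "US 005 - vjezd": CSVDataLine.CSVDataLine("US 005 - vjezd", "0804201815", 3),
    },
    "0804201816": {
        "US 005 - vjezd": CSVDataLine.CSVDataLine("US 005 - vjezd", "0804201816", 1),
    },
}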
python-module/DatasetProcessing/KOLOBEZKYProcessor.py
3 3

  
4 4

  
5 5
def process_file(filename):
6

  
6
    """
7
    Method that take path to crawled file and outputs date dictionary using method:
8
    CSVutils.export_data_to_csv(filename, date_dict)
9
    Date dictionary is a dictionary where keys are dates in format ddmmYYYYhh (0804201815)
10
    and value is dictionary where keys devices (specified in configuration file)
11
    and value is CSVDataLine.CSVDataLine with device,date and occurrence
12

  
13
    Args:
14
        filename: name of processed file
15

  
16
    Returns:
17
        False if not implemented
18
        True when implemented
19
    """
7 20
    with open(filename, "r") as file:
8 21

  
9 22
        date_dict = dict()
......
12 25

  
13 26
            array = line.split(";")
14 27

  
15
            date = DateFormating.date_time_formater(array[0][1:-1])
28
            date = DateFormating.date_time_formatter(array[0][1:-1])
16 29
            name = array[1][1:-1]
17 30

  
18 31
            if date not in date_dict:
......
23 36
            else:
24 37
                date_dict[date][name] = CSVDataLine.CSVDataLine(name, date, 1)
25 38

  
26
        CSVutils.export_data_to_csv(filename, date_dict)
39
    CSVutils.export_data_to_csv(filename, date_dict)
40
    return True
41

  
python-module/DatasetProcessing/WIFIProcessor.py
3 3

  
4 4

  
5 5
def process_file(filename):
6

  
6
    """
7
    Method that take path to crawled file and outputs date dictionary using method:
8
    CSVutils.export_data_to_csv(filename, date_dict)
9
    Date dictionary is a dictionary where keys are dates in format ddmmYYYYhh (0804201815)
10
    and value is dictionary where keys devices (specified in configuration file)
11
    and value is CSVDataLine.CSVDataLine with device,date and occurrence
12

  
13
    Args:
14
        filename: name of processed file
15

  
16
    Returns:
17
        False if not implemented
18
        True when implemented
19
    """
7 20
    with open(filename, "r", encoding="utf-8") as file:
8 21

  
9 22
        date_dict = dict()
......
12 25

  
13 26
            array = line.split(";")
14 27

  
15
            date = DateFormating.date_time_formater(array[4][1:-2])
28
            date = DateFormating.date_time_formatter(array[4][1:-2])
16 29
            name = array[1][1:-1]
17 30
            occurence = array[0]
18 31

  
......
24 37
            else:
25 38
                date_dict[date][name] = CSVDataLine.CSVDataLine(name, date, int(occurence))
26 39

  
27
        CSVutils.export_data_to_csv(filename, date_dict)
40
    CSVutils.export_data_to_csv(filename, date_dict)
41
    return True
42

  
python-module/Pipeline.py
1 1
from Utilities import FolderProcessor, ConfigureFunctions
2 2
from Utilities.Database import DatabaseLoader
3 3

  
4

  
5
CONFIG_FILES_PATH = "DatasetConfigs/"
4
# Path to crawled data
6 5
CRAWLED_DATA_PATH = "CrawledData/"
6
# Path to processed data
7 7
PROCESSED_DATA_PATH = "ProcessedData/"
8
# Path to dataset crawler implementations
8 9
CRAWLER_LIB_PATH = "DatasetCrawler."
10
# Path to dataset processor implementations
9 11
PROCESSOR_LIB_PATH = "DatasetProcessing."
10 12

  
11 13

  
12 14
def crawl_data(config):
15
    """
16
      Imports dataset crawler in DatasetCrawler/"dataset_name"Crawler.py
17
      runs crawler.
13 18

  
19
    Args:
20
        config: loaded configuration file of dataset
21
    """
14 22
    dataset_name = config["dataset-name"]
15 23

  
16
    my_function = __import__(CRAWLER_LIB_PATH + dataset_name + "Crawler",globals(),locals(),['crawl']).crawl
24
    my_function = __import__(CRAWLER_LIB_PATH + dataset_name + "Crawler", globals(), locals(), ['crawl']).crawl
17 25
    my_function(config)
18 26

  
19 27
    dataset_name += '/'
20 28

  
21 29

  
22 30
def process_data(dataset_name):
31
    """
32
    Goes trough every not processed file(not contained in CrawledData/"dataset_name"/ignore.txt)
33
    Imports dataset processor in DatasetProcessing/"dataset_name"Processor.py
34
    Runs processor on every file
35
    After successful processing updates ignore.txt
36

  
37
    Args:
38
        dataset_name: name of dataset that has existing configuration file
39
    """
23 40
    dataset_path = dataset_name + '/'
24 41

  
25
    process_file_func = __import__(PROCESSOR_LIB_PATH + dataset_name + "Processor",globals(),locals(),
42
    process_file_func = __import__(PROCESSOR_LIB_PATH + dataset_name + "Processor", globals(), locals(),
26 43
                                   ['process_file']).process_file
27 44

  
28
    # get all not processed files from dataset
29 45
    not_processed_files = FolderProcessor.list_of_all_files(CRAWLED_DATA_PATH + dataset_path)
30 46

  
31
    # process every file
32 47
    for not_processed_file in not_processed_files:
33
        # call processing for one file in dataset
34 48
        process_file_func(CRAWLED_DATA_PATH + dataset_path + not_processed_file)
35 49
        FolderProcessor.update_ignore_set(CRAWLED_DATA_PATH + dataset_path, not_processed_file)
36 50

  
37 51

  
38 52
def validate_process_data(config):
53
    """
54
    Function goes through newly processed data and checks theirs status
55

  
56
    Args:
57
        config: loaded configuration file of dataset
58

  
59
    Returns:
60
        boolean variable TRUE/FALSE.
61
        Data processed correctly - TRUE
62
        Wrong format or NEW unknown devices - FALSE
63
    """
39 64
    processed_devices_set = FolderProcessor.get_devices_set(PROCESSED_DATA_PATH + config["dataset-name"] + '/')
40
    unknown_devices_set = FolderProcessor.get_unknown_devices_set(config,processed_devices_set)
65
    unknown_devices_set = FolderProcessor.get_unknown_devices_set(config, processed_devices_set)
41 66
    unknown_devices_size = len(unknown_devices_set)
42 67

  
43 68
    if unknown_devices_size != 0:
44 69
        print("There is " + str(unknown_devices_size) + " unknown devies")
45
        ConfigureFunctions.update_configuration(CONFIG_FILES_PATH + config["dataset-name"] + ".yaml", unknown_devices_set)
70
        ConfigureFunctions.update_configuration(config["dataset-name"], unknown_devices_set)
46 71
        return False
47 72

  
48 73

  
49 74
def load_data_to_database(config):
50

  
75
    """
76
    Goes trough every not loaded file(not contained in ProcessedData/ignore.txt)
77
    loads data appends coordination from configurations
78
    and exports it into the database
79
    After successful exporting updates ignore.txt
80

  
81
    Args:
82
        config: loaded configuration file of dataset
83
    """
51 84
    dataset_name = config["dataset-name"]
52 85
    dataset_path = dataset_name + '/'
53 86

  
......
57 90
    # load every file
58 91
    for not_loaded_file in not_loaded_files:
59 92
        # load processed data
60
        processed_data = DatabaseLoader.get_data_from_file(PROCESSED_DATA_PATH + dataset_path + not_loaded_file,
61
                                                           config["devices"])
93
        processed_data = DatabaseLoader.get_data_from_file(not_loaded_file, config)
62 94
        # load processed data to database
63 95
        DatabaseLoader.load_data_to_database(dataset_name, processed_data)
64 96
        FolderProcessor.update_ignore_set(PROCESSED_DATA_PATH + dataset_path, not_loaded_file)
65 97

  
66 98

  
67 99
def run_full_pipeline(dataset_name):
68
    config = ConfigureFunctions.load_configuration(CONFIG_FILES_PATH + dataset_name)
100
    """
101
    Loads config file and starts full pipeline
102
    -crawl data
103
    -process data
104
    -load data to database
105

  
106
    Args:
107
        dataset_name: name of dataset that has existing configuration file
108
    """
109
    config = ConfigureFunctions.load_configuration(dataset_name)
69 110
    crawl_data(config)
70 111
    process_data(config["dataset-name"])
71 112

  
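crawl_data and process_data resolve the per-dataset module purely by name, so adding a dataset never touches Pipeline.py: the crawler for "JIS" is looked up as DatasetCrawler.JISCrawler and its crawl function is called with the loaded config. The __import__ call used above is roughly equivalent to this importlib sketch (standard library API; "JIS" is just an example name):

import importlib

CRAWLER_LIB_PATH = "DatasetCrawler."
dataset_name = "JIS"

# Roughly equivalent to:
#   __import__(CRAWLER_LIB_PATH + dataset_name + "Crawler", globals(), locals(), ['crawl']).crawl
crawler_module = importlib.import_module(CRAWLER_LIB_PATH + dataset_name + "Crawler")
crawl = crawler_module.crawl   # DatasetCrawler/JISCrawler.py must define crawl(config)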
python-module/Scripts/PrepareNewDataset.py
1 1
import os
2 2

  
3
# Path to crawled data
3 4
CRAWLED_DATA_PATH = "../CrawledData/"
5
# Path to processed data
4 6
PROCESSED_DATA_PATH = "../ProcessedData/"
7
# Path to crawler logs
5 8
CRAWLER_LOGS_PATH = "../CrawlerLogs/"
9
# Path for DatasetCrawlers implementations
6 10
CRAWLER_PROGRAM_PATH = "../DatasetCrawler"
11
# Path for DatasetProcessors implementations
7 12
PROCESSOR_PROGRAM_PATH = "../DatasetProcessing"
13
# Path to dataset configuration files
8 14
CONFIG_FILES_PATH = "../DatasetConfigs"
9 15

  
10 16

  
11 17
def create_default_config_file(dataset_name):
18
    """
19
    Creates default config file
12 20

  
21
    Args:
22
        dataset_name: Name of newly created dataset
23
    """
13 24
    with open(CONFIG_FILES_PATH + "/" + dataset_name + ".yaml", "w") as file:
14 25
        file.write("# jmeno datasetu, pod kterym bude zobrazen v aplikaci\n")
15 26
        file.write("dataset-name: " + dataset_name + "\n")
16 27
        file.write("# root slozka, ktera obsahuje odkazy na dataset\n")
17
        file.write("url: ZDE VLOZTE URL/\n")
28
        file.write("url: ZDE VLOZTE URL\n")
18 29
        file.write("# volitelny parameter, ktery specifikuje vzor jmrna datasetu, ktera se budou stahovat\n")
19 30
        file.write("regex: ZDE VLOZTE REGEX\n")
20 31
        file.write("# volitelny parametr, ktery udava jak casto se budou hledat nove datasety, pokud prazdne, "
......
25 36

  
26 37

  
27 38
def create_default_processor(dataset_name):
39
    """
40
    Creates default processor for dataset
41

  
42
    Args:
43
        dataset_name: Name of newly created dataset
44
    """
28 45
    with open(PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "Processor.py", "w") as file:
46
        file.write("from Utilities.CSV import CSVDataLine, CSVutils")
47
        file.write("\n")
48
        file.write("\n")
29 49
        file.write("def process_file(filename):\n")
50
        file.write("    \"\"\"\n")
51
        file.write("    Method that take path to crawled file and outputs date dictionary using method:\n")
52
        file.write("    CSVutils.export_data_to_csv(filename, date_dict)\n")
53
        file.write("    Date dictionary is a dictionary where keys are dates in format ddmmYYYYhh (0804201815)\n")
54
        file.write("    and value is dictionary where keys devices (specified in configuration file)\n")
55
        file.write("    and value is CSVDataLine.CSVDataLine with device,date and occurrence\n")
56
        file.write("\n")
57
        file.write("    Args:\n")
58
        file.write("    filename: name of processed file\n")
59
        file.write("\n")
60
        file.write("    Returns:\n")
61
        file.write("    False if not implemented\n")
62
        file.write("    True when implemented\n")
63
        file.write("    \"\"\"\n")
30 64
        file.write("    print(\"You must implements process_file method first!\")\n")
65
        file.write("    #CSVutils.export_data_to_csv(filename, date_dict)\n")
66
        file.write("    return False\n")
31 67

  
32 68

  
33 69
def create_default_crawler(dataset_name):
70
    """
71
    Creates default crawler for dataset
72

  
73
    Args:
74
        dataset_name: Name of newly created dataset
75
    """
34 76

  
35 77
    with open(CRAWLER_PROGRAM_PATH + "/" + dataset_name + "Crawler.py", "w") as file:
78
        file.write("# Path to crawled data\n")
79
        file.write("CRAWLED_DATA_PATH = \"CrawledData/\"\n")
80
        file.write("\n")
81
        file.write("\n")
36 82
        file.write("def crawl(config):\n")
83
        file.write("    \"\"\"\n")
84
        file.write("    Implement crawl method that downloads new data to path_for_files\n")
85
        file.write("    For keeping the project structure\n")
86
        file.write("    url , regex, and dataset_name from config\n")
87
        file.write("    You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py\n")
88
        file.write("\n")
89
        file.write("    Args:\n")
90
        file.write("        config: loaded configuration file of dataset\n")
91
        file.write("    \"\"\"\n")
92
        file.write("    dataset_name = config[\"dataset-name\"]\n")
93
        file.write("    url = config['url']\n")
94
        file.write("    regex = config['regex']\n")
95
        file.write("    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'\n")
37 96
        file.write("    print(\"You must implements Crawl method first!\")\n")
38 97

  
39 98

  
40
def create_ignore_file(path,text):
41

  
99
def create_ignore_file(path, text):
100
    """
101
    Creates ignore file
102
    Args:
103
        path: path to directory for creating ignore.txt
104
        text: text that will be on first line of ignore.txt can be None
105
    """
42 106
    with open(path + "/ignore.txt", "w") as file:
43 107
        if text is not None:
44 108
            file.write(text + "\n")
45 109

  
46 110

  
47 111
def prepare_dataset_structure(dataset_name):
112
    """
113
    Prepares folders for new dataset
114
    Args:
115
        dataset_name: Name of newly created dataset
116
    """
48 117
    jump_folder = "../"
49 118

  
50 119
    # create folder for crawled data
51 120
    try:
52 121
        path = CRAWLED_DATA_PATH+dataset_name
53 122
        os.mkdir(path)
54
        create_ignore_file(path,"ignore.txt")
123
        create_ignore_file(path, "ignore.txt")
55 124
    except os.error as e:
56 125
        print(e)
57 126
        print("Creation of the directory %s failed" % path)
......
77 146
    create_default_config_file(dataset_name)
78 147

  
79 148

  
80
prepare_dataset_structure("WIFI")
149
prepare_dataset_structure("TEST")
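The script scaffolds everything a new dataset needs: folders with ignore.txt files, a default YAML configuration, and skeleton crawler and processor modules whose generated bodies match the docstrings added elsewhere in this revision. A hedged usage sketch; it has to be run from python-module/Scripts/ so the "../" paths resolve, and PARKING is a made-up dataset name:

import PrepareNewDataset   # importing already runs prepare_dataset_structure("TEST") in this revision

# Scaffold an additional, hypothetical dataset.
PrepareNewDataset.prepare_dataset_structure("PARKING")
# Expected outputs (some are created in the part of the file elided from this hunk):
# CrawledData/PARKING/, ProcessedData/PARKING/, DatasetCrawler/PARKINGCrawler.py,
# DatasetProcessing/PARKINGProcessor.py and DatasetConfigs/PARKING.yaml.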
python-module/Utilities/CSV/CSVDataLine.py
1 1
class CSVDataLine:
2

  
3
    def __init__(self, name, date, occurence):
2
    """
3
    Class that specifies the look of data line in processed csv file
4
    prepared for database
5
    """
6
    def __init__(self, name, date, occurrence):
4 7
        self.name = name
5 8
        self.date = date
6
        self.occurence = occurence
9
        self.occurrence = occurrence
7 10

  
8 11
    def to_csv(self):
9
        return self.name + ";" + str(self.occurence) + ";" + self.date
12
        return self.name + ";" + str(self.occurrence) + ";" + self.date
10 13

  
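to_csv serialises a line as name;occurrence;date, which is the column order the processors write and DatabaseLoader.get_data_from_file reads back. A small usage sketch with invented values:

from Utilities.CSV import CSVDataLine

line = CSVDataLine.CSVDataLine("US 005 - vjezd", "0804201815", 3)
print(line.to_csv())   # -> US 005 - vjezd;3;0804201815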
python-module/Utilities/CSV/CSVutils.py
1
# Path to processed data
1 2
PROCESSED_DATA_PATH = "ProcessedData/"
2 3

  
4

  
3 5
def get_unique_names_from_file(filename, column_number):
6
    """
7

  
8
    Args:
9
        filename:
10
        column_number:
11

  
12
    Returns:
13

  
14
    """
4 15
    f = open(filename, "r")
5 16

  
6 17
    # create set of unique names
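The docstring of get_unique_names_from_file is still empty and its body is cut off in this hunk; judging by the call CSVutils.get_unique_names_from_file(path + file_path, 0) in FolderProcessor.get_devices_set, it collects the unique values of one column of a semicolon-separated file. A hedged sketch of that behaviour, not the committed implementation:

def get_unique_names_from_file(filename, column_number):
    unique_names = set()
    with open(filename, "r") as f:
        for line in f:
            unique_names.add(line.split(";")[column_number])
    return unique_names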
python-module/Utilities/ConfigureFunctions.py
1 1
import yaml
2 2

  
3
# Path to dataset configuration files
4
CONFIG_FILES_PATH = "DatasetConfigs/"
5
# Config file type
6
CONFIG_FILE_TYPE = ".yaml"
3 7

  
4
def load_configuration(configure_file_name):
5 8

  
6
    with open(configure_file_name) as f:
9
def load_configuration(dataset_name):
10
    """
11
    Loads yaml configuration file into memory
12

  
13
    Args:
14
        dataset_name: name of dataset that has existing configuration file
15

  
16
    Returns:
17
        yaml configuration file as dictionary
18
    """
19
    with open(CONFIG_FILES_PATH + dataset_name + CONFIG_FILE_TYPE, "r") as f:
7 20
        data = yaml.load(f, Loader=yaml.FullLoader)
8 21

  
9 22
    devices_dic = dict()
......
17 30
    return data
18 31

  
19 32

  
20
def update_configuration(configure_file_name, new_devices):
33
def update_configuration(dataset_name, new_devices):
34
    """
35
    Open dataset and appends new_devices to the end
21 36

  
22
    with open(configure_file_name, "a") as file:
37
    Args:
38
        dataset_name: name of dataset that has existing configuration file
39
        new_devices: list or set of new devices for dataset
40
    """
41
    with open(CONFIG_FILES_PATH + dataset_name + CONFIG_FILE_TYPE, "a") as file:
23 42
        for device in new_devices:
24 43
            file.write("  - "+device+":\n")
25 44
            file.write("      x: UNKNOWN!\n")
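With the new CONFIG_FILES_PATH and CONFIG_FILE_TYPE constants, callers pass only the dataset name: load_configuration("JIS") reads DatasetConfigs/JIS.yaml and update_configuration("JIS", ...) appends placeholder entries like the ones in the JIS.yaml hunk at the top of this revision. A short usage sketch, assuming it runs from the python-module root so the relative path resolves; the added device name is hypothetical:

from Utilities import ConfigureFunctions

config = ConfigureFunctions.load_configuration("JIS")   # reads DatasetConfigs/JIS.yaml
print(config["dataset-name"], len(config["devices"]))

# Appends "  - <device>:" blocks with x and y set to UNKNOWN! to the YAML file.
ConfigureFunctions.update_configuration("JIS", {"US 999 - novy senzor"})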
python-module/Utilities/Crawler/BasicCrawler.py
1
import requests
2
import re
3
from Utilities import FolderProcessor
4
from bs4 import BeautifulSoup
5

  
6

  
7
def get_all_links(url):
8
    # create response object
9
    r = requests.get(url)
10

  
11
    # create beautiful-soup object
12
    soup = BeautifulSoup(r.content, 'html5lib')
13
    links = []
14

  
15
    for link in soup.findAll('a'):
16
        links.append(link.get('href'))
17

  
18
    return links
19

  
20

  
21
def filter_links(links, regex):
22
    fitlered_links = []
23

  
24
    for link in links:
25
        if re.search(regex,link):
26
            fitlered_links.append(link)
27

  
28
    return fitlered_links
29

  
30

  
31
def create_absolute_links(links, archive):
32
    absolute_links = []
33

  
34
    for link in links:
35
        absolute_links.append(archive + link)
36

  
37
    return absolute_links
38

  
39

  
40
def remove_downloaded_links(links,dataset_name):
41

  
42
    downloaded_links = FolderProcessor.load_ignore_set("CrawlerLogs/" + dataset_name + "/")
43
    final_links = set(links) - downloaded_links
44

  
45
    return final_links
46

  
47

  
48
def download_file_from_url(url,path, dataset_name):
49
    r = requests.get(url, stream=True)
50

  
51
    url_parts = url.split("/")
52
    file_name = url_parts[len(url_parts)-1]
53

  
54
    with open(path + file_name, "wb") as file:
55
        for chunk in r.iter_content(chunk_size=1024):
56

  
57
            # writing one chunk at a time to pdf file
58
            if chunk:
59
                file.write(chunk)
60

  
61
    FolderProcessor.update_ignore_set("CrawlerLogs/" + dataset_name + "/", url)
python-module/Utilities/Crawler/BasicCrawlerFunctions.py
1
import requests
2
import re
3
from Utilities import FolderProcessor
4
from bs4 import BeautifulSoup
5

  
6
# Path to crawler logs
7
CRAWLER_LOGS_PATH = "CrawlerLogs/"
8

  
9

  
10
def get_all_links(url):
11
    """
12
    Sends http request to url, downloads all data,
13
    extract links
14

  
15
    Args:
16
        url: url of website we want to search
17

  
18
    Returns:
19
        list of all links
20
    """
21
    # create response object
22
    r = requests.get(url)
23

  
24
    # create beautiful-soup object
25
    soup = BeautifulSoup(r.content, 'html5lib')
26
    links = []
27

  
28
    for link in soup.findAll('a'):
29
        links.append(link.get('href'))
30

  
31
    return links
32

  
33

  
34
def filter_links(links, regex):
35
    """
36
    Filters list of links using regex
37

  
38
    Args:
39
        links: list of links
40
        regex: regex used for filtering
41

  
42
    Returns:
43
        filtered list of links
44
    """
45
    filtered_links = []
46

  
47
    for link in links:
48
        if re.search(regex, link):
49
            filtered_links.append(link)
50

  
51
    return filtered_links
52

  
53

  
54
def create_absolute_links(links, archive):
55
    """
56
        Appends archive path to every link in links
57
    Args:
58
        links: list of relative links
59
        archive: archive url
60

  
61
    Returns:
62
        list of absolute links
63
    """
64
    absolute_links = []
65

  
66
    for link in links:
67
        absolute_links.append(archive + link)
68

  
69
    return absolute_links
70

  
71

  
72
def remove_downloaded_links(links, dataset_name):
73
    """
74
    Loads already downloaded links from CRAWLER_LOGS_PATH ignore.txt
75
    Args:
76
        links: list of links
77
        dataset_name: name of dataset that has existing configuration file
78

  
79
    Returns:
80
        List of links without already downloaded links
81
    """
82
    downloaded_links = FolderProcessor.load_ignore_set(CRAWLER_LOGS_PATH + dataset_name + "/")
83
    final_links = set(links) - downloaded_links
84

  
85
    return final_links
86

  
87

  
88
def download_file_from_url(url, dataset_name):
89
    """
90
    Downloads file on provided url and saves it to path
91
    Args:
92
        url: url file we want to download
93
        dataset_name: name of dataset that has existing configuration file
94
    """
95
    r = requests.get(url, stream=True)
96

  
97
    # splits url and extract last part that contains filename
98
    url_parts = url.split("/")
99
    file_name = url_parts[len(url_parts)-1]
100

  
101
    path = CRAWLER_LOGS_PATH + dataset_name + '/'
102

  
103
    # download file chunk by chunk so we can download large files
104
    with open(path + file_name, "wb") as file:
105
        for chunk in r.iter_content(chunk_size=1024):
106

  
107
            # writing one chunk at a time to file
108
            if chunk:
109
                file.write(chunk)
110

  
111
    # after successful download update list of already downloaded files
112
    FolderProcessor.update_ignore_set(CRAWLER_LOGS_PATH + dataset_name + "/", url)
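Compared with the removed BasicCrawler.py, download_file_from_url now derives the target folder from dataset_name itself (it builds the path from CRAWLER_LOGS_PATH), so callers pass only the url and the dataset name. A short hedged sketch against the new signatures; the archive URL and file name are illustrative, and it assumes CrawlerLogs/JIS/ignore.txt already exists:

from Utilities.Crawler import BasicCrawlerFunctions

links = BasicCrawlerFunctions.filter_links(
    ["OD_ZCU_JIS_2018_08.zip", "index.html"], "^OD_ZCU")       # keeps only the OD_ZCU_ entry
links = BasicCrawlerFunctions.create_absolute_links(links, "https://example.org/archiv/")
links = BasicCrawlerFunctions.remove_downloaded_links(links, "JIS")

for link in links:
    BasicCrawlerFunctions.download_file_from_url(link, "JIS")  # new two-argument signature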
python-module/Utilities/Database/DatabaseDataLine.py
1 1
class DatabaseDataLine:
2

  
3
    def __init__(self, name, longitude, latitude, date, occurence):
2
    """
3
    Class that specifies the look of data line in database
4
    """
5
    def __init__(self, name, longitude, latitude, date, occurrence):
4 6
        self.name = name
5 7
        self.latitude = latitude
6 8
        self.longitude = longitude
7 9
        self.date = date
8
        self.occurence = occurence
10
        self.occurrence = occurrence
9 11

  
10 12
    def to_dictionary(self):
11
        return {"place": self.name, "x": self.longitude, "y": self.latitude, "number": self.occurence, "date": self.date}
13
        return {"place": self.name, "x": self.longitude, "y": self.latitude, "number": self.occurrence, "date": self.date}
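to_dictionary produces the document shape that DatabaseLoader inserts into MongoDB. A small sketch with invented coordinates:

from Utilities.Database import DatabaseDataLine

line = DatabaseDataLine.DatabaseDataLine("US 005 - vjezd", 13.3731, 49.7231, "0804201815", 3)
print(line.to_dictionary())
# {'place': 'US 005 - vjezd', 'x': 13.3731, 'y': 49.7231, 'number': 3, 'date': '0804201815'}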
python-module/Utilities/Database/DatabaseLoader.py
1 1
from Utilities.Database import DatabaseDataLine
2 2
import pymongo
3 3

  
4
# specify mongodb connection
5
MONGODB_CONNECTION = "mongodb://localhost:27017/"
6
# mongodb account name
7
MONGODB_ACC_NAME = "root"
8
# mongodb account password
9
MONGODB_ACC_PASSWORD = "root"
10
# mongodb data database
11
MONGODB_DATA_DATABASE = "DATA"
12
# mongodb collection with aviable datasets
13
MONGODB_DATASET_COLLECTION = "DATASETS"
14

  
15
# Path to processed data
16
PROCESSED_DATA_PATH = "ProcessedData/"
17

  
18

  
19
def create_database_connection():
20
    """
21
    Creates connection to mongoDB
22
    
23
    Returns:
24
        Connection to mongoDB
25
    """
26
    client = pymongo.MongoClient(MONGODB_CONNECTION)
4 27

  
5
def get_data_from_file(filename, devices):
6
    f = open(filename, "r")
28
    # Authenticating
29
    client.admin.authenticate(MONGODB_ACC_NAME, MONGODB_ACC_PASSWORD)
30

  
31
    database = client[MONGODB_DATA_DATABASE]
32

  
33
    return database
34

  
35

  
36
def get_data_from_file(filename, config):
37
    """
38
        Opens processed file, reads it line by line
39
        name, ocurrence, date
40
        searches name in config and adds device map coordinates
41
        than creates a dictionary with date without hours as key
42
        and list of data lines as value.
43
    Args:
44
        filename: name of processed file
45
        config: loaded configuration file of dataset
46

  
47
    Returns:
48
        dictionary with date without hours as key
49
        and list of Datalines as value
50
    """
51
    dataset_name = config["dataset-name"]
52
    dataset_path = PROCESSED_DATA_PATH + dataset_name + '/'
7 53

  
54
    f = open(dataset_path + filename, "r")
55

  
56
    devices = config["devices"]
8 57
    date_dict = dict()
9 58

  
10 59
    for line in f:
11
        # remove \n
12 60
        line = line[:-1]
13
        # split by csv splitter ;
14

  
15
        csv_collum = line.split(";")
16 61

  
17
        name = csv_collum[0]
18
        occurence = csv_collum[1]
19
        date = csv_collum[2]
62
        csv_column = line.split(";")
20 63

  
21
        date_without_hours = date[:-2]
64
        name = csv_column[0]
65
        occurrence = csv_column[1]
66
        date = csv_column[2]
22 67

  
23 68
        database_data_line = DatabaseDataLine.DatabaseDataLine(name, devices[name]["x"]
24
                                                               , devices[name]["y"], date, occurence)
69
                                                               , devices[name]["y"], date, occurrence)
25 70

  
71
        # if you want to change table split by hours or months change this
72
        date_without_hours = date[:-2]
26 73
        if date_without_hours not in date_dict:
27 74
            date_dict[date_without_hours] = list()
28 75

  
29
        date_dict[date_without_hours].append(database_data_line.to_dictionary())
76
        date_dict[date_without_hours].append(database_data_line.to_dictionary)
30 77

  
31 78
    return date_dict
32 79

  
33 80

  
34 81
def load_data_to_database(dataset_name, data_dic):
35
    myclient = pymongo.MongoClient("mongodb://localhost:27017/");
36

  
37
    # Authenticating
38
    myclient.admin.authenticate('root', 'root');
82
    """
83
    Takes data_dic created in method get_data_from_file
84
    and loads into into database where collection name is dataset_name + data_dic key
85
    and data lines are line in collection
39 86

  
40
    # Database DATA
41
    mydb = myclient["DATA"]
87
    Args:
88
        dataset_name: name of dataset that has existing configuration file
89
        data_dic: dictionary of data lines created in get_data_from_file
90
    """
91
    database = create_database_connection()
42 92

  
43
    # Collection Datasets
44
    collection_datasets = mydb["DATASETS"]
93
    # collection where are specified aviable datasets
94
    collection_datasets = database[MONGODB_DATASET_COLLECTION]
45 95

  
96
    # check if newly added data already have a dataset specified in collection
46 97
    dataset_present = collection_datasets.find_one({}, {'name': dataset_name})
47 98

  
48 99
    if dataset_present is None:
49 100
        collection_datasets.insert_one({'name': dataset_name})
50 101

  
51 102
    for date in data_dic:
52
        dataset_collections = mydb[dataset_name]
103
        dataset_collections = database[dataset_name]
53 104
        dataset_collections.insert_one({'name': dataset_name+date})
54
        date_dataset = mydb[dataset_name + date]
105
        date_dataset = database[dataset_name + date]
55 106
        date_dataset.insert_many(data_dic[date])
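load_data_to_database registers each dataset in the DATASETS collection and then writes one collection per dataset and day, named dataset_name + ddmmYYYY. A hedged sketch of reading that layout back with pymongo, using the same connection constants and credentials introduced above; the collection name uses an example date:

import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017/")
client.admin.authenticate("root", "root")        # same call and credentials the loader uses

database = client["DATA"]
print(database["DATASETS"].distinct("name"))     # registered dataset names, e.g. ['JIS']

for document in database["JIS" + "08042018"].find():   # one collection per dataset and day
    print(document["place"], document["date"], document["number"])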
python-module/Utilities/DateFormating.py
1
def date_formater(string_date):
1
def date_formatter(string_date):
2
    """
3

  
4
    Args:
5
        string_date: string containing date in format 22.08.2018 12:27:00
6

  
7
    Returns:
8
        string of date in format 0804201814 ddmmYYYY
9
    """
2 10
    if string_date[11].isspace():
3 11
        pos = 0
4 12
        srr = ""
......
18 26
    return return_date
19 27

  
20 28

  
21
def date_time_formater(string_date):
29
def date_time_formatter(string_date):
30
    """
31
    Converts one type of date format "dd.mm.yyyy hh.mm.ss" to date format ddmmYYYYhh
32
    Args:
33
        string_date: string containing date in format 22.08.2018 12:27:00
34

  
35
    Returns:
36
        string of date in format 0804201814 ddmmYYYYhh
37
    """
22 38
    if string_date[11].isspace():
23 39
        pos = 0
24 40
        srr = ""
......
35 51

  
36 52
    return_date = string_date[:2] + string_date[3:5] + string_date[6:10] + string_date[11:13]
37 53

  
38
    return return_date
54
    return return_date
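For an input whose day already has two digits the character positions line up directly, so the renamed date_time_formatter just rearranges the slices shown above. A quick check of the documented behaviour (assumes the Utilities package is importable):

from Utilities import DateFormating

print(DateFormating.date_time_formatter("22.08.2018 12:27:00"))   # should print 2208201812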
python-module/Utilities/FolderProcessor.py
1 1
import os
2 2
import zipfile
3
from CSV import CSVutils
3 4

  
4 5

  
5 6
def list_of_all_files(path):
7
    """
8
    Get all files from directory and all files written in ignore.txt
9
    and return the difference
10
    Args:
11
        path: path to Directory
12

  
13
    Returns:
14
        list with names of all files in directory
15
    """
6 16
    files_in_dir = os.listdir(path)
7 17

  
8 18
    ignore_set = load_ignore_set(path)
......
11 21

  
12 22

  
13 23
def load_ignore_set(path):
24
    """
25
    Reads ignore.txt line by line and add it to a set
26
    Args:
27
        path: Path to directory containing ignore.txt file
28

  
29
    Returns:
30
        list of names contained in ignore.txt file
31
    """
14 32
    ignore_set = set()
15 33

  
16 34
    with open(path + "ignore.txt", "r") as file:
......
21 39
    return ignore_set
22 40

  
23 41

  
24
def update_ignore_set(path,file_name):
25

  
42
def update_ignore_set(path, file_name):
43
    """
44
    Adds file_name to the ignore file
45
    Args:
46
        path: Path to directory containing ignore.txt file
47
        file_name: name of file you want to add to ignore file
48
    """
26 49
    with open(path + "ignore.txt", "a") as file:
27 50
        file.write(file_name + '\n')
28 51

  
29 52

  
30
def get_devices_set(folder):
53
def get_devices_set(path):
54
    """
55
     Goes trough every not loaded file(not contained in ProcessedData/ignore.txt)
56
     Extracts names from not loaded file which should be in first column
57
     Creates set of unique devices_names
58

  
59
    Args:
60
        path: Path to Processed directory containing ignore.txt file
31 61

  
32
    files_in_dir = list_of_all_files(folder)
62
    Returns:
63
        set of unique names contained in not loaded files
64
    """
65
    files_in_dir = list_of_all_files(path)
33 66

  
34 67
    unique_names = set()
35 68

  
36 69
    for file_path in files_in_dir:
37
        with open(folder+file_path) as file:
38
            for line in file:
39
                array = line.split(";")
40
                name = array[0]
41
                unique_names.add(name)
70
        unique_names.add(CSVutils.get_unique_names_from_file(path+file_path, 0))
42 71

  
43 72
    return unique_names
44 73

  
45 74

  
46
def get_unknown_devices_set(config,devices):
75
def get_unknown_devices_set(config, devices):
76
    """
77
    Compares config and devices a return difference
78

  
79
    Args:
80
        config:  loaded configuration file of dataset
81
        devices: set of unique devices contained in dataset
82

  
83
    Returns:
84

  
85
    """
47 86
    devices_set = set(config["devices"].keys())
48 87
    unknown_devices_set = devices.difference(devices_set)
49 88

  
50 89
    return unknown_devices_set
51 90

  
52 91

  
53
def unzip_all_csv_zip_files_in_folder(folder):
54

  
55
    files_in_dir = os.listdir(folder)
92
def unzip_all_csv_zip_files_in_folder(path):
93
    """
94
    Load all files from directory and unzip those which end by .zip
95
    After unziping deletes the zip file
96
    Args:
97
        path: Path to CrawledData directory containing ignore.txt file
98
    """
99
    files_in_dir = os.listdir(path)
56 100
    zips = []
57 101

  
58 102
    for file in files_in_dir:
59 103
        if file.endswith(".zip"):
60
            zips.append(folder + file)
104
            zips.append(path + file)
61 105

  
62 106
    for zip_file in zips:
63 107

  
64 108
        with zipfile.ZipFile(zip_file, "r") as unziped_file:
65
            unziped_file.extractall(folder)
109
            unziped_file.extractall(path)
66 110

  
67 111
        os.remove(zip_file)
68 112

  
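The ignore.txt files are what make the pipeline incremental: every processed or loaded file name gets appended to them, and list_of_all_files returns the directory listing minus that set, so nothing is handled twice (the ignore file lists its own name as its first line, which keeps it out of processing). An illustration of the set arithmetic with invented file names:

# What list_of_all_files effectively computes for, say, CrawledData/JIS/:
files_in_dir = {"OD_ZCU_JIS_01.csv", "OD_ZCU_JIS_02.csv", "ignore.txt"}
ignore_set = {"ignore.txt", "OD_ZCU_JIS_01.csv"}   # lines of ignore.txt after one earlier run

print(files_in_dir - ignore_set)                   # {'OD_ZCU_JIS_02.csv'}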
python-module/main.py
1 1
import Pipeline
2 2
import os
3 3

  
4
# Path to configuration files
4 5
CONFIG_FILES_PATH = "DatasetConfigs/"
5 6

  
6 7

  
7 8
def run_pipeline_for_all_datasets():
9
    """
10
    Runs whole DataScript pipeline for every dataset that has existing configuration file
11
    """
8 12
    files_in_dir = os.listdir(CONFIG_FILES_PATH)
9 13

  
10 14
    for file in files_in_dir:
11
        Pipeline.run_full_pipeline(file)
15
        name = file.split('.')
16
        Pipeline.run_full_pipeline(name[0])
12 17

  
13 18

  
14 19
def run_pipeline_for_one_dataset(dataset_name):
20
    """
21
    Runs whole DataScript pipeline for only one dataset
22

  
23
    Args:
24
        dataset_name: name of dataset that has existing configuration file
25
    """
15 26
    Pipeline.run_full_pipeline(dataset_name)
16 27

  
17 28

  
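With the name-splitting fix above, every DatasetConfigs/*.yaml file is passed to the pipeline by its bare dataset name. The usual entry point for a single dataset, run from the python-module root so the relative paths resolve ("JIS" is one of the configured datasets):

import Pipeline

# Crawls, processes and loads one dataset, as described by run_full_pipeline's docstring.
Pipeline.run_full_pipeline("JIS")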

Also available: Unified diff