Revision c8f3051b

Added by Petr Hlaváč about 4 years ago

Re #7924 beta version of datascript

View differences:

.gitignore
.idea
dev-dump
*.lock
.vscode-server
*__pycache__*
*.CSV
python-module/venv
python-module/CrawledData/KOLOBEZKY/ignore.txt
ignore.txt
OD_ZCU_KOLOBEZKY_07_2019.CSV
OD_ZCU_KOLOBEZKY_06_2019.CSV
OD_ZCU_KOLOBEZKY_00_2019.CSV
OD_ZCU_KOLOBEZKY_08_2019.CSV
python-module/CrawlerLogs/KOLOBEZKY/ignore.txt
https://openstore.zcu.cz/OD_ZCU_00_2019/OD_ZCU_KOLOBEZKY_00_2019_CSV.zip
https://openstore.zcu.cz/OD_ZCU_06_2019/OD_ZCU_KOLOBEZKY_06_2019_CSV.zip
https://openstore.zcu.cz/OD_ZCU_07_2019/OD_ZCU_KOLOBEZKY_07_2019_CSV.zip
https://openstore.zcu.cz/OD_ZCU_08_2019/OD_ZCU_KOLOBEZKY_08_2019_CSV.zip
python-module/DatasetConfigs/KOLOBEZKY.yaml
# dataset name under which it will be displayed in the application
dataset-name: KOLOBEZKY
# positions of the individual devices contained in the dataset
devices:
  - stojan-knihovna:
      x: 13.353319
      y: 49.725145

  - stojan-Machovka:
      x: 13.368016
      y: 49.725197

  - stojan-FDU:
      x: 13.347909
      y: 49.725360

  - stojan-bory:
      x: 13.350861
      y: 49.724576

  - stojan-zcu:
      x: 13.365958
      y: 49.726033

  - stojan-borska:
      x: 13.359475
      y: 49.734518

# root folder that contains links to the dataset
url: https://openstore.zcu.cz/
# optional parameter that specifies the name pattern of the dataset files to download
regex: OD_ZCU_KOLOBEZKY_[0-9][0-9]_[0-9][0-9][0-9][0-9]_CSV.zip
# optional parameter that says how often to look for new datasets; if empty, the default value is used (days)
update-period: 24
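
For orientation, a minimal sketch of how this file is consumed (assuming the code runs from the python-module/ directory, as Pipeline.py expects); ConfigureLoader merges the devices list into one dictionary keyed by device name:

from Utilities import ConfigureLoader

config = ConfigureLoader.load_configuration("DatasetConfigs/KOLOBEZKY.yaml")
print(config["dataset-name"])                # KOLOBEZKY
print(config["devices"]["stojan-knihovna"])  # {'x': 13.353319, 'y': 49.725145}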
python-module/DatasetCrawler/KOLOBEZKYCrawler.py
from Utilities import FolderProcessor
from Utilities.Crawler import BasicCrawler


def crawl(config):

    dataset_name = config["dataset-name"]
    url = config['url']
    regex = config['regex']

    # collect the monthly archive folders linked from the root page
    first_level_links = BasicCrawler.get_all_links(url)
    filtered_first_level_links = BasicCrawler.filter_links(first_level_links, "^OD_ZCU")
    absolute_first_level_links = BasicCrawler.create_absolute_links(filtered_first_level_links, url)

    files = []

    # inside every archive folder, pick the zip files matching the dataset regex
    # and drop links that were already downloaded earlier
    for link in absolute_first_level_links:
        second_level_links = BasicCrawler.get_all_links(link)
        filtered_second_level_links = BasicCrawler.filter_links(second_level_links, regex)
        absolute_second_level_links = BasicCrawler.create_absolute_links(filtered_second_level_links, link)
        final_links = BasicCrawler.remove_downloaded_links(absolute_second_level_links, dataset_name)

        for file_link in final_links:
            files.append(file_link)

    # download the new archives and unpack the CSV files they contain
    for file in files:
        BasicCrawler.download_file_from_url(file, "CrawledData/" + dataset_name + "/", dataset_name)

    FolderProcessor.unzip_all_csv_zip_files_in_folder("CrawledData/" + dataset_name + "/")
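
A hedged usage sketch; it needs network access to openstore.zcu.cz and the CrawledData/KOLOBEZKY/ and CrawlerLogs/KOLOBEZKY/ folders created by Scripts/PrepareNewDataset.py:

from DatasetCrawler import KOLOBEZKYCrawler
from Utilities import ConfigureLoader

config = ConfigureLoader.load_configuration("DatasetConfigs/KOLOBEZKY.yaml")
KOLOBEZKYCrawler.crawl(config)   # downloads any new zip archives and unpacks their CSV files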
python-module/DatasetProcessing/KOLOBEZKYProcessor.py
from Utilities.CSV import CSVDataLine, CSVutils
from Utilities import DateFormating


def process_file(filename):

    with open(filename, "r") as file:

        date_dict = dict()

        for line in file:

            array = line.split(";")

            # drop the first and last character (the quotes around the field)
            # and normalise the timestamp with date_time_formater
            date = DateFormating.date_time_formater(array[0][1:-1])
            name = array[1][1:-1]

            if date not in date_dict:
                date_dict[date] = dict()

            # count one occurrence per (date, device name) pair
            if name in date_dict[date]:
                date_dict[date][name].occurence += 1
            else:
                date_dict[date][name] = CSVDataLine.CSVDataLine(name, date, 1)

        CSVutils.export_data_to_csv(filename, date_dict)
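
For illustration, a hedged sketch of how the processor is invoked; the raw line format in the comment is an assumption based on the quote stripping and ';' splitting above, and the file path is hypothetical:

from DatasetProcessing import KOLOBEZKYProcessor

# assumed raw line: "06.06.2019 10:23:51";"stojan-knihovna";...
# -> one occurrence counted under date "0606201910" and name "stojan-knihovna"
KOLOBEZKYProcessor.process_file("CrawledData/KOLOBEZKY/OD_ZCU_KOLOBEZKY_06_2019.CSV")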
python-module/Pipeline.py
from Utilities import FolderProcessor, ConfigureLoader
from Utilities.Database import DatabaseLoader


CONFIG_FILES_PATH = "DatasetConfigs/"
CRAWLED_DATA_PATH = "CrawledData/"
PROCESSED_DATA_PATH = "ProcessedData/"
CRAWLER_LIB_PATH = "DatasetCrawler."
PROCESSOR_LIB_PATH = "DatasetProcessing."


def crawl_data(config):

    dataset_name = config["dataset-name"]

    # import DatasetCrawler.<dataset>Crawler dynamically and run its crawl function
    my_function = __import__(CRAWLER_LIB_PATH + dataset_name + "Crawler", globals(), locals(), ['crawl']).crawl
    my_function(config)


def process_data(dataset_name):
    dataset_path = dataset_name + '/'

    # import DatasetProcessing.<dataset>Processor dynamically
    process_file_func = __import__(PROCESSOR_LIB_PATH + dataset_name + "Processor", globals(), locals(),
                                   ['process_file']).process_file

    # get all not yet processed files from the dataset
    not_processed_files = FolderProcessor.list_of_all_files(CRAWLED_DATA_PATH + dataset_path)

    # process every file
    for not_processed_file in not_processed_files:
        # call processing for one file in the dataset
        process_file_func(CRAWLED_DATA_PATH + dataset_path + not_processed_file)
        FolderProcessor.update_ignore_set(CRAWLED_DATA_PATH + dataset_path, not_processed_file)


def load_data_to_database(config):

    dataset_name = config["dataset-name"]
    dataset_path = dataset_name + '/'

    # get all not yet loaded files from the dataset
    not_loaded_files = FolderProcessor.list_of_all_files(PROCESSED_DATA_PATH + dataset_path)

    # load every file
    for not_loaded_file in not_loaded_files:
        # read the processed data
        processed_data = DatabaseLoader.get_data_from_file(PROCESSED_DATA_PATH + dataset_path + not_loaded_file,
                                                           config["devices"])
        # store the processed data in the database
        DatabaseLoader.load_data_to_database(dataset_name, processed_data)
        FolderProcessor.update_ignore_set(PROCESSED_DATA_PATH + dataset_path, not_loaded_file)


def run_full_pipeline(dataset_name):
    config = ConfigureLoader.load_configuration(CONFIG_FILES_PATH + dataset_name)
    crawl_data(config)
    process_data(config["dataset-name"])
    load_data_to_database(config)
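
A usage sketch; this mirrors what main.py does for every file in DatasetConfigs/, and it assumes a database reachable by DatabaseLoader (presumably MongoDB, given the pinned pymongo):

import Pipeline

# the argument is the config file name including the extension,
# exactly as main.py obtains it from os.listdir("DatasetConfigs/")
Pipeline.run_full_pipeline("KOLOBEZKY.yaml")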
python-module/ProcessedData/KOLOBEZKY/ignore.txt
ignore.txt
OD_ZCU_KOLOBEZKY_07_2019.CSV
OD_ZCU_KOLOBEZKY_06_2019.CSV
OD_ZCU_KOLOBEZKY_00_2019.CSV
OD_ZCU_KOLOBEZKY_08_2019.CSV
python-module/Scripts/PrepareNewDataset.py
import os

CRAWLED_DATA_PATH = "../CrawledData/"
PROCESSED_DATA_PATH = "../ProcessedData/"
CRAWLER_LOGS_PATH = "../CrawlerLogs/"
CRAWLER_PROGRAM_PATH = "../DatasetCrawler"
PROCESSOR_PROGRAM_PATH = "../DatasetProcessing"
CONFIG_FILES_PATH = "../DatasetConfigs"


def create_default_config_file(dataset_name):

    with open(CONFIG_FILES_PATH + "/" + dataset_name + ".yaml", "w") as file:
        file.write("# dataset name under which it will be displayed in the application\n")
        file.write("dataset-name: " + dataset_name + "\n")
        file.write("# positions of the individual devices contained in the dataset\n")
        file.write("devices:\n")
        file.write("  - example1:\n")
        file.write("      x: 12.3\n")
        file.write("      y: 32.1\n")
        file.write("\n")
        file.write("  - example2:\n")
        file.write("      x: 32.1\n")
        file.write("      y: 12.3\n")
        file.write("\n")
        file.write("# root folder that contains links to the dataset\n")
        file.write("url: INSERT URL HERE/\n")
        file.write("# optional parameter that specifies the name pattern of the dataset files to download\n")
        file.write("regex: INSERT REGEX HERE\n")
        file.write("# optional parameter that says how often to look for new datasets; if empty, "
                   "the default value is used (days)\n")
        file.write("update-period: INSERT VALUE HERE\n")


def create_default_processor(dataset_name):
    with open(PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "Processor.py", "w") as file:
        file.write("def process_file(filename):\n")
        file.write("    print(\"You must implement the process_file method first!\")\n")


def create_default_crawler(dataset_name):

    with open(CRAWLER_PROGRAM_PATH + "/" + dataset_name + "Crawler.py", "w") as file:
        file.write("def crawl(config):\n")
        file.write("    print(\"You must implement the crawl method first!\")\n")


def create_ignore_file(path, text):

    with open(path + "/ignore.txt", "w") as file:
        if text is not None:
            file.write(text + "\n")


def prepare_dataset_structure(dataset_name):

    # create folder for crawled data
    try:
        path = CRAWLED_DATA_PATH + dataset_name
        os.mkdir(path)
        create_ignore_file(path, "ignore.txt")
    except OSError as e:
        print(e)
        print("Creation of the directory %s failed" % path)

    # create folder for processed data
    try:
        path = PROCESSED_DATA_PATH + dataset_name
        os.mkdir(path)
        create_ignore_file(path, "ignore.txt")
    except OSError:
        print("Creation of the directory %s failed" % path)

    # create folder for crawler logs
    try:
        path = CRAWLER_LOGS_PATH + dataset_name
        os.mkdir(path)
        create_ignore_file(path, None)
    except OSError:
        print("Creation of the directory %s failed" % path)

    create_default_crawler(dataset_name)
    create_default_processor(dataset_name)
    create_default_config_file(dataset_name)


prepare_dataset_structure("JIS")
python-module/Utilities/CSV/CSVDataLine.py
class CSVDataLine:

    def __init__(self, name, date, occurence):
        self.name = name
        self.date = date
        self.occurence = occurence

    def to_csv(self):
        return self.name + ";" + str(self.occurence) + ";" + self.date
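
A one-line illustration of the CSV serialisation (the values are hypothetical):

from Utilities.CSV import CSVDataLine

line = CSVDataLine.CSVDataLine("stojan-knihovna", "0606201910", 3)
print(line.to_csv())   # stojan-knihovna;3;0606201910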
python-module/Utilities/CSV/CSVutils.py
PROCESSED_DATA_PATH = "ProcessedData/"


def get_unique_names_from_file(filename, column_number):
    f = open(filename, "r")

    # create set of unique names
    name_set = set()

    # go through every line of the file
    for x in f:
        # split by the csv separator ;
        array = x.split(";")
        # add the string from the chosen column to the set
        name_set.add(array[column_number])

    f.close()

    return name_set


def export_data_to_csv(filename, data_dict):
    # filename[12:] strips the leading "CrawledData/" prefix (12 characters),
    # so the output lands under ProcessedData/ with the same relative path
    with open(PROCESSED_DATA_PATH + filename[12:], "w+") as file:

        for date in data_dict:
            for data in data_dict[date]:
                file.write(data_dict[date][data].to_csv() + '\n')
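
A minimal sketch of the path handling in export_data_to_csv (the input path is hypothetical and the ProcessedData/KOLOBEZKY/ folder must already exist):

from Utilities.CSV import CSVDataLine, CSVutils

date_dict = {"0606201910": {"stojan-knihovna": CSVDataLine.CSVDataLine("stojan-knihovna", "0606201910", 3)}}

# "CrawledData/" is 12 characters, so the result is written to
# ProcessedData/KOLOBEZKY/OD_ZCU_KOLOBEZKY_06_2019.CSV
CSVutils.export_data_to_csv("CrawledData/KOLOBEZKY/OD_ZCU_KOLOBEZKY_06_2019.CSV", date_dict)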
python-module/Utilities/ConfigureLoader.py
import yaml


def load_configuration(configure_file_name):

    with open(configure_file_name) as f:
        data = yaml.load(f, Loader=yaml.FullLoader)

    # merge the list of single-device dictionaries into one dictionary keyed by device name
    devices_dic = dict()

    for item in data["devices"]:
        devices_dic.update(item)

    data["devices"] = devices_dic

    return data
python-module/Utilities/Crawler/BasicCrawler.py
import requests
import re
from Utilities import FolderProcessor
from bs4 import BeautifulSoup


def get_all_links(url):
    # create response object
    r = requests.get(url)

    # create beautiful-soup object
    soup = BeautifulSoup(r.content, 'html5lib')
    links = []

    for link in soup.findAll('a'):
        links.append(link.get('href'))

    return links


def filter_links(links, regex):
    filtered_links = []

    for link in links:
        if re.search(regex, link):
            filtered_links.append(link)

    return filtered_links


def create_absolute_links(links, archive):
    absolute_links = []

    for link in links:
        absolute_links.append(archive + link)

    return absolute_links


def remove_downloaded_links(links, dataset_name):

    downloaded_links = FolderProcessor.load_ignore_set("CrawlerLogs/" + dataset_name + "/")
    final_links = set(links) - downloaded_links

    return final_links


def download_file_from_url(url, path, dataset_name):
    r = requests.get(url, stream=True)

    url_parts = url.split("/")
    file_name = url_parts[len(url_parts) - 1]

    with open(path + file_name, "wb") as file:
        for chunk in r.iter_content(chunk_size=1024):

            # write one chunk at a time to the file
            if chunk:
                file.write(chunk)

    FolderProcessor.update_ignore_set("CrawlerLogs/" + dataset_name + "/", url)
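
A small usage sketch chaining the helpers above, mirroring KOLOBEZKYCrawler (hedged: it performs real HTTP requests; the URL and filter come from the KOLOBEZKY config):

from Utilities.Crawler import BasicCrawler

url = "https://openstore.zcu.cz/"
links = BasicCrawler.get_all_links(url)                       # all <a href> values on the page
archives = BasicCrawler.filter_links(links, "^OD_ZCU")        # keep the monthly archive folders
absolute = BasicCrawler.create_absolute_links(archives, url)  # prefix them with the root url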
python-module/Utilities/DateFormating.py
def date_formater(string_date):
    # when the character at index 11 is a space, rebuild the string with '0' at index 10
    # (handles timestamps that are missing a digit)
    if string_date[11].isspace():
        pos = 0
        srr = ""
        for i in string_date:

            if pos == 10:
                srr = srr + '0'
            else:
                srr = srr + i

            pos = pos + 1

        string_date = srr

    # assumes dd.mm.yyyy input; returns ddmmyyyy
    return_date = string_date[:2] + string_date[3:5] + string_date[6:10]

    return return_date


def date_time_formater(string_date):
    # when the character at index 11 is a space, rebuild the string with '0' at index 10
    # (handles timestamps that are missing a digit)
    if string_date[11].isspace():
        pos = 0
        srr = ""
        for i in string_date:

            if pos == 10:
                srr = srr + '0'
            else:
                srr = srr + i

            pos = pos + 1

        string_date = srr

    # assumes dd.mm.yyyy hh:mm:ss input; returns ddmmyyyyhh
    return_date = string_date[:2] + string_date[3:5] + string_date[6:10] + string_date[11:13]

    return return_date
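
A small sketch, assuming source timestamps of the form dd.mm.yyyy hh:mm:ss (the sample value is hypothetical):

from Utilities import DateFormating

print(DateFormating.date_formater("07.08.2019 09:15:00"))       # 07082019
print(DateFormating.date_time_formater("07.08.2019 09:15:00"))  # 0708201909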
python-module/Utilities/FolderProcessor.py
import os
import zipfile


def list_of_all_files(path):
    files_in_dir = os.listdir(path)

    ignore_set = load_ignore_set(path)

    # return only the files that are not listed in ignore.txt
    return set(files_in_dir).difference(ignore_set)


def load_ignore_set(path):
    ignore_set = set()

    with open(path + "ignore.txt", "r") as file:

        for line in file:
            ignore_set.add(line[:-1])

    return ignore_set


def update_ignore_set(path, file_name):

    with open(path + "ignore.txt", "a") as file:
        file.write(file_name + '\n')


def unzip_all_csv_zip_files_in_folder(folder):

    files_in_dir = os.listdir(folder)
    zips = []

    for file in files_in_dir:
        if file.endswith(".zip"):
            zips.append(folder + file)

    for zip_file in zips:

        with zipfile.ZipFile(zip_file, "r") as unziped_file:
            unziped_file.extractall(folder)

        os.remove(zip_file)
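
A minimal sketch of the ignore-set bookkeeping (the folder is hypothetical and must already contain an ignore.txt, which Scripts/PrepareNewDataset.py creates):

from Utilities import FolderProcessor

path = "CrawledData/KOLOBEZKY/"

# mark one file as handled; it then disappears from the work list
FolderProcessor.update_ignore_set(path, "OD_ZCU_KOLOBEZKY_06_2019.CSV")
print(FolderProcessor.list_of_all_files(path))  # every file in the folder except those listed in ignore.txt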
python-module/main.py
import Pipeline
import os

CONFIG_FILES_PATH = "DatasetConfigs/"


def run_pipeline_for_all_datasets():
    files_in_dir = os.listdir(CONFIG_FILES_PATH)

    for file in files_in_dir:
        Pipeline.run_full_pipeline(file)


def run_pipeline_for_one_dataset(dataset_name):
    Pipeline.run_full_pipeline(dataset_name)


run_pipeline_for_all_datasets()
python-module/requirements.txt
beautifulsoup4==4.9.0
certifi==2020.4.5.1
chardet==3.0.4
html5lib==1.0.1
idna==2.9
pymongo==3.10.1
PyYAML==5.3.1
requests==2.23.0
six==1.14.0
soupsieve==2.0
urllib3==1.25.9
webencodings==0.5.1
