"""Scaffolding utilities for a new dataset: creates the folder structure and
default crawler, processor and YAML configuration skeletons."""
import os

# Path to crawled data
CRAWLED_DATA_PATH = "CrawledData/"
# Path to processed data
PROCESSED_DATA_PATH = "ProcessedData/"
# Path to crawler logs
CRAWLER_LOGS_PATH = "CrawlerLogs/"
# Path for DatasetCrawlers implementations
CRAWLER_PROGRAM_PATH = "DatasetCrawler"
# Path for DatasetProcessors implementations
PROCESSOR_PROGRAM_PATH = "DatasetProcessing"
# Path to dataset configuration files
CONFIG_FILES_PATH = "DatasetConfigs"
def create_default_config_file(dataset_name):
    """
    Create a default YAML configuration skeleton for a new dataset.

    The file is written to CONFIG_FILES_PATH/<dataset_name>.yaml and contains
    commented placeholder entries (comments are in Czech, as in the rest of
    the generated files).

    Args:
        dataset_name: Name of newly created dataset
    """
    # NOTE(review): the key "dataset-name" is emitted twice below; in YAML the
    # second occurrence overrides the first — confirm whether the second entry
    # was meant to use a different key.
    config_lines = [
        "# jmeno datasetu, pod kterym bude zobrazen v aplikaci\n",
        "dataset-name: " + dataset_name + "\n",
        "# jednoslovný název datasetu, pod kterym bude reprezentovana v architektuře\n",
        "dataset-name: " + dataset_name + "\n",
        "# root slozka, ktera obsahuje odkazy na dataset\n",
        "url: ZDE VLOZTE URL\n",
        "# volitelny parameter, ktery specifikuje vzor jmrna datasetu, ktera se budou stahovat\n",
        "regex: ZDE VLOZTE REGEX\n",
        "# volitelny parametr, ktery udava jak casto se budou hledat nove datasety, pokud prazdne, "
        "tak defaultni hodnota (dny)\n",
        "update-period: ZDE VLOZTE HODNOTU\n",
        "# pozice jednotlivych zarizeni, ktera jsou v datasetu\n",
        "devices:\n",
    ]
    with open(CONFIG_FILES_PATH + "/" + dataset_name + ".yaml", "w") as config_file:
        config_file.writelines(config_lines)
def create_default_processor(dataset_name):
    """
    Create a default processor skeleton for a dataset.

    Writes PROCESSOR_PROGRAM_PATH/<dataset_name>Processor.py containing a stub
    process_file function that the user is expected to implement.

    Args:
        dataset_name: Name of newly created dataset
    """
    # Generated source, assembled as a list of lines. The text below is the
    # program's output and is reproduced exactly (including the missing
    # newline after the import line in the original).
    stub_lines = [
        "from Utilities.CSV import CSVDataLine",
        "\n",
        "\n",
        "def process_file(filename):\n",
        "    \"\"\"\n",
        "    Method that take path to crawled file and outputs date dictionary:\n",
        "    Date dictionary is a dictionary where keys are dates in format ddmmYYYYhh (0804201815)\n",
        "    and value is dictionary where keys devices (specified in configuration file)\n",
        "    and value is CSVDataLine.CSVDataLine with device,date and occurrence\n",
        "\n",
        "    Args:\n",
        "    filename: name of processed file\n",
        "\n",
        "    Returns:\n",
        "    None if not implemented\n",
        "    date_dict when implemented\n",
        "    \"\"\"\n",
        "    date_dict = dict()\n",
        "\n",
        "    #with open(filename, \"r\") as file:\n",
        "    print(\"You must implements process_file method first!\")\n",
        "    return None\n",
    ]
    with open(PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "Processor.py", "w") as processor_file:
        processor_file.writelines(stub_lines)
def create_default_crawler(dataset_name):
    """
    Create a default crawler skeleton for a dataset.

    Writes CRAWLER_PROGRAM_PATH/<dataset_name>Crawler.py containing a stub
    crawl function that the user is expected to implement.

    Args:
        dataset_name: Name of newly created dataset
    """
    # Generated source, assembled as a list of lines and written in one call.
    # The text is the program's output and is reproduced exactly.
    stub_lines = [
        "# Path to crawled data\n",
        "CRAWLED_DATA_PATH = \"CrawledData/\"\n",
        "\n",
        "\n",
        "def crawl(config):\n",
        "    \"\"\"\n",
        "    Implement crawl method that downloads new data to path_for_files\n",
        "    For keeping the project structure\n",
        "    url , regex, and dataset_name from config\n",
        "    You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py\n",
        "\n",
        "    Args:\n",
        "        config: loaded configuration file of dataset\n",
        "    \"\"\"\n",
        "    dataset_name = config[\"dataset-name\"]\n",
        "    url = config['url']\n",
        "    regex = config['regex']\n",
        "    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'\n",
        "    print(\"You must implements Crawl method first!\")\n",
    ]
    with open(CRAWLER_PROGRAM_PATH + "/" + dataset_name + "Crawler.py", "w") as crawler_file:
        crawler_file.writelines(stub_lines)
def create_ignore_file(path, text):
    """
    Create (or truncate) an ignore.txt file in the given directory.

    Args:
        path: path to directory for creating ignore.txt
        text: text placed on the first line of ignore.txt; may be None,
            in which case the file is created empty
    """
    # Decide the full content up front, then write it in a single call.
    content = "" if text is None else text + "\n"
    with open(path + "/ignore.txt", "w") as ignore_file:
        ignore_file.write(content)
def create_updated_file(path):
    """
    Create an updated.txt file initialised to zero in the given directory.

    Args:
        path: path to directory for creating updated.txt
    """
    with open(path + "/updated.txt", "w") as updated_file:
        updated_file.write("0\n")
def prepare_dataset_structure(dataset_name):
    """
    Prepares folders for a new dataset and generates its default files.

    Creates the crawled-data, processed-data and crawler-log directories,
    seeds them with ignore/updated marker files, then writes the default
    crawler, processor and configuration skeletons.

    Args:
        dataset_name: Name of newly created dataset
    """

    def _make_dir(path):
        # Unified directory creation: the original mixed `os.error` and
        # `OSError` (they are aliases) and only printed the exception in one
        # of the three branches; all failures now report consistently.
        try:
            os.mkdir(path)
            return True
        except OSError as e:
            print(e)
            print("Creation of the directory %s failed" % path)
            return False

    # create folder for crawled data
    path = CRAWLED_DATA_PATH + dataset_name
    if _make_dir(path):
        # NOTE(review): this writes the literal text "ignore.txt" as the file
        # content, not a filename list — confirm that is intended.
        create_ignore_file(path, "ignore.txt")

    # create folder for processed data
    path = PROCESSED_DATA_PATH + dataset_name
    if _make_dir(path):
        create_ignore_file(path, "ignore.txt")

    # create folder for crawler logs
    path = CRAWLER_LOGS_PATH + dataset_name
    if _make_dir(path):
        create_ignore_file(path, None)
        create_updated_file(path)

    create_default_crawler(dataset_name)
    create_default_processor(dataset_name)
    create_default_config_file(dataset_name)
# Entry point: ask the user for the new dataset's name and scaffold it.
# Guarded so that importing this module no longer triggers interactive I/O.
if __name__ == "__main__":
    print("Zadejte jméno nového datasetu:\n")
    prepare_dataset_structure(input())