Projekt

Obecné

Profil

Stáhnout (5.95 KB) Statistiky
| Větev: | Revize:
1 c8f3051b petrh
import os

# Path to crawled data
CRAWLED_DATA_PATH = "../CrawledData/"
# Path to processed data
PROCESSED_DATA_PATH = "../ProcessedData/"
# Path to crawler logs
CRAWLER_LOGS_PATH = "../CrawlerLogs/"
# Path for DatasetCrawlers implementations
CRAWLER_PROGRAM_PATH = "../DatasetCrawler"
# Path for DatasetProcessors implementations
PROCESSOR_PROGRAM_PATH = "../DatasetProcessing"
# Path to dataset configuration files
CONFIG_FILES_PATH = "../DatasetConfigs"
def create_default_config_file(dataset_name):
    """
    Creates a default configuration file for a newly created dataset.

    The file is written to CONFIG_FILES_PATH as "<dataset_name>.yaml" and
    contains placeholder values (with explanatory comments in Czech) that
    the user must fill in before the dataset can be crawled.

    Args:
        dataset_name: Name of newly created dataset
    """
    content = [
        "# jmeno datasetu, pod kterym bude zobrazen v aplikaci\n",
        "dataset-name: " + dataset_name + "\n",
        "# root slozka, ktera obsahuje odkazy na dataset\n",
        "url: ZDE VLOZTE URL\n",
        # typo fixes vs. original output: "parameter" -> "parametr", "jmrna" -> "jmena"
        "# volitelny parametr, ktery specifikuje vzor jmena datasetu, ktera se budou stahovat\n",
        "regex: ZDE VLOZTE REGEX\n",
        "# volitelny parametr, ktery udava jak casto se budou hledat nove datasety, pokud prazdne, "
        "tak defaultni hodnota (dny)\n",
        "update-period: ZDE VLOZTE HODNOTU\n",
        "# pozice jednotlivych zarizeni, ktera jsou v datasetu\n",
        "devices:\n",
    ]
    with open(CONFIG_FILES_PATH + "/" + dataset_name + ".yaml", "w") as file:
        file.writelines(content)
def create_default_processor(dataset_name):
    """
    Creates a default processor skeleton for a dataset.

    Writes "<dataset_name>Processor.py" into PROCESSOR_PROGRAM_PATH containing
    a stub process_file function that prints a reminder and returns False
    until the user implements it.

    Args:
        dataset_name: Name of newly created dataset
    """
    # Generated source, written verbatim (lines below are the target file's code).
    content = [
        "from Utilities.CSV import CSVDataLine, CSVutils",
        "\n",
        "\n",
        "def process_file(filename):\n",
        "    \"\"\"\n",
        "    Method that take path to crawled file and outputs date dictionary using method:\n",
        "    CSVutils.export_data_to_csv(filename, date_dict)\n",
        "    Date dictionary is a dictionary where keys are dates in format ddmmYYYYhh (0804201815)\n",
        "    and value is dictionary where keys devices (specified in configuration file)\n",
        "    and value is CSVDataLine.CSVDataLine with device,date and occurrence\n",
        "\n",
        "    Args:\n",
        "    filename: name of processed file\n",
        "\n",
        "    Returns:\n",
        "    False if not implemented\n",
        "    True when implemented\n",
        "    \"\"\"\n",
        "    print(\"You must implements process_file method first!\")\n",
        "    #CSVutils.export_data_to_csv(filename, date_dict)\n",
        "    return False\n",
    ]
    with open(PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "Processor.py", "w") as file:
        file.writelines(content)
def create_default_crawler(dataset_name):
    """
    Creates a default crawler skeleton for a dataset.

    Writes "<dataset_name>Crawler.py" into CRAWLER_PROGRAM_PATH containing a
    stub crawl(config) function that reads the dataset name, url and regex
    from the configuration and prints a reminder until implemented.

    Args:
        dataset_name: Name of newly created dataset
    """
    # Generated source, written verbatim (lines below are the target file's code).
    content = [
        "# Path to crawled data\n",
        "CRAWLED_DATA_PATH = \"CrawledData/\"\n",
        "\n",
        "\n",
        "def crawl(config):\n",
        "    \"\"\"\n",
        "    Implement crawl method that downloads new data to path_for_files\n",
        "    For keeping the project structure\n",
        "    url , regex, and dataset_name from config\n",
        "    You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py\n",
        "\n",
        "    Args:\n",
        "        config: loaded configuration file of dataset\n",
        "    \"\"\"\n",
        "    dataset_name = config[\"dataset-name\"]\n",
        "    url = config['url']\n",
        "    regex = config['regex']\n",
        "    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'\n",
        "    print(\"You must implements Crawl method first!\")\n",
    ]
    with open(CRAWLER_PROGRAM_PATH + "/" + dataset_name + "Crawler.py", "w") as file:
        file.writelines(content)
def create_ignore_file(path, text):
    """
    Creates an ignore.txt file in the given directory.

    Args:
        path: path to directory for creating ignore.txt
        text: text that will be on the first line of ignore.txt; may be None,
              in which case an empty file is created
    """
    with open(path + "/ignore.txt", "w") as file:
        if text is not None:
            file.write(text + "\n")
def create_updated_file(path):
    """
    Creates an updated.txt counter file initialized to zero.

    Args:
        path: path to directory for creating updated.txt
    """
    with open(path + "/updated.txt", "w") as file:
        # str(0) in the original is just "0"; also fixes the over-indented body
        file.write("0\n")
def prepare_dataset_structure(dataset_name):
    """
    Prepares the folder structure for a new dataset.

    Creates per-dataset directories under CRAWLED_DATA_PATH,
    PROCESSED_DATA_PATH and CRAWLER_LOGS_PATH (each with an ignore file,
    the logs folder also with an updated counter file), then generates the
    default crawler, processor and configuration files.

    Directory-creation failures (e.g. the folder already exists) are
    reported to stdout but do not abort the remaining steps.

    Args:
        dataset_name: Name of newly created dataset
    """
    # create folder for crawled data
    try:
        path = CRAWLED_DATA_PATH + dataset_name
        os.mkdir(path)
        create_ignore_file(path, "ignore.txt")
    except OSError as e:  # os.error in the original is a deprecated alias of OSError
        print(e)
        print("Creation of the directory %s failed" % path)

    # create folder for processed data
    try:
        path = PROCESSED_DATA_PATH + dataset_name
        os.mkdir(path)
        create_ignore_file(path, "ignore.txt")
    except OSError as e:
        print(e)
        print("Creation of the directory %s failed" % path)

    # create folder for crawler logs
    try:
        path = CRAWLER_LOGS_PATH + dataset_name
        os.mkdir(path)
        create_ignore_file(path, None)
        create_updated_file(path)
    except OSError as e:
        print(e)
        print("Creation of the directory %s failed" % path)

    create_default_crawler(dataset_name)
    create_default_processor(dataset_name)
    create_default_config_file(dataset_name)
# Guarded entry point: the original called prepare_dataset_structure("TEST")
# unconditionally, which ran the scaffolding as a side effect of importing
# this module. The guard keeps the script behavior while making imports safe.
if __name__ == "__main__":
    prepare_dataset_structure("TEST")