import os

# Path to crawled data
CRAWLED_DATA_PATH = "CrawledData/"
# Path to processed data
PROCESSED_DATA_PATH = "ProcessedData/"
# Path to crawler logs
CRAWLER_LOGS_PATH = "CrawlerLogs/"
# Path for DatasetCrawlers implementations
CRAWLER_PROGRAM_PATH = "DatasetCrawler"
# Path for DatasetProcessors implementations
PROCESSOR_PROGRAM_PATH = "DatasetProcessing"
# Path to dataset configuration files
CONFIG_FILES_PATH = "DatasetConfigs"


def create_default_config_file(dataset_name):
    """
    Creates default config file

    Args:
        dataset_name: Name of newly created dataset
    """
    with open(CONFIG_FILES_PATH + "/" + dataset_name + ".yaml", "w") as file:
        file.write("# dataset name under which it is displayed in the application\n")
        file.write("dataset-name: " + dataset_name + "\n")
        file.write("# one-word dataset name under which it is represented in the architecture\n")
        file.write("dataset-name: " + dataset_name + "\n")
        file.write("# root folder that contains links to the dataset\n")
        file.write("url: INSERT URL HERE\n")
        file.write("# optional parameter that specifies the pattern of dataset names to download\n")
        file.write("regex: INSERT REGEX HERE\n")
        file.write("# optional parameter that sets how often to look for new datasets; if empty, "
                   "the default value is used (days)\n")
        file.write("update-period: INSERT VALUE HERE\n")
        file.write("# positions of the individual devices present in the dataset\n")
        file.write("devices:\n")


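# For illustration, calling create_default_config_file("WifiLogs") (a
# hypothetical dataset name, not part of this project) produces
# DatasetConfigs/WifiLogs.yaml roughly as:
#
#   dataset-name: WifiLogs
#   dataset-name: WifiLogs
#   url: INSERT URL HERE
#   regex: INSERT REGEX HERE
#   update-period: INSERT VALUE HERE
#   devices:
#
# Note that the generated file repeats the dataset-name key (the display name
# and the one-word architecture name share a key); strict YAML loaders reject
# duplicate keys, so one of them may need renaming before parsing.

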
def create_default_processor(dataset_name):
    """
    Creates default processor for dataset

    Args:
        dataset_name: Name of newly created dataset
    """
    with open(PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "Processor.py", "w") as file:
        file.write("from Utilities.CSV import CSVDataLine\n")
        file.write("\n")
        file.write("\n")
        file.write("def process_file(filename):\n")
        file.write("    \"\"\"\n")
        file.write("    Method that takes a path to a crawled file and outputs a date dictionary:\n")
        file.write("    the date dictionary's keys are dates in format ddmmYYYYhh (0804201815)\n")
        file.write("    and each value is a dictionary whose keys are devices (specified in the configuration file)\n")
        file.write("    and whose values are CSVDataLine.CSVDataLine with device, date and occurrence\n")
        file.write("\n")
        file.write("    Args:\n")
        file.write("    filename: name of the processed file\n")
        file.write("\n")
        file.write("    Returns:\n")
        file.write("    None if not implemented\n")
        file.write("    date_dict when implemented\n")
        file.write("    \"\"\"\n")
        file.write("    date_dict = dict()\n")
        file.write("\n")
        file.write("    # with open(filename, \"r\") as file:\n")
        file.write("    print(\"You must implement the process_file method first!\")\n")
        file.write("    return None\n")


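# A minimal sketch of what a finished processor might look like, assuming the
# crawled file holds semicolon-separated "device;date;occurrence" lines and
# that CSVDataLine.CSVDataLine takes (device, date, occurrence) -- both are
# assumptions made for illustration; real crawled formats will differ.
def _example_process_file(filename):
    from Utilities.CSV import CSVDataLine  # project helper assumed importable

    date_dict = dict()
    with open(filename, "r") as file:
        for line in file:
            device, date, occurrence = line.strip().split(";")
            # Group lines by date, then by device, as the generated
            # docstring above describes.
            if date not in date_dict:
                date_dict[date] = dict()
            date_dict[date][device] = CSVDataLine.CSVDataLine(device, date, int(occurrence))
    return date_dict

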
def create_default_crawler(dataset_name):
    """
    Creates default crawler for dataset

    Args:
        dataset_name: Name of newly created dataset
    """
    with open(CRAWLER_PROGRAM_PATH + "/" + dataset_name + "Crawler.py", "w") as file:
        file.write("# Path to crawled data\n")
        file.write("CRAWLED_DATA_PATH = \"CrawledData/\"\n")
        file.write("\n")
        file.write("\n")
        file.write("def crawl(config):\n")
        file.write("    \"\"\"\n")
        file.write("    Implement a crawl method that downloads new data to path_for_files\n")
        file.write("    to keep the project structure.\n")
        file.write("    url, regex and dataset_name are taken from config.\n")
        file.write("    You can use the already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py\n")
        file.write("\n")
        file.write("    Args:\n")
        file.write("        config: loaded configuration file of dataset\n")
        file.write("    \"\"\"\n")
        file.write("    dataset_name = config[\"dataset-name\"]\n")
        file.write("    url = config['url']\n")
        file.write("    regex = config['regex']\n")
        file.write("    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'\n")
        file.write("    print(\"You must implement the crawl method first!\")\n")


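# A minimal sketch of what a finished crawl method might do. The project's own
# helpers in Utilities/Crawler/BasicCrawlerFunctions.py are the intended tools,
# but their API is not shown here, so this illustration uses only the standard
# library and assumes the configured regex matches absolute file URLs.
def _example_crawl(config):
    import re
    import urllib.request

    dataset_name = config["dataset-name"]
    url = config["url"]
    regex = config["regex"]
    path_for_files = CRAWLED_DATA_PATH + dataset_name + "/"
    # Download the root page and fetch every link that matches the regex.
    page = urllib.request.urlopen(url).read().decode("utf-8")
    for link in re.findall(regex, page):
        urllib.request.urlretrieve(link, path_for_files + link.rsplit("/", 1)[-1])

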
def create_ignore_file(path, text):
    """
    Creates ignore file
    Args:
        path: path to directory for creating ignore.txt
        text: text that will be on the first line of ignore.txt; can be None
    """
    with open(path + "/ignore.txt", "w") as file:
        if text is not None:
            file.write(text + "\n")


def create_updated_file(path):
    """
    Creates updated file
    Args:
        path: path to directory for creating updated.txt
    """
    with open(path + "/updated.txt", "w") as file:
        file.write(str(0) + "\n")


def prepare_dataset_structure(dataset_name):
    """
    Prepares folders for new dataset
    Args:
        dataset_name: Name of newly created dataset
    """
    # create folder for crawled data
    try:
        path = CRAWLED_DATA_PATH + dataset_name
        os.mkdir(path)
        create_ignore_file(path, "ignore.txt")
    except OSError as e:
        print(e)
        print("Creation of the directory %s failed" % path)

    # create folder for processed data
    try:
        path = PROCESSED_DATA_PATH + dataset_name
        os.mkdir(path)
        create_ignore_file(path, "ignore.txt")
    except OSError:
        print("Creation of the directory %s failed" % path)

    # create folder for crawler logs
    try:
        path = CRAWLER_LOGS_PATH + dataset_name
        os.mkdir(path)
        create_ignore_file(path, None)
        create_updated_file(path)
    except OSError:
        print("Creation of the directory %s failed" % path)

    create_default_crawler(dataset_name)
    create_default_processor(dataset_name)
    create_default_config_file(dataset_name)


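# For a hypothetical dataset name such as "WifiLogs", running this script
# creates the following layout (derived from the functions above):
#
#   CrawledData/WifiLogs/        ignore.txt
#   ProcessedData/WifiLogs/      ignore.txt
#   CrawlerLogs/WifiLogs/        ignore.txt, updated.txt
#   DatasetCrawler/WifiLogsCrawler.py
#   DatasetProcessing/WifiLogsProcessor.py
#   DatasetConfigs/WifiLogs.yaml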
print("Zadejte jméno nového datasetu:\n")
161
prepare_dataset_structure(input())
(6-6/14)