Projekt

Obecné

Profil

Stáhnout (6.04 KB) Statistiky
| Větev: | Revize:
1
import os
2

    
3
# --- Data directories (trailing slash: dataset names are appended directly) ---
# Crawled data
CRAWLED_DATA_PATH = "CrawledData/"
# Processed data
PROCESSED_DATA_PATH = "ProcessedData/"
# Crawler logs
CRAWLER_LOGS_PATH = "CrawlerLogs/"

# --- Source and configuration directories ---
# DatasetCrawlers implementations
CRAWLER_PROGRAM_PATH = "DatasetCrawler"
# DatasetProcessors implementations
PROCESSOR_PROGRAM_PATH = "DatasetProcessing"
# Dataset configuration files
CONFIG_FILES_PATH = "DatasetConfigs"
15

    
16

    
17
def create_default_config_file(dataset_name):
    """
    Creates default config file

    Writes "<dataset_name>.yaml" into CONFIG_FILES_PATH. The file contains
    the dataset name, placeholder values for url, regex and update-period
    and an empty "devices" section. The file is opened in "w" mode, so an
    existing file of the same name is overwritten.

    Args:
        dataset_name: Name of newly created dataset
    """
    # The generated text is user-facing and must stay exactly as it is.
    lines = (
        "# jmeno datasetu, pod kterym bude zobrazen v aplikaci\n",
        "dataset-name: " + dataset_name + "\n",
        "# root slozka, ktera obsahuje odkazy na dataset\n",
        "url: ZDE VLOZTE URL\n",
        "# volitelny parameter, ktery specifikuje vzor jmrna datasetu, ktera se budou stahovat\n",
        "regex: ZDE VLOZTE REGEX\n",
        "# volitelny parametr, ktery udava jak casto se budou hledat nove datasety, pokud prazdne, "
        "tak defaultni hodnota (dny)\n",
        "update-period: ZDE VLOZTE HODNOTU\n",
        "# pozice jednotlivych zarizeni, ktera jsou v datasetu\n",
        "devices:\n",
    )

    config_path = os.path.join(CONFIG_FILES_PATH, dataset_name + ".yaml")
    # Explicit encoding keeps the output independent of the platform locale.
    with open(config_path, "w", encoding="utf-8") as file:
        file.writelines(lines)
36

    
37

    
38
def create_default_processor(dataset_name):
    """
    Creates default processor for dataset

    Writes "<dataset_name>Processor.py" into PROCESSOR_PROGRAM_PATH. The
    generated module contains a process_file(filename) stub that prints a
    reminder and returns False. The file is opened in "w" mode, so an
    existing file of the same name is overwritten.

    Args:
        dataset_name: Name of newly created dataset
    """
    # Template of the generated module; the text must stay exactly as it is.
    # The first entry intentionally has no newline of its own, the two
    # following entries supply the line break and one blank line.
    template = (
        "from Utilities.CSV import CSVDataLine, CSVutils",
        "\n",
        "\n",
        "def process_file(filename):\n",
        "    \"\"\"\n",
        "    Method that take path to crawled file and outputs date dictionary using method:\n",
        "    CSVutils.export_data_to_csv(filename, date_dict)\n",
        "    Date dictionary is a dictionary where keys are dates in format ddmmYYYYhh (0804201815)\n",
        "    and value is dictionary where keys devices (specified in configuration file)\n",
        "    and value is CSVDataLine.CSVDataLine with device,date and occurrence\n",
        "\n",
        "    Args:\n",
        "    filename: name of processed file\n",
        "\n",
        "    Returns:\n",
        "    False if not implemented\n",
        "    True when implemented\n",
        "    \"\"\"\n",
        "    #with open(filename, \"r\") as file:\n",
        "    print(\"You must implements process_file method first!\")\n",
        "    #CSVutils.export_data_to_csv(filename, date_dict)\n",
        "    return False\n",
    )

    processor_path = os.path.join(PROCESSOR_PROGRAM_PATH, dataset_name + "Processor.py")
    # Explicit encoding keeps the output independent of the platform locale.
    with open(processor_path, "w", encoding="utf-8") as file:
        file.writelines(template)
68

    
69

    
70
def create_default_crawler(dataset_name):
    """
    Creates default crawler for dataset

    Writes "<dataset_name>Crawler.py" into CRAWLER_PROGRAM_PATH. The
    generated module contains a crawl(config) stub that reads
    "dataset-name", "url" and "regex" from the config and prints a
    reminder. The file is opened in "w" mode, so an existing file of the
    same name is overwritten.

    Args:
        dataset_name: Name of newly created dataset
    """
    # Template of the generated module; the text must stay exactly as it is.
    template = (
        "# Path to crawled data\n",
        "CRAWLED_DATA_PATH = \"CrawledData/\"\n",
        "\n",
        "\n",
        "def crawl(config):\n",
        "    \"\"\"\n",
        "    Implement crawl method that downloads new data to path_for_files\n",
        "    For keeping the project structure\n",
        "    url , regex, and dataset_name from config\n",
        "    You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py\n",
        "\n",
        "    Args:\n",
        "        config: loaded configuration file of dataset\n",
        "    \"\"\"\n",
        "    dataset_name = config[\"dataset-name\"]\n",
        "    url = config['url']\n",
        "    regex = config['regex']\n",
        "    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'\n",
        "    print(\"You must implements Crawl method first!\")\n",
    )

    crawler_path = os.path.join(CRAWLER_PROGRAM_PATH, dataset_name + "Crawler.py")
    # Explicit encoding keeps the output independent of the platform locale.
    with open(crawler_path, "w", encoding="utf-8") as file:
        file.writelines(template)
98

    
99

    
100
def create_ignore_file(path, text):
    """
    Creates ignore file

    The file is always created (or truncated); when text is None it is
    left empty.

    Args:
        path: path to directory for creating ignore.txt
        text: text that will be on first line of ignore.txt can be None
    """
    # os.path.join avoids producing "/ignore.txt" (filesystem root) when
    # path is an empty string, which plain concatenation would do.
    ignore_path = os.path.join(path, "ignore.txt")
    with open(ignore_path, "w", encoding="utf-8") as file:
        if text is not None:
            file.write(text + "\n")
110

    
111
def create_updated_file(path):
    """
    Creates updated file

    The file contains a single line with the value 0. An existing
    updated.txt in the directory is overwritten.

    Args:
        path: path to directory for creating updated.txt
    """
    # os.path.join avoids producing "/updated.txt" (filesystem root) when
    # path is an empty string, which plain concatenation would do.
    updated_path = os.path.join(path, "updated.txt")
    with open(updated_path, "w", encoding="utf-8") as file:
        file.write("0\n")
119

    
120

    
121
def prepare_dataset_structure(dataset_name):
    """
    Prepares folders for new dataset

    Creates a directory named after the dataset under CRAWLED_DATA_PATH,
    PROCESSED_DATA_PATH and CRAWLER_LOGS_PATH, puts an ignore.txt into each
    of them (plus updated.txt into the logs directory) and then generates
    the default crawler, processor and config file.

    A failure while preparing one directory is reported on standard output
    and does not stop the remaining steps.

    Args:
        dataset_name: Name of newly created dataset
    """
    # (base directory, first line of ignore.txt, whether updated.txt is needed)
    folders = (
        (CRAWLED_DATA_PATH, "ignore.txt", False),
        (PROCESSED_DATA_PATH, "ignore.txt", False),
        (CRAWLER_LOGS_PATH, None, True),
    )

    for base_path, ignore_text, needs_updated_file in folders:
        # The base paths already end with "/", so plain concatenation is enough.
        path = base_path + dataset_name
        try:
            os.mkdir(path)
            create_ignore_file(path, ignore_text)
            if needs_updated_file:
                create_updated_file(path)
        except OSError as e:
            # Report every failure the same way, including the reason.
            print(e)
            print("Creation of the directory %s failed" % path)

    # NOTE(review): the generators below open their target files in "w"
    # mode, so running this for an already existing dataset replaces its
    # crawler, processor and config with the default templates - confirm
    # that this is intended.
    create_default_crawler(dataset_name)
    create_default_processor(dataset_name)
    create_default_config_file(dataset_name)
157

    
158
if __name__ == "__main__":
    # Run the interactive prompt only when the file is executed as a script,
    # so that importing the module does not block on input() or create files.
    print("Zadejte jméno nového datasetu:\n")
    prepare_dataset_structure(input())
(6-6/14)