Projekt

Obecné

Profil

Stáhnout (5.71 KB) Statistiky
| Větev: | Revize:
1 b3262a44 petrh
import os

# Path to crawled data
CRAWLED_DATA_PATH = "../CrawledData/"
# Path to processed data
PROCESSED_DATA_PATH = "../ProcessedData/"
# Path to crawler logs
CRAWLER_LOGS_PATH = "../CrawlerLogs/"
# Path for DatasetCrawlers implementations
CRAWLER_PROGRAM_PATH = "../DatasetCrawler"
# Path for DatasetProcessors implementations
PROCESSOR_PROGRAM_PATH = "../DatasetProcessing"
# Path to dataset configuration files
CONFIG_FILES_PATH = "../DatasetConfigs"

def create_default_config_file(dataset_name):
    """
    Creates a default YAML configuration file for a newly created dataset.

    The generated file (CONFIG_FILES_PATH/<dataset_name>.yaml) contains
    commented placeholder entries (comments are in Czech, matching the
    project's other generated templates) that the user fills in before
    the dataset can be crawled.

    Args:
        dataset_name: Name of newly created dataset
    """
    # Template collected in one list so the file is written in a single
    # writelines call instead of eleven separate file.write calls.
    config_lines = [
        "# jmeno datasetu, pod kterym bude zobrazen v aplikaci\n",
        "dataset-name: " + dataset_name + "\n",
        "# root slozka, ktera obsahuje odkazy na dataset\n",
        "url: ZDE VLOZTE URL\n",
        # fixed typos in the generated comment: "parameter" -> "parametr",
        # "jmrna" -> "jmena"
        "# volitelny parametr, ktery specifikuje vzor jmena datasetu, ktera se budou stahovat\n",
        "regex: ZDE VLOZTE REGEX\n",
        "# volitelny parametr, ktery udava jak casto se budou hledat nove datasety, pokud prazdne, "
        "tak defaultni hodnota (dny)\n",
        "update-period: ZDE VLOZTE HODNOTU\n",
        "# pozice jednotlivych zarizeni, ktera jsou v datasetu\n",
        "devices:\n",
    ]
    with open(CONFIG_FILES_PATH + "/" + dataset_name + ".yaml", "w") as file:
        file.writelines(config_lines)


def create_default_processor(dataset_name):
    """
    Creates a default processor script for the dataset.

    The generated PROCESSOR_PROGRAM_PATH/<dataset_name>Processor.py
    contains a stub process_file function that the user must implement;
    until then the stub only prints a reminder and returns False.

    Args:
        dataset_name: Name of newly created dataset
    """
    # Template kept as a list of lines (byte-identical to the previous
    # per-line writes) and emitted with a single writelines call.
    template = [
        "from Utilities.CSV import CSVDataLine, CSVutils",
        "\n",
        "\n",
        "def process_file(filename):\n",
        "    \"\"\"\n",
        "    Method that take path to crawled file and outputs date dictionary using method:\n",
        "    CSVutils.export_data_to_csv(filename, date_dict)\n",
        "    Date dictionary is a dictionary where keys are dates in format ddmmYYYYhh (0804201815)\n",
        "    and value is dictionary where keys devices (specified in configuration file)\n",
        "    and value is CSVDataLine.CSVDataLine with device,date and occurrence\n",
        "\n",
        "    Args:\n",
        "    filename: name of processed file\n",
        "\n",
        "    Returns:\n",
        "    False if not implemented\n",
        "    True when implemented\n",
        "    \"\"\"\n",
        # grammar fix in the generated reminder: "implements" -> "implement"
        "    print(\"You must implement process_file method first!\")\n",
        "    #CSVutils.export_data_to_csv(filename, date_dict)\n",
        "    return False\n",
    ]
    with open(PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "Processor.py", "w") as file:
        file.writelines(template)


def create_default_crawler(dataset_name):
    """
    Creates a default crawler script for the dataset.

    The generated CRAWLER_PROGRAM_PATH/<dataset_name>Crawler.py contains
    a stub crawl function that unpacks the dataset configuration and must
    be implemented by the user; until then it only prints a reminder.

    Args:
        dataset_name: Name of newly created dataset
    """
    # Template kept as a list of lines (byte-identical to the previous
    # per-line writes) and emitted with a single writelines call.
    template = [
        "# Path to crawled data\n",
        "CRAWLED_DATA_PATH = \"CrawledData/\"\n",
        "\n",
        "\n",
        "def crawl(config):\n",
        "    \"\"\"\n",
        "    Implement crawl method that downloads new data to path_for_files\n",
        "    For keeping the project structure\n",
        "    url , regex, and dataset_name from config\n",
        "    You can use already implemented functions from Utilities/Crawler/BasicCrawlerFunctions.py\n",
        "\n",
        "    Args:\n",
        "        config: loaded configuration file of dataset\n",
        "    \"\"\"\n",
        "    dataset_name = config[\"dataset-name\"]\n",
        "    url = config['url']\n",
        "    regex = config['regex']\n",
        "    path_for_files = CRAWLED_DATA_PATH + dataset_name + '/'\n",
        # grammar fix in the generated reminder: "implements" -> "implement"
        "    print(\"You must implement Crawl method first!\")\n",
    ]
    with open(CRAWLER_PROGRAM_PATH + "/" + dataset_name + "Crawler.py", "w") as file:
        file.writelines(template)


def create_ignore_file(path, text):
    """
    Creates an ignore.txt file inside the given directory.

    Args:
        path: path to directory for creating ignore.txt
        text: text that will be on the first line of ignore.txt; may be
            None, in which case the file is created empty
    """
    ignore_path = path + "/ignore.txt"
    with open(ignore_path, "w") as ignore_file:
        # None means "create an empty marker file" — write nothing.
        if text is not None:
            ignore_file.write(text + "\n")


def prepare_dataset_structure(dataset_name):
    """
    Prepares folders for a new dataset and generates its default crawler,
    processor and configuration file.

    Creates per-dataset subdirectories under CRAWLED_DATA_PATH,
    PROCESSED_DATA_PATH and CRAWLER_LOGS_PATH. Directory-creation
    failures are reported to stdout but do not abort the setup.

    Args:
        dataset_name: Name of newly created dataset
    """
    # create folder for crawled data
    _create_dataset_dir(CRAWLED_DATA_PATH + dataset_name, "ignore.txt")
    # create folder for processed data
    _create_dataset_dir(PROCESSED_DATA_PATH + dataset_name, "ignore.txt")
    # create folder for crawler logs (empty ignore file)
    _create_dataset_dir(CRAWLER_LOGS_PATH + dataset_name, None)

    create_default_crawler(dataset_name)
    create_default_processor(dataset_name)
    create_default_config_file(dataset_name)


def _create_dataset_dir(path, ignore_text):
    """
    Creates one dataset directory with an ignore.txt inside it.

    Best-effort: on failure (e.g. directory already exists) the OSError
    is printed and execution continues — consistent error handling for
    all three dataset folders (previously only the first branch printed
    the exception, and an unused `jump_folder` variable was removed).

    Args:
        path: full path of the directory to create
        ignore_text: first line of ignore.txt, or None for an empty file
    """
    try:
        os.mkdir(path)
        create_ignore_file(path, ignore_text)
    except OSError as e:
        print(e)
        print("Creation of the directory %s failed" % path)


# Guard the script entry point so importing this module does not create
# the "TEST" dataset as a side effect.
if __name__ == "__main__":
    prepare_dataset_structure("TEST")