Revize 2494ea3a
Přidáno uživatelem Petr Hlaváč před téměř 5 lety
modules/crawler/PrepareNewDataset.py | ||
---|---|---|
1 | 1 |
import os |
2 | 2 |
|
3 | 3 |
# Path to crawled data |
4 |
CRAWLED_DATA_PATH = "../CrawledData/"
|
|
4 |
CRAWLED_DATA_PATH = "CrawledData/" |
|
5 | 5 |
# Path to processed data |
6 |
PROCESSED_DATA_PATH = "../ProcessedData/"
|
|
6 |
PROCESSED_DATA_PATH = "ProcessedData/" |
|
7 | 7 |
# Path to crawler logs |
8 |
CRAWLER_LOGS_PATH = "../CrawlerLogs/"
|
|
8 |
CRAWLER_LOGS_PATH = "CrawlerLogs/" |
|
9 | 9 |
# Path for DatasetCrawlers implementations |
10 |
CRAWLER_PROGRAM_PATH = "../DatasetCrawler"
|
|
10 |
CRAWLER_PROGRAM_PATH = "DatasetCrawler" |
|
11 | 11 |
# Path for DatasetProcessors implementations |
12 |
PROCESSOR_PROGRAM_PATH = "../DatasetProcessing"
|
|
12 |
PROCESSOR_PROGRAM_PATH = "DatasetProcessing" |
|
13 | 13 |
# Path to dataset configuration files |
14 |
CONFIG_FILES_PATH = "../DatasetConfigs"
|
|
14 |
CONFIG_FILES_PATH = "DatasetConfigs" |
|
15 | 15 |
|
16 | 16 |
|
17 | 17 |
def create_default_config_file(dataset_name): |
... | ... | |
61 | 61 |
file.write(" False if not implemented\n") |
62 | 62 |
file.write(" True when implemented\n") |
63 | 63 |
file.write(" \"\"\"\n") |
64 |
file.write(" #with open(filename, \"r\") as file:\n") |
|
64 | 65 |
file.write(" print(\"You must implements process_file method first!\")\n") |
65 | 66 |
file.write(" #CSVutils.export_data_to_csv(filename, date_dict)\n") |
66 | 67 |
file.write(" return False\n") |
... | ... | |
154 | 155 |
create_default_processor(dataset_name) |
155 | 156 |
create_default_config_file(dataset_name) |
156 | 157 |
|
157 |
|
|
158 |
prepare_dataset_structure("TEST") |
|
158 |
print("Zadejte jméno nového datasetu:\n") |
|
159 |
prepare_dataset_structure(input()) |
modules/crawler/RemoveDataset.py | ||
---|---|---|
29 | 29 |
os.remove(CRAWLER_PROGRAM_PATH + "/" + dataset_name + "Crawler.py") |
30 | 30 |
os.remove(PROCESSOR_PROGRAM_PATH + "/" + dataset_name + "Processor.py") |
31 | 31 |
|
32 |
print("Dataset: " + dataset_name + " removed")
|
|
32 |
print("Dataset " + dataset_name + " removed") |
|
33 | 33 |
|
34 |
remove_dataset("TEST") |
|
34 |
print("Zadejte jméno Datasetu který chcete odstranit:\n") |
|
35 |
remove_dataset(input()) |
modules/crawler/RemoveDatasetDatabase.py | ||
---|---|---|
28 | 28 |
|
29 | 29 |
print("Database Cleaned") |
30 | 30 |
|
31 |
|
|
32 |
remove_dataset_database("KOLOBEZKY") |
|
31 |
print("Zadejte jméno Datasetu který chcete odstranit z databáze:\n") |
|
32 |
remove_dataset_database(input()) |
modules/crawler/ResetDataset.py | ||
---|---|---|
52 | 52 |
create_ignore_file(path, None) |
53 | 53 |
create_updated_file(path) |
54 | 54 |
|
55 |
|
|
56 |
def reset_all_datasets(): |
|
57 |
""" |
|
58 |
Resets all saved data in all datasets with config file except configs and implementation |
|
59 |
""" |
|
60 |
datasets = os.listdir(CONFIG_FILES_PATH) |
|
61 |
|
|
62 |
for dataset in datasets: |
|
63 |
reset_dataset(dataset.split('.')[0]) |
|
64 |
|
|
65 |
|
|
66 |
reset_all_datasets() |
|
55 |
print("Zadejte jméno Datasetu který chcete resetovat:\n") |
|
56 |
reset_dataset(input()) |
modules/crawler/ResetDatasets.py | ||
---|---|---|
1 |
import os |
|
2 |
from Utilities import FolderProcessor |
|
3 |
|
|
4 |
# Path to crawled data |
|
5 |
CRAWLED_DATA_PATH = "CrawledData/" |
|
6 |
# Path to processed data |
|
7 |
PROCESSED_DATA_PATH = "ProcessedData/" |
|
8 |
# Path to crawler logs |
|
9 |
CRAWLER_LOGS_PATH = "CrawlerLogs/" |
|
10 |
# Path to dataset configuration files |
|
11 |
CONFIG_FILES_PATH = "DatasetConfigs" |
|
12 |
|
|
13 |
|
|
14 |
def create_ignore_file(path, text): |
|
15 |
""" |
|
16 |
Creates ignore file |
|
17 |
Args: |
|
18 |
path: path to directory for creating ignore.txt |
|
19 |
text: text that will be on first line of ignore.txt can be None |
|
20 |
""" |
|
21 |
with open(path + "/ignore.txt", "w") as file: |
|
22 |
if text is not None: |
|
23 |
file.write(text + "\n") |
|
24 |
|
|
25 |
|
|
26 |
def create_updated_file(path): |
|
27 |
""" |
|
28 |
Creates updated file |
|
29 |
Args: |
|
30 |
path: path to directory for creating updated.txt |
|
31 |
""" |
|
32 |
with open(path + "/updated.txt", "w") as file: |
|
33 |
file.write(str(0) + "\n") |
|
34 |
|
|
35 |
|
|
36 |
def reset_dataset(dataset_name): |
|
37 |
""" |
|
38 |
Resets all saved data in dataset except config and implementation |
|
39 |
Args: |
|
40 |
dataset_name: name of dataset that has existing configuration file |
|
41 |
""" |
|
42 |
path = CRAWLED_DATA_PATH + dataset_name + "/" |
|
43 |
FolderProcessor.clean_folder(path) |
|
44 |
create_ignore_file(path, "ignore.txt") |
|
45 |
|
|
46 |
path = PROCESSED_DATA_PATH + dataset_name + "/" |
|
47 |
FolderProcessor.clean_folder(path) |
|
48 |
create_ignore_file(path, "ignore.txt") |
|
49 |
|
|
50 |
path = CRAWLER_LOGS_PATH + dataset_name + "/" |
|
51 |
FolderProcessor.clean_folder(path) |
|
52 |
create_ignore_file(path, None) |
|
53 |
create_updated_file(path) |
|
54 |
|
|
55 |
|
|
56 |
def reset_all_datasets(): |
|
57 |
""" |
|
58 |
Resets all saved data in all datasets with config file except configs and implementation |
|
59 |
""" |
|
60 |
datasets = os.listdir(CONFIG_FILES_PATH) |
|
61 |
|
|
62 |
for dataset in datasets: |
|
63 |
reset_dataset(dataset.split('.')[0]) |
|
64 |
|
|
65 |
|
|
66 |
reset_all_datasets() |
Také k dispozici: Unified diff
Upraven systém spouštění skriptů