1
|
import os
|
2
|
from Utilities import FolderProcessor
|
3
|
|
4
|
# Root directory for raw crawled data; each dataset uses a sub-folder named after it
CRAWLED_DATA_PATH = "../CrawledData/"

# Root directory for processed data; each dataset uses a sub-folder named after it
PROCESSED_DATA_PATH = "../ProcessedData/"

# Root directory for crawler logs; each dataset uses a sub-folder named after it
CRAWLER_LOGS_PATH = "../CrawlerLogs/"

# Directory containing the dataset configuration files (note: no trailing slash)
CONFIG_FILES_PATH = "../DatasetConfigs"
|
12
|
|
13
|
|
14
|
def create_ignore_file(path, text):
    """
    Creates (or overwrites) the file ignore.txt inside the given directory

    The target path is built with os.path.join, so it does not matter whether
    path ends with a separator or not. The file is always written as UTF-8 so
    its content does not depend on the platform default encoding.

    Args:
        path: path to directory for creating ignore.txt
        text: text that will be on first line of ignore.txt, can be None
              (in which case an empty file is created)
    """
    ignore_file_path = os.path.join(path, "ignore.txt")

    # Mode "w" truncates an existing file, so text=None leaves ignore.txt empty
    with open(ignore_file_path, "w", encoding="utf-8") as file:
        if text is not None:
            file.write(text + "\n")
|
24
|
|
25
|
|
26
|
def reset_dataset(dataset_name):
    """
    Resets all saved data in dataset except config and implementation

    For the crawled data, processed data and crawler logs roots (in that
    order) the dataset's sub-folder is cleaned with
    FolderProcessor.clean_folder and a fresh ignore.txt is created in it.
    The ignore.txt of the two data folders contains the line "ignore.txt";
    the one in the logs folder is left empty.

    Note that an empty dataset_name makes every target resolve to the root
    directory itself.

    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    # (root directory, first line of ignore.txt) in the order they are processed
    targets = (
        (CRAWLED_DATA_PATH, "ignore.txt"),
        (PROCESSED_DATA_PATH, "ignore.txt"),
        (CRAWLER_LOGS_PATH, None),
    )

    for root, ignore_text in targets:
        dataset_dir = root + dataset_name + "/"
        FolderProcessor.clean_folder(dataset_dir)
        create_ignore_file(dataset_dir, ignore_text)
|
43
|
|
44
|
|
45
|
def reset_all_datasets():
    """
    Resets all saved data in all datasets with config file except configs and implementation

    Every entry of CONFIG_FILES_PATH is treated as a configuration file and
    the part of its name before the first '.' is used as the dataset name.
    Entries whose name starts with '.' (e.g. ".gitignore") have an empty
    dataset name and are skipped: passing an empty name to reset_dataset
    would make it clean the root data directories instead of one dataset.
    """
    for config_file in os.listdir(CONFIG_FILES_PATH):
        dataset_name = config_file.split('.')[0]

        if not dataset_name:
            # Hidden/dot entry - not a dataset config, must not reach reset_dataset
            continue

        reset_dataset(dataset_name)
|
53
|
|
54
|
|
55
|
# Module-level call: executed whenever this module is run or imported, it
# resets every dataset that has an entry in CONFIG_FILES_PATH.
# NOTE(review): if this module is ever imported by other code, consider guarding
# this call with `if __name__ == "__main__":` - confirm how the module is used.
reset_all_datasets()
|