Projekt

Obecné

Profil

Stáhnout (5.18 KB) Statistiky
| Větev: | Revize:
1
from Utilities.Database import database_loader
2
from shared_types import StringSetType
3
# MongoDB collection name suffix (appended to the dataset name) for already downloaded links
4
MONGODB_DATASET_LINK_COLLECTION = "LINKS"
5
# MongoDB collection name suffix (appended to the dataset name) for already processed files
6
MONGODB_DATASET_PROCESSED_COLLECTION = "PROCESSED"
7
# MongoDB collection name suffix (appended to the dataset name) for already loaded files
8
MONGODB_DATASET_LOADED_COLLECTION = "LOADED"
9
# MongoDB collection with available datasets and the number of days since their last update
10
MONGODB_DATASET_COLLECTION = "DATASETS"
11

    
12

    
13
def load_ignore_set_links(dataset_name: str) -> StringSetType:
    """
    Load the links already stored for a dataset by the crawler.

    Args:
        dataset_name: name of dataset that has existing configuration file

    Returns:
        set holding the 'name' field of every document found in the
        dataset's link collection
    """
    database = database_loader.create_database_connection()
    link_collection = database[dataset_name + MONGODB_DATASET_LINK_COLLECTION]

    return {document['name'] for document in link_collection.find()}
33

    
34

    
35
def update_ignore_set_links(dataset_name: str, link: str) -> None:
    """
    Add the link of a newly crawled file to the database.

    Args:
        dataset_name: name of dataset that has existing configuration file
        link: link to store in the dataset's link collection
    """
    connection = database_loader.create_database_connection()
    my_col = connection[dataset_name + MONGODB_DATASET_LINK_COLLECTION]

    # Collection.insert() is deprecated since PyMongo 3.0 and removed in 4.0;
    # insert_one() is the supported single-document call.
    my_col.insert_one({"name": link})
48

    
49

    
50
def reset_ignore_set_links(dataset_name: str) -> None:
    """
    Drop the dataset's collection of already downloaded links.

    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    database = database_loader.create_database_connection()
    collection_name = dataset_name + MONGODB_DATASET_LINK_COLLECTION
    database[collection_name].drop()
63

    
64

    
65
def load_ignore_set_processed(dataset_name: str) -> StringSetType:
    """
    Load the names of files already processed for a dataset.

    Args:
        dataset_name: name of dataset that has existing configuration file

    Returns:
        set holding the 'name' field of every document found in the
        dataset's processed collection
    """
    database = database_loader.create_database_connection()
    processed_collection = database[
        dataset_name + MONGODB_DATASET_PROCESSED_COLLECTION
    ]

    return {document['name'] for document in processed_collection.find()}
85

    
86

    
87
def update_ignore_set_processed(dataset_name: str, filename: str) -> None:
    """
    Add the name of a newly processed file to the database.

    Args:
        dataset_name: name of dataset that has existing configuration file
        filename: file name to store in the dataset's processed collection
    """
    connection = database_loader.create_database_connection()
    my_col = connection[dataset_name + MONGODB_DATASET_PROCESSED_COLLECTION]

    # Collection.insert() is deprecated since PyMongo 3.0 and removed in 4.0;
    # insert_one() is the supported single-document call.
    my_col.insert_one({"name": filename})
100

    
101

    
102
def reset_ignore_set_processed(dataset_name: str) -> None:
    """
    Drop the dataset's collection of already processed files.

    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    database = database_loader.create_database_connection()
    collection_name = dataset_name + MONGODB_DATASET_PROCESSED_COLLECTION
    database[collection_name].drop()
115

    
116

    
117
def load_ignore_set_loaded(dataset_name: str) -> StringSetType:
    """
    Load the names of files already loaded into the database for a dataset.

    Args:
        dataset_name: name of dataset that has existing configuration file

    Returns:
        set holding the 'name' field of every document found in the
        dataset's loaded collection
    """
    database = database_loader.create_database_connection()
    loaded_collection = database[
        dataset_name + MONGODB_DATASET_LOADED_COLLECTION
    ]

    return {document['name'] for document in loaded_collection.find()}
137

    
138

    
139
def update_ignore_set_loaded(dataset_name: str, filename: str) -> None:
    """
    Add the name of a newly loaded file to the database.

    Args:
        dataset_name: name of dataset that has existing configuration file
        filename: file name to store in the dataset's loaded collection
    """
    connection = database_loader.create_database_connection()
    my_col = connection[dataset_name + MONGODB_DATASET_LOADED_COLLECTION]

    # Collection.insert() is deprecated since PyMongo 3.0 and removed in 4.0;
    # insert_one() is the supported single-document call.
    my_col.insert_one({"name": filename})
152

    
153

    
154
def reset_ignore_set_loaded(dataset_name: str) -> None:
    """
    Drop the dataset's collection of already loaded files.

    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    database = database_loader.create_database_connection()
    collection_name = dataset_name + MONGODB_DATASET_LOADED_COLLECTION
    database[collection_name].drop()
167

    
168

    
169
def load_updated(dataset_name: str) -> int:
    """
    Load the value of (days from last update) for a dataset from the db.

    Args:
        dataset_name: name of dataset that has existing configuration file

    Returns:
        the stored 'updated' value converted to int, or 0 when no document
        matches the dataset or the matched document has no 'updated' field
    """
    connection = database_loader.create_database_connection()
    my_col = connection[MONGODB_DATASET_COLLECTION]

    # Project only the 'updated' field; the dict form is the documented
    # projection syntax.
    data = my_col.find_one({'key-name': dataset_name}, {'updated': 1})

    # find_one() returns None when nothing matches; fall back to 0 instead of
    # failing with a TypeError on subscripting None.
    if data is None or 'updated' not in data:
        return 0

    return int(data['updated'])
188

    
189

    
190
def update_updated(dataset_name: str, value: int):
    """
    Store a new value of (days from last update) for a dataset in the db.

    Args:
        dataset_name: name of dataset that has existing configuration file
        value: number written to the 'updated' field of the document whose
            'key-name' equals dataset_name
    """
    database = database_loader.create_database_connection()
    datasets = database[MONGODB_DATASET_COLLECTION]

    datasets.update_one(
        {'key-name': dataset_name},
        {"$set": {"updated": value}},
    )
(3-3/3)