Projekt

Obecné

Profil

Stáhnout (5.18 KB) Statistiky
| Větev: | Revize:
1 d6ca840d petrh
from Utilities.Database import database_loader
2 af7609b5 Tomáš Ballák
from shared_types import StringSetType
3 d6ca840d petrh
# Suffix (appended to the dataset name) of the MongoDB collection
# holding links already downloaded by the crawler
MONGODB_DATASET_LINK_COLLECTION = "LINKS"
# Suffix (appended to the dataset name) of the MongoDB collection
# holding names of already processed files
MONGODB_DATASET_PROCESSED_COLLECTION = "PROCESSED"
# Suffix (appended to the dataset name) of the MongoDB collection
# holding names of files already loaded into the database
MONGODB_DATASET_LOADED_COLLECTION = "LOADED"
# MongoDB collection with available datasets and the number of days
# since their last update
MONGODB_DATASET_COLLECTION = "DATASETS"
11
12
13 af7609b5 Tomáš Ballák
def load_ignore_set_links(dataset_name: str) -> StringSetType:
    """
    Load the links of files that the crawler has already downloaded.

    Args:
        dataset_name: name of dataset that has existing configuration file

    Returns:
        set of the 'name' values of every document stored in the
        dataset's links collection
    """
    database = database_loader.create_database_connection()
    links_collection = database[dataset_name + MONGODB_DATASET_LINK_COLLECTION]
    return {document['name'] for document in links_collection.find()}
33
34
35 af7609b5 Tomáš Ballák
def update_ignore_set_links(dataset_name: str, link: str) -> None:
    """
    Add the link of a newly crawled file to the database.

    Args:
        dataset_name: name of dataset that has existing configuration file
        link: link that is stored under the 'name' key of a new document
    """
    connection = database_loader.create_database_connection()
    my_col = connection[dataset_name + MONGODB_DATASET_LINK_COLLECTION]
    # insert_one replaces Collection.insert, which PyMongo 3 deprecated
    # and PyMongo 4 removed.
    my_col.insert_one({"name": link})
48 d6ca840d petrh
49
50 af7609b5 Tomáš Ballák
def reset_ignore_set_links(dataset_name: str) -> None:
    """
    Drop the collection of already downloaded links.

    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    database = database_loader.create_database_connection()
    database[dataset_name + MONGODB_DATASET_LINK_COLLECTION].drop()
63
64
65 af7609b5 Tomáš Ballák
def load_ignore_set_processed(dataset_name: str) -> StringSetType:
    """
    Load the set of already processed files from the database.

    Args:
        dataset_name: name of dataset that has existing configuration file

    Returns:
        set of the 'name' values of every document stored in the
        dataset's processed-files collection
    """
    database = database_loader.create_database_connection()
    processed_collection = database[
        dataset_name + MONGODB_DATASET_PROCESSED_COLLECTION
    ]
    return {document['name'] for document in processed_collection.find()}
85
86
87 af7609b5 Tomáš Ballák
def update_ignore_set_processed(dataset_name: str, filename: str) -> None:
    """
    Add the name of a newly processed file to the database.

    Args:
        dataset_name: name of dataset that has existing configuration file
        filename: file name that is stored under the 'name' key of a
            new document
    """
    connection = database_loader.create_database_connection()
    my_col = connection[dataset_name + MONGODB_DATASET_PROCESSED_COLLECTION]
    # insert_one replaces Collection.insert, which PyMongo 3 deprecated
    # and PyMongo 4 removed.
    my_col.insert_one({"name": filename})
100 d6ca840d petrh
101
102 af7609b5 Tomáš Ballák
def reset_ignore_set_processed(dataset_name: str) -> None:
    """
    Drop the collection of already processed files.

    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    database = database_loader.create_database_connection()
    database[dataset_name + MONGODB_DATASET_PROCESSED_COLLECTION].drop()
115
116
117 af7609b5 Tomáš Ballák
def load_ignore_set_loaded(dataset_name: str) -> StringSetType:
    """
    Load the set of files that were already loaded into the database.

    Args:
        dataset_name: name of dataset that has existing configuration file

    Returns:
        set of the 'name' values of every document stored in the
        dataset's loaded-files collection
    """
    database = database_loader.create_database_connection()
    loaded_collection = database[
        dataset_name + MONGODB_DATASET_LOADED_COLLECTION
    ]
    return {document['name'] for document in loaded_collection.find()}
137
138
139 af7609b5 Tomáš Ballák
def update_ignore_set_loaded(dataset_name: str, filename: str) -> None:
    """
    Add the name of a newly loaded file to the database.

    Args:
        dataset_name: name of dataset that has existing configuration file
        filename: file name that is stored under the 'name' key of a
            new document
    """
    connection = database_loader.create_database_connection()
    my_col = connection[dataset_name + MONGODB_DATASET_LOADED_COLLECTION]
    # insert_one replaces Collection.insert, which PyMongo 3 deprecated
    # and PyMongo 4 removed.
    my_col.insert_one({"name": filename})
152 d6ca840d petrh
153
154 af7609b5 Tomáš Ballák
def reset_ignore_set_loaded(dataset_name: str) -> None:
    """
    Drop the collection of already loaded files.

    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    database = database_loader.create_database_connection()
    database[dataset_name + MONGODB_DATASET_LOADED_COLLECTION].drop()
167
168
169 af7609b5 Tomáš Ballák
def load_updated(dataset_name: str) -> int:
    """
    Load the value of (days from last update) from the database.

    Args:
        dataset_name: name of dataset that has existing configuration file

    Returns:
        the 'updated' field of the document whose 'key-name' equals
        dataset_name, converted to int
    """
    database = database_loader.create_database_connection()
    datasets = database[MONGODB_DATASET_COLLECTION]
    # Only the 'updated' field is requested through the projection argument.
    # NOTE(review): if no document matches, find_one presumably returns None
    # and the subscript below raises TypeError - confirm this is intended.
    record = datasets.find_one({'key-name': dataset_name}, {'updated'})
    return int(record['updated'])
188
189
190 af7609b5 Tomáš Ballák
def update_updated(dataset_name: str, value: int) -> None:
    """
    Update the value of (days from last update) in the database.

    Args:
        dataset_name: name of dataset that has existing configuration file
        value: new value written to the 'updated' field
    """
    database = database_loader.create_database_connection()
    datasets = database[MONGODB_DATASET_COLLECTION]
    # Select the document by its 'key-name' and overwrite only 'updated'.
    datasets.update_one(
        {'key-name': dataset_name},
        {"$set": {"updated": value}},
    )