Revize af7609b5
Přidáno uživatelem Tomáš Ballák před více než 3 roky(ů)
modules/crawler/Utilities/Database/database_record_logs.py | ||
---|---|---|
1 | 1 |
from Utilities.Database import database_loader |
2 |
|
|
2 |
from shared_types import StringSetType |
|
3 | 3 |
# mongodb collection with already downloaded links |
4 | 4 |
MONGODB_DATASET_LINK_COLLECTION = "LINKS" |
5 | 5 |
# mongodb collection with already processed files |
... | ... | |
10 | 10 |
MONGODB_DATASET_COLLECTION = "DATASETS" |
11 | 11 |
|
12 | 12 |
|
13 |
def load_ignore_set_links(dataset_name):
|
|
13 |
def load_ignore_set_links(dataset_name: str) -> StringSetType:
|
|
14 | 14 |
""" |
15 | 15 |
Loads from database links of already downloaded files by crawler |
16 | 16 |
|
... | ... | |
32 | 32 |
return ignore_set |
33 | 33 |
|
34 | 34 |
|
35 |
def update_ignore_set_links(dataset_name,link):
|
|
35 |
def update_ignore_set_links(dataset_name: str, link: str) -> None:
|
|
36 | 36 |
""" |
37 | 37 |
Adds links of newly crawled files to the database |
38 | 38 |
|
... | ... | |
44 | 44 |
|
45 | 45 |
my_col = connection[dataset_name + MONGODB_DATASET_LINK_COLLECTION] |
46 | 46 |
|
47 |
my_col.insert({ "name": link})
|
|
47 |
my_col.insert({"name": link}) |
|
48 | 48 |
|
49 | 49 |
|
50 |
def reset_ignore_set_links(dataset_name):
|
|
50 |
def reset_ignore_set_links(dataset_name: str) -> None:
|
|
51 | 51 |
""" |
52 | 52 |
Drops collection of already downloaded links |
53 | 53 |
|
... | ... | |
62 | 62 |
my_col.drop() |
63 | 63 |
|
64 | 64 |
|
65 |
|
|
66 |
def load_ignore_set_processed(dataset_name): |
|
65 |
def load_ignore_set_processed(dataset_name: str) -> StringSetType: |
|
67 | 66 |
""" |
68 | 67 |
Loads from database set of already processed files |
69 | 68 |
|
... | ... | |
85 | 84 |
return ignore_set |
86 | 85 |
|
87 | 86 |
|
88 |
def update_ignore_set_processed(dataset_name,filename):
|
|
87 |
def update_ignore_set_processed(dataset_name: str, filename: str) -> None:
|
|
89 | 88 |
""" |
90 | 89 |
Adds files of newly processed files to the database |
91 | 90 |
|
... | ... | |
97 | 96 |
|
98 | 97 |
my_col = connection[dataset_name + MONGODB_DATASET_PROCESSED_COLLECTION] |
99 | 98 |
|
100 |
my_col.insert({ "name": filename}) |
|
101 |
|
|
99 |
my_col.insert({"name": filename}) |
|
102 | 100 |
|
103 | 101 |
|
104 |
def reset_ignore_set_processed(dataset_name):
|
|
102 |
def reset_ignore_set_processed(dataset_name: str) -> None:
|
|
105 | 103 |
""" |
106 | 104 |
Drops collection of already processed files |
107 | 105 |
|
... | ... | |
116 | 114 |
my_col.drop() |
117 | 115 |
|
118 | 116 |
|
119 |
|
|
120 |
def load_ignore_set_loaded(dataset_name): |
|
117 |
def load_ignore_set_loaded(dataset_name: str) -> StringSetType: |
|
121 | 118 |
""" |
122 | 119 |
Loads from database set of already loaded files in database |
123 | 120 |
|
... | ... | |
139 | 136 |
return ignore_set |
140 | 137 |
|
141 | 138 |
|
142 |
|
|
143 |
def update_ignore_set_loaded(dataset_name,filename): |
|
139 |
def update_ignore_set_loaded(dataset_name: str, filename: str) -> None: |
|
144 | 140 |
""" |
145 | 141 |
Adds files of newly loaded files to the database |
146 | 142 |
|
... | ... | |
152 | 148 |
|
153 | 149 |
my_col = connection[dataset_name + MONGODB_DATASET_LOADED_COLLECTION] |
154 | 150 |
|
155 |
my_col.insert({ "name": filename})
|
|
151 |
my_col.insert({"name": filename}) |
|
156 | 152 |
|
157 | 153 |
|
158 |
def reset_ignore_set_loaded(dataset_name):
|
|
154 |
def reset_ignore_set_loaded(dataset_name: str) -> None:
|
|
159 | 155 |
""" |
160 | 156 |
Drops collection of already loaded files |
161 | 157 |
|
... | ... | |
170 | 166 |
my_col.drop() |
171 | 167 |
|
172 | 168 |
|
173 |
def load_updated(dataset_name):
|
|
169 |
def load_updated(dataset_name: str) -> int:
|
|
174 | 170 |
""" |
175 | 171 |
Loads value of (days from last update) from db |
176 | 172 |
|
... | ... | |
184 | 180 |
|
185 | 181 |
my_col = connection[MONGODB_DATASET_COLLECTION] |
186 | 182 |
|
187 |
data = my_col.find_one({'key-name': dataset_name},{'updated'}) |
|
183 |
data = my_col.find_one({'key-name': dataset_name}, {'updated'})
|
|
188 | 184 |
|
189 | 185 |
updated = int(data['updated']) |
190 | 186 |
|
191 | 187 |
return updated |
192 | 188 |
|
193 | 189 |
|
194 |
def update_updated(dataset_name,value):
|
|
190 |
def update_updated(dataset_name: str, value: int):
|
|
195 | 191 |
""" |
196 | 192 |
Updates value of (days from last update) in db |
197 | 193 |
|
... | ... | |
203 | 199 |
|
204 | 200 |
my_col = connection[MONGODB_DATASET_COLLECTION] |
205 | 201 |
|
206 |
myquery = { 'key-name': dataset_name }
|
|
207 |
new_values = { "$set": { "updated": value } }
|
|
202 |
myquery = {'key-name': dataset_name}
|
|
203 |
new_values = {"$set": {"updated": value}}
|
|
208 | 204 |
|
209 |
my_col.update_one(myquery,new_values) |
|
205 |
my_col.update_one(myquery, new_values) |
Také k dispozici: Unified diff
Re #8193 - refactoring crawler