Projekt

Obecné

Profil

Stáhnout (4.97 KB) Statistiky
| Větev: | Revize:
1 d6ca840d petrh
from Utilities.Database import database_loader
2
3
# mongodb collection with already downloaded links
4
MONGODB_DATASET_LINK_COLLECTION = "LINKS"
5
# mongodb collection with already processed files
6
MONGODB_DATASET_PROCESSED_COLLECTION = "PROCESSED"
7
# mongodb collection with already loaded links
8
MONGODB_DATASET_LOADED_COLLECTION = "LOADED"
9
# mongodb collection with available datasets and the number of days since their last update
10
MONGODB_DATASET_COLLECTION = "DATASETS"
11
12
13
def load_ignore_set_links(dataset_name):
    """
    Load from the database the links of files already downloaded by the crawler

    Args:
        dataset_name: name of dataset that has existing configuration file

    Returns:
        set with the 'name' value of every document stored in the
        dataset's link collection
    """
    db = database_loader.create_database_connection()

    # Each dataset has its own collection: dataset name + shared suffix
    link_collection = db[dataset_name + MONGODB_DATASET_LINK_COLLECTION]

    return {document['name'] for document in link_collection.find()}
33
34
35
def update_ignore_set_links(dataset_name,link):
    """
    Adds link of a newly crawled file to the database

    Args:
        dataset_name: name of dataset that has existing configuration file
        link: link to store; saved under the 'name' key of a new document
    """
    connection = database_loader.create_database_connection()

    my_col = connection[dataset_name + MONGODB_DATASET_LINK_COLLECTION]

    # insert_one replaces Collection.insert, which was deprecated in
    # PyMongo 3.0 and removed in PyMongo 4.0
    my_col.insert_one({ "name": link})
48
49
50
def reset_ignore_set_links(dataset_name):
    """
    Drop the collection holding the dataset's already downloaded links

    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    db = database_loader.create_database_connection()

    db[dataset_name + MONGODB_DATASET_LINK_COLLECTION].drop()
63
64
65
66
def load_ignore_set_processed(dataset_name):
    """
    Load from the database the set of already processed files

    Args:
        dataset_name: name of dataset that has existing configuration file

    Returns:
        set with the 'name' value of every document stored in the
        dataset's processed collection
    """
    db = database_loader.create_database_connection()

    # Each dataset has its own collection: dataset name + shared suffix
    processed_collection = db[dataset_name + MONGODB_DATASET_PROCESSED_COLLECTION]

    return {document['name'] for document in processed_collection.find()}
86
87
88
def update_ignore_set_processed(dataset_name,filename):
    """
    Adds name of a newly processed file to the database

    Args:
        dataset_name: name of dataset that has existing configuration file
        filename: file name to store; saved under the 'name' key of a
            new document
    """
    connection = database_loader.create_database_connection()

    my_col = connection[dataset_name + MONGODB_DATASET_PROCESSED_COLLECTION]

    # insert_one replaces Collection.insert, which was deprecated in
    # PyMongo 3.0 and removed in PyMongo 4.0
    my_col.insert_one({ "name": filename})
101
102
103
104
def reset_ignore_set_processed(dataset_name):
    """
    Drop the collection holding the dataset's already processed files

    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    db = database_loader.create_database_connection()

    db[dataset_name + MONGODB_DATASET_PROCESSED_COLLECTION].drop()
117
118
119
120
def load_ignore_set_loaded(dataset_name):
    """
    Load from the database the set of files already loaded into the database

    Args:
        dataset_name: name of dataset that has existing configuration file

    Returns:
        set with the 'name' value of every document stored in the
        dataset's loaded collection
    """
    db = database_loader.create_database_connection()

    # Each dataset has its own collection: dataset name + shared suffix
    loaded_collection = db[dataset_name + MONGODB_DATASET_LOADED_COLLECTION]

    return {document['name'] for document in loaded_collection.find()}
140
141
142
143
def update_ignore_set_loaded(dataset_name,filename):
    """
    Adds name of a newly loaded file to the database

    Args:
        dataset_name: name of dataset that has existing configuration file
        filename: file name to store; saved under the 'name' key of a
            new document
    """
    connection = database_loader.create_database_connection()

    my_col = connection[dataset_name + MONGODB_DATASET_LOADED_COLLECTION]

    # insert_one replaces Collection.insert, which was deprecated in
    # PyMongo 3.0 and removed in PyMongo 4.0
    my_col.insert_one({ "name": filename})
156
157
158
def reset_ignore_set_loaded(dataset_name):
    """
    Drop the collection holding the dataset's already loaded files

    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    db = database_loader.create_database_connection()

    db[dataset_name + MONGODB_DATASET_LOADED_COLLECTION].drop()
171
172
173
def load_updated(dataset_name):
    """
    Loads value of (days from last update) from db

    Args:
        dataset_name: name of dataset that has existing configuration file;
            matched against the 'key-name' field of the datasets collection

    Returns:
        the stored 'updated' value converted to int

    Raises:
        TypeError: presumably when no document matches dataset_name, since
            the lookup result is subscripted without a None check
            (NOTE(review): confirm callers never pass an unknown dataset)
    """
    connection = database_loader.create_database_connection()

    my_col = connection[MONGODB_DATASET_COLLECTION]

    # Projection written as a field -> 1 mapping (the documented form)
    # so only the 'updated' field (plus _id) is fetched
    data = my_col.find_one({'key-name': dataset_name}, {'updated': 1})

    return int(data['updated'])
192
193
194
def update_updated(dataset_name,value):
    """
    Store a new (days from last update) value for a dataset in the db

    Args:
        dataset_name: name of dataset that has existing configuration file;
            matched against the 'key-name' field of the datasets collection
        value: new value written to the 'updated' field
    """
    db = database_loader.create_database_connection()

    datasets = db[MONGODB_DATASET_COLLECTION]

    datasets.update_one(
        { 'key-name': dataset_name },
        { "$set": { "updated": value } },
    )