Projekt

Obecné

Profil

Stáhnout (4.97 KB) Statistiky
| Větev: | Revize:
1
from Utilities.Database import database_loader
2

    
3
# Suffix appended to a dataset name to form the MongoDB collection that
# stores links already downloaded by the crawler.
4
MONGODB_DATASET_LINK_COLLECTION = "LINKS"
5
# Suffix appended to a dataset name to form the MongoDB collection that
# stores names of already processed files.
6
MONGODB_DATASET_PROCESSED_COLLECTION = "PROCESSED"
7
# Suffix appended to a dataset name to form the MongoDB collection that
# stores names of files already loaded into the database.
8
MONGODB_DATASET_LOADED_COLLECTION = "LOADED"
9
# MongoDB collection of available datasets, each with its number of days
# since the last update.
10
MONGODB_DATASET_COLLECTION = "DATASETS"
11

    
12

    
13
def load_ignore_set_links(dataset_name):
    """
    Load the links of files the crawler has already downloaded.

    Args:
        dataset_name: name of dataset that has existing configuration file

    Returns:
        set holding the 'name' value of every document found in the
        dataset's link collection
    """
    db = database_loader.create_database_connection()
    link_collection = db[dataset_name + MONGODB_DATASET_LINK_COLLECTION]

    # One entry per stored document; duplicates collapse in the set.
    return {document['name'] for document in link_collection.find()}
33

    
34

    
35
def update_ignore_set_links(dataset_name, link):
    """
    Add the link of a newly crawled file to the database.

    Args:
        dataset_name: name of dataset that has existing configuration file
        link: link to record as already downloaded; stored under the
            'name' key of a new document
    """
    connection = database_loader.create_database_connection()
    my_col = connection[dataset_name + MONGODB_DATASET_LINK_COLLECTION]

    # insert_one replaces Collection.insert, which is deprecated since
    # PyMongo 3.0 and removed in PyMongo 4.0.
    my_col.insert_one({"name": link})
48

    
49

    
50
def reset_ignore_set_links(dataset_name):
    """
    Drop the collection of already downloaded links for a dataset.

    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    collection_name = dataset_name + MONGODB_DATASET_LINK_COLLECTION
    database_loader.create_database_connection()[collection_name].drop()
63

    
64

    
65

    
66
def load_ignore_set_processed(dataset_name):
    """
    Load the names of files that have already been processed.

    Args:
        dataset_name: name of dataset that has existing configuration file

    Returns:
        set holding the 'name' value of every document found in the
        dataset's processed collection
    """
    db = database_loader.create_database_connection()
    processed_collection = db[dataset_name + MONGODB_DATASET_PROCESSED_COLLECTION]

    # One entry per stored document; duplicates collapse in the set.
    return {document['name'] for document in processed_collection.find()}
86

    
87

    
88
def update_ignore_set_processed(dataset_name, filename):
    """
    Add the name of a newly processed file to the database.

    Args:
        dataset_name: name of dataset that has existing configuration file
        filename: file name to record as already processed; stored under
            the 'name' key of a new document
    """
    connection = database_loader.create_database_connection()
    my_col = connection[dataset_name + MONGODB_DATASET_PROCESSED_COLLECTION]

    # insert_one replaces Collection.insert, which is deprecated since
    # PyMongo 3.0 and removed in PyMongo 4.0.
    my_col.insert_one({"name": filename})
101

    
102

    
103

    
104
def reset_ignore_set_processed(dataset_name):
    """
    Drop the collection of already processed files for a dataset.

    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    collection_name = dataset_name + MONGODB_DATASET_PROCESSED_COLLECTION
    database_loader.create_database_connection()[collection_name].drop()
117

    
118

    
119

    
120
def load_ignore_set_loaded(dataset_name):
    """
    Load the names of files that have already been loaded into the database.

    Args:
        dataset_name: name of dataset that has existing configuration file

    Returns:
        set holding the 'name' value of every document found in the
        dataset's loaded collection
    """
    db = database_loader.create_database_connection()
    loaded_collection = db[dataset_name + MONGODB_DATASET_LOADED_COLLECTION]

    # One entry per stored document; duplicates collapse in the set.
    return {document['name'] for document in loaded_collection.find()}
140

    
141

    
142

    
143
def update_ignore_set_loaded(dataset_name, filename):
    """
    Add the name of a newly loaded file to the database.

    Args:
        dataset_name: name of dataset that has existing configuration file
        filename: file name to record as already loaded; stored under the
            'name' key of a new document
    """
    connection = database_loader.create_database_connection()
    my_col = connection[dataset_name + MONGODB_DATASET_LOADED_COLLECTION]

    # insert_one replaces Collection.insert, which is deprecated since
    # PyMongo 3.0 and removed in PyMongo 4.0.
    my_col.insert_one({"name": filename})
156

    
157

    
158
def reset_ignore_set_loaded(dataset_name):
    """
    Drop the collection of already loaded files for a dataset.

    Args:
        dataset_name: name of dataset that has existing configuration file
    """
    collection_name = dataset_name + MONGODB_DATASET_LOADED_COLLECTION
    database_loader.create_database_connection()[collection_name].drop()
171

    
172

    
173
def load_updated(dataset_name):
    """
    Load the value of (days from last update) for a dataset from the db.

    Args:
        dataset_name: name of dataset that has existing configuration file;
            matched against the 'key-name' field of the datasets collection

    Returns:
        the stored 'updated' value converted to int

    Raises:
        TypeError: if no document matches dataset_name (find_one then
            yields None, which cannot be subscripted)
        KeyError: if the matching document has no 'updated' field
    """
    connection = database_loader.create_database_connection()
    my_col = connection[MONGODB_DATASET_COLLECTION]

    # Projection given as a dict (a documented form) rather than a set
    # literal; only the 'updated' field (plus _id) is fetched.
    data = my_col.find_one({'key-name': dataset_name}, {'updated': 1})

    return int(data['updated'])
192

    
193

    
194
def update_updated(dataset_name, value):
    """
    Update the value of (days from last update) for a dataset in the db.

    Args:
        dataset_name: name of dataset that has existing configuration file;
            matched against the 'key-name' field of the datasets collection
        value: new value written to the 'updated' field
    """
    db = database_loader.create_database_connection()
    datasets = db[MONGODB_DATASET_COLLECTION]

    # Only the first document matching the dataset's key-name is modified.
    datasets.update_one(
        {'key-name': dataset_name},
        {"$set": {"updated": value}},
    )
(3-3/3)