1
|
//
|
2
|
// Author: Eliska Mourycova
|
3
|
//
|
4
|
|
5
|
using System;
|
6
|
using System.Collections.Generic;
|
7
|
using System.IO;
|
8
|
using System.IO.Compression;
|
9
|
using System.Net;
|
10
|
|
11
|
namespace ServerApp.DataDownload
|
12
|
{
|
13
|
/// <summary>
/// All data types that can be requested (not all of them are used in this project).
/// The member names are kept in Czech because they are inserted verbatim into the names of
/// the downloaded archives and of the local sub-directories.
/// TBD: They might be translated to English later.
/// </summary>
public enum DataType
{
    /// <summary>Czech for "weather".</summary>
    POCASI,

    /// <summary>Energy-related data.</summary>
    ENERGO,

    /// <summary>Czech for "machines".</summary>
    STROJE,

    /// <summary>E-mail data.</summary>
    EMAIL,

    /// <summary>Czech for "room occupancy".</summary>
    OBSAZENI_MISTNOSTI,

    /// <summary>The "JIS" data set (the abbreviation is not expanded anywhere in this file).</summary>
    JIS,

    /// <summary>Czech for "scooters".</summary>
    KOLOBEZKY,

    /// <summary>Wi-Fi data.</summary>
    WIFI
}
|
22
|
|
23
|
/// <summary>
/// All data formats in which an archive can be requested.
/// The member name is inserted verbatim into archive and file names.
/// </summary>
public enum DataFormat
{
    /// <summary>XML files.</summary>
    XML,

    /// <summary>JSON files.</summary>
    JSON,

    /// <summary>CSV files.</summary>
    CSV
}
|
30
|
|
31
|
/// <summary>
|
32
|
/// Handles downloading of data archives from the configured website (e.g. http://openstore.zcu.cz/).
|
33
|
/// </summary>
|
34
|
public class DataDownloader
|
35
|
{
|
36
|
/// <summary>
/// Root directory under which all downloaded data is stored.
/// </summary>
public string RootDataDirectory { get; }

/// <summary>
/// Maps every <see cref="DataType"/> to the full (absolute) path of the directory
/// in which data of that type is stored.
/// </summary>
public Dictionary<DataType, string> DataSubDirectories { get; }

/// <summary>
/// When true, files that already exist are overwritten when they are downloaded again.
/// </summary>
public bool OverwriteExisting { get; set; }

// Base address of the site the archives are downloaded from.
private string site;

// Prefix used when building the download URL (set to "OD_ZCU_" by the constructor).
private string dataStr;

// Client that performs the actual archive downloads.
private WebClient webClient;

// Shorthand for Path.DirectorySeparatorChar.
private char sep = Path.DirectorySeparatorChar;

// Result of parsing the naming convention:
// literal pieces of the archive name, in order of appearance
private List<string> separatedFileName;
// names of the variables found between '{' and '}', in order of appearance
private List<string> variablesInsertions;
// sequence of 'n' (literal piece) and 'v' (variable) describing how the two lists interleave
private List<char> nameTurns;
|
67
|
|
68
|
/// <summary>
/// Creates a downloader: stores the configuration, parses the naming convention,
/// creates one sub-directory per <see cref="DataType"/> under the root directory and
/// prepares the <see cref="WebClient"/> used for downloads.
/// </summary>
/// <param name="rootDataDir">Root directory for downloaded data (e.g. "../../../data/auto")</param>
/// <param name="website">Base address of the download site (e.g. "http://openstore.zcu.cz/")</param>
/// <param name="namingConvention">Archive naming convention; variables are written in curly braces</param>
public DataDownloader(string rootDataDir, string website, string namingConvention)
{
    // TODO (carried over): make sure the naming convention specification is fully taken into account

    // plain configuration values:
    site = website;
    dataStr = "OD_ZCU_";
    RootDataDirectory = rootDataDir;
    OverwriteExisting = false;

    // split the naming convention into literal parts and variables:
    ParseNaming(namingConvention);

    // one directory per data type; needs RootDataDirectory and the dictionary to be set
    DataSubDirectories = new Dictionary<DataType, string>();
    CreateSubdirectories(); // todo should we do it like this?

    webClient = new WebClient();
}
|
86
|
|
87
|
/// <summary>
/// Downloads the weather prediction for Plzen from wttr.in as a JSON file and stores it
/// in the "data" directory (relative to the current working directory).
/// </summary>
/// <returns>Relative path to the downloaded file ("data/{year}{month}{day}.json")</returns>
public string DownloadWeatherPrediction()
{
    // TODO either set this path as attribute or if parameter JsonParser needs an attribute that would be set through constructor
    string predictionSite = "http://wttr.in/Plzen,czechia?format=j1";

    // NOTE(review): month and day are not zero-padded, so e.g. 11 January and 1 November
    // of the same year map to the same file name. Kept as is because other code may rely
    // on this naming - confirm before changing.
    DateTime now = DateTime.Now;
    string targetFile = $"data/{now.Year}{now.Month}{now.Day}.json";

    // DownloadFile does not create missing directories, so make sure the target exists:
    Directory.CreateDirectory("data");

    // a dedicated client is used for this request; dispose it once the download is done
    using (WebClient predictionClient = new WebClient())
    {
        predictionClient.DownloadFile(predictionSite, targetFile);
    }

    return targetFile;
}
|
102
|
|
103
|
/// <summary>
/// Makes sure a sub-directory exists under <see cref="RootDataDirectory"/> for every
/// <see cref="DataType"/> and records its absolute path in <see cref="DataSubDirectories"/>.
/// TBD if we want to do it this way
/// </summary>
private void CreateSubdirectories()
{
    foreach (DataType dataType in Enum.GetValues(typeof(DataType)))
    {
        string path = RootDataDirectory + sep + dataType;
        Directory.CreateDirectory(path);

        // keep an already registered record untouched:
        if (DataSubDirectories.ContainsKey(dataType))
        {
            continue;
        }

        DataSubDirectories.Add(dataType, Path.GetFullPath(path));
    }
}
|
119
|
|
120
|
/// <summary>
/// Splits the naming convention into literal parts and variable names (the text between
/// '{' and '}'). The result is stored in separatedFileName, variablesInsertions and
/// nameTurns and is later used to build the names of the archives to download.
/// </summary>
/// <param name="namingConvention">The configured naming convention</param>
private void ParseNaming(string namingConvention)
{
    separatedFileName = new List<string>();
    variablesInsertions = new List<string>();
    nameTurns = new List<char>();

    string literal = string.Empty;   // literal text collected so far
    string variable = string.Empty;  // variable name collected so far
    bool insideBraces = false;       // true while reading a variable name

    foreach (char ch in namingConvention)
    {
        switch (ch)
        {
            case '{':
                // a variable starts: close the literal part read so far
                AddToNameParts(literal);
                literal = string.Empty;
                insideBraces = true;
                break;

            case '}':
                // a variable ends: store its name
                AddToVariables(variable);
                variable = string.Empty;
                insideBraces = false;
                break;

            default:
                if (insideBraces)
                    variable += ch;
                else
                    literal += ch;
                break;
        }
    }

    // store whatever is left after the last brace:
    if (insideBraces)
        AddToVariables(variable);
    else
        AddToNameParts(literal);

    // NOTE(review): prints an empty line to the console; looks like leftover debug output - confirm before removing
    Console.WriteLine();
}
|
165
|
|
166
|
// Records a literal part of the file name and marks its position with 'n'.
// Empty strings are ignored.
private void AddToNameParts(string s)
{
    if (s.Length == 0)
    {
        return;
    }

    separatedFileName.Add(s);
    nameTurns.Add('n');
}
|
176
|
|
177
|
// Records the name of a variable of the file name and marks its position with 'v'.
// Empty strings are ignored.
private void AddToVariables(string s)
{
    if (s.Length == 0)
    {
        return;
    }

    variablesInsertions.Add(s);
    nameTurns.Add('v');
}
|
187
|
|
188
|
/// <summary>
/// Builds the name of the archive to download by walking through the parsed naming
/// convention: literal parts are copied, variables (type, month, year, format) are
/// replaced by the corresponding argument. Months below 10 are prefixed with "0".
/// </summary>
/// <param name="type">The type of data</param>
/// <param name="format">The data format</param>
/// <param name="year">The year</param>
/// <param name="month">The month</param>
/// <returns>The assembled archive name</returns>
/// <exception cref="Exception">The naming convention contains an unknown variable name</exception>
private string BuildDownloadedName(DataType type, DataFormat format, int year, int month)
{
    string result = string.Empty;
    int literalIndex = 0;
    int variableIndex = 0;

    foreach (char turn in nameTurns)
    {
        if (turn == 'n')
        {
            // next literal part
            result += separatedFileName[literalIndex];
            literalIndex++;
            continue;
        }

        if (turn != 'v')
        {
            continue;
        }

        // next variable: pick the value that belongs to its name
        string variableName = variablesInsertions[variableIndex];
        string value;
        switch (variableName)
        {
            case "type":
                value = type.ToString();
                break;
            case "month":
                value = (month < 10 ? "0" : "") + month;
                break;
            case "year":
                value = year.ToString();
                break;
            case "format":
                value = format.ToString();
                break;
            default:
                throw new Exception("Config file error - naming conventions can only contain variables with following names: type, month, year, format");
        }

        result += value;
        variableIndex++;
    }

    return result;
}
|
235
|
|
236
|
|
237
|
/// <summary>
/// Downloads one specific archive (one data type, format, year and month), extracts it into
/// the sub-directory of the data type as "{month}-{year}.{format}" and deletes the archive.
/// A failed download (<see cref="WebException"/>) is not reported; an empty list is returned.
/// </summary>
/// <param name="type">The type of data</param>
/// <param name="format">The format of the data</param>
/// <param name="year">The year</param>
/// <param name="month">The month</param>
/// <returns>
/// Full paths of the files the archive maps to (should be only one). The path is returned
/// even if the file already existed and was not overwritten.
/// </returns>
private List<string> DownloadData(DataType type, DataFormat format, int year, int month)
{
    // the list of all files potentially relevant to the caller
    List<string> extractedFiles = new List<string>();

    // Prepare the url string to be downloaded from:
    string monthStr = month < 10 ? "0" + month : "" + month;
    string monthYr = monthStr + "_" + year;

    string nameZip = BuildDownloadedName(type, format, year, month);
    string url = site + "/" + dataStr + monthYr + "/" + nameZip;

    // every entry of the archive is stored under this single name:
    string nameFolder = RootDataDirectory + sep + type + sep;
    string extractedFile = nameFolder + $"{month}-{year}.{format}";

    try
    {
        // Download the zip file (into the current working directory):
        webClient.DownloadFile(url, nameZip);

        try
        {
            // 'using' releases the archive even if extraction throws
            using (ZipArchive zipArchive = ZipFile.OpenRead(nameZip))
            {
                foreach (ZipArchiveEntry entry in zipArchive.Entries)
                {
                    // directory entries have an empty Name and carry no data
                    if (string.IsNullOrEmpty(entry.Name))
                        continue;

                    // report the target once, even if several entries map to it
                    string fullPath = Path.GetFullPath(extractedFile);
                    if (!extractedFiles.Contains(fullPath))
                        extractedFiles.Add(fullPath);

                    // extract unless the file exists and must not be overwritten
                    if (OverwriteExisting || !File.Exists(extractedFile))
                        entry.ExtractToFile(extractedFile, OverwriteExisting);
                }
            }
        }
        finally
        {
            // the downloaded zip file is only temporary; remove it even when extraction failed
            File.Delete(nameZip);
        }
    }
    catch (WebException)
    {
        // The download fails if the specified url is invalid, e.g. when no archive exists
        // for the requested month. Callers probe for archives that may not exist, so this
        // is deliberately not treated as an error.
    }

    return extractedFiles;
}
|
324
|
|
325
|
|
326
|
|
327
|
/// <summary>
/// Downloads selected type and time span of data in the desired format, returns a list of full paths to all successfully saved files.
/// If some of the files already existed and were not overwritten, then the returned List contains paths to these files also.
/// Besides the archive of every month in the range, the archive with month 0 is requested
/// once for every year the range touches.
/// </summary>
/// <param name="type">The requested data type</param>
/// <param name="format">The data format</param>
/// <param name="startDate">The start date</param>
/// <param name="endDate">The end date</param>
/// <returns>Full paths of the files belonging to the requested range</returns>
/// <exception cref="ArgumentException">startDate is after endDate</exception>
public List<string> DownloadData(DataType type, DataFormat format, Date startDate, Date endDate)
{
    if (startDate > endDate)
        throw new ArgumentException("startDate must be the same as or before the endDate.");

    List<string> savedFiles = new List<string>();

    // year for which the month-0 archive has been requested last; null = none so far
    int? yearOfLastYearlyArchive = null;

    Date currentDate = startDate;
    do
    {
        Console.WriteLine("current date: " + currentDate);

        int year = (int)currentDate.Year;
        savedFiles.AddRange(DownloadData(type, format, year, (int)currentDate.Month));

        // also try to find the 00 file, exactly once for each year of the range:
        if (yearOfLastYearlyArchive != year)
        {
            savedFiles.AddRange(DownloadData(type, format, year, 0));
            yearOfLastYearlyArchive = year;
        }

        // assumes IncreaseMonthByOne returns a new Date and leaves the receiver unchanged
        currentDate = currentDate.IncreaseMonthByOne();

    } while (currentDate <= endDate);

    return savedFiles;
}
|
387
|
|
388
|
/// <summary>
/// Not implemented yet.
/// </summary>
/// <returns>Nothing at the moment, the method always throws</returns>
/// <exception cref="NotImplementedException">Always</exception>
public bool CheckForNewData()
{
    // placeholder until the check is implemented
    throw new System.NotImplementedException();
}
|
392
|
|
393
|
|
394
|
/// <summary>
/// Retrieves all data files with dates falling within the specified range. If not all data for the specified range is found
/// then returns also file/s with month 0 if exists.
/// File names are expected to look like "{month}-{year}.{extension}"; files with other
/// names are skipped. If either date is null, all files of the directory are returned.
/// </summary>
/// <param name="subDirectory">The subdirectory to search</param>
/// <param name="startDate">The start date</param>
/// <param name="endDate">The end date</param>
/// <returns>Paths of the files relevant for the requested range</returns>
public List<string> GetData(string subDirectory, Date startDate, Date endDate)
{
    if (startDate == null || endDate == null)
        return GetData(subDirectory);

    string[] files = Directory.GetFiles(subDirectory);
    List<KeyValuePair<int, string>> found00Files = new List<KeyValuePair<int, string>>(); // year -> path
    List<string> relevantFiles = new List<string>();
    List<Date> requestedDates = new List<Date>();

    // prepare a list of requested dates:
    Date currentDate = startDate;
    do
    {
        requestedDates.Add(currentDate);
        currentDate = currentDate.IncreaseMonthByOne();

    } while (currentDate <= endDate);

    foreach (string file in files)
    {
        string currFileName = Path.GetFileName(file);
        Console.WriteLine("curr file: " + currFileName);

        int month;
        int year;
        if (!TryParseMonthAndYear(currFileName, out month, out year))
            continue; // not a data file named by the downloader

        if (month == 0)
        {
            // remember the year so that the name does not have to be parsed again
            found00Files.Add(new KeyValuePair<int, string>(year, file));
            continue;
        }

        Date d = new Date((uint)month, (uint)year);

        if (d >= startDate && d <= endDate)
        {
            // we want to add this
            relevantFiles.Add(file);
            // NOTE(review): assumes Date implements value equality, otherwise nothing is removed - confirm
            requestedDates.Remove(d);
        }
    }

    // dates not found in the directory now remain in requested dates;
    // a 00 file is relevant if at least one month of its year is still missing
    foreach (KeyValuePair<int, string> file00 in found00Files)
    {
        int year = file00.Key;
        if (requestedDates.Exists(d => d.Year == year))
            relevantFiles.Add(file00.Value);
    }

    return relevantFiles;
}

// Reads month and year from a file name of the form "{month}-{year}.{extension}".
// Returns false if the name does not have this form.
private static bool TryParseMonthAndYear(string fileName, out int month, out int year)
{
    month = 0;
    year = 0;

    string[] splits = fileName.Split(new char[] { '-', '.' });
    if (splits.Length < 2)
        return false;

    return int.TryParse(splits[0], out month) && int.TryParse(splits[1], out year);
}
|
476
|
|
477
|
/// <summary>
/// Lists the paths of all files located directly in the given directory.
/// </summary>
/// <param name="subDirectory">The directory to list</param>
/// <returns>A new list with the paths reported by Directory.GetFiles</returns>
private List<string> GetData(string subDirectory)
{
    string[] found = Directory.GetFiles(subDirectory);
    return new List<string>(found);
}
|
492
|
|
493
|
|
494
|
#region UNUSED
|
495
|
//public string GetDirectoryListingRegexForUrl(string url)
|
496
|
//{
|
497
|
// if (url.Equals(site))
|
498
|
// {
|
499
|
// //return "\\\"([^\"]*)\\\"";
|
500
|
// return @"\bOD_ZCU_\w*\b";
|
501
|
// //return @"\A[OD_ZCU_]";
|
502
|
// }
|
503
|
// else return null;
|
504
|
//}
|
505
|
//public void ListDirectory()
|
506
|
//{
|
507
|
// string url = site;
|
508
|
// HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
|
509
|
// using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
|
510
|
// {
|
511
|
// using (StreamReader reader = new StreamReader(response.GetResponseStream()))
|
512
|
// {
|
513
|
// string html = reader.ReadToEnd();
|
514
|
|
515
|
|
516
|
|
517
|
// Regex regex = new Regex(GetDirectoryListingRegexForUrl(url));
|
518
|
// MatchCollection matches = regex.Matches(html);
|
519
|
// Console.WriteLine(matches.Count);
|
520
|
|
521
|
// if (matches.Count > 0)
|
522
|
// {
|
523
|
// foreach (Match match in matches)
|
524
|
// {
|
525
|
// //if (match.Success)
|
526
|
// //{
|
527
|
// Console.WriteLine(match.ToString());
|
528
|
// //}
|
529
|
// }
|
530
|
// }
|
531
|
// }
|
532
|
// Console.ReadLine();
|
533
|
// }
|
534
|
//}
|
535
|
|
536
|
#endregion
|
537
|
}
|
538
|
}
|