//
// Author: Eliska Mourycova
//

using System;
using System.Collections.Generic;
using System.IO;
using System.IO.Compression;
using System.Net;
using System.Text;
10
11 7a998d66 Eliška Mourycová
namespace ServerApp.DataDownload
12 d2d1c86a Eliška Mourycová
{
13 34bf7aa2 Eliška Mourycová
	/// <summary>
	/// Lists all of the available data types (not all of them are used in this project).
	/// The member names are in Czech because they are inserted directly into
	/// directory and file names.
	/// TBD: They might be translated to English later.
	/// </summary>
	public enum DataType
	{
		POCASI,
		ENERGO,
		STROJE,
		EMAIL,
		OBSAZENI_MISTNOSTI,
		JIS,
		KOLOBEZKY,
		WIFI
	}
22 34bf7aa2 Eliška Mourycová
23
	/// <summary>
	/// Lists all available data formats.
	/// </summary>
	public enum DataFormat
	{
		XML,
		JSON,
		CSV
	}
30
31 34bf7aa2 Eliška Mourycová
	/// <summary>
32
	/// This class takes care of downloading of data. Download happens from http://openstore.zcu.cz/.
33
	/// </summary>
34 d2d1c86a Eliška Mourycová
	public class DataDownloader
35
	{
36 34bf7aa2 Eliška Mourycová
		/// <summary>
		/// The root directory containing all downloaded data
		/// </summary>
		public string RootDataDirectory { get; }

		/// <summary>
		/// For a DataType key returns the full (absolute) path to the directory where this type of data is stored
		/// </summary>
		public Dictionary<DataType, string> DataSubDirectories { get; }

		/// <summary>
		/// Flag stating whether files which already exist should be overwritten when downloaded again
		/// </summary>
		public bool OverwriteExisting { get; set; }

		// the main site where the data can be downloaded from
		private readonly string site;

		// the prefix placed in front of the "month_year" part of the remote directory name in the download URL
		private readonly string dataStr;

		// WebClient instance used for the actual download
		private readonly WebClient webClient;

		// a shortcut to writing Path.DirectorySeparatorChar
		private readonly char sep = Path.DirectorySeparatorChar;

		// lists filled while parsing the naming convention (they are (re)created by ParseNaming, so they cannot be readonly):
		// the literal parts of the file name, in the order they were read
		private List<string> separatedFileName;
		// the names of the variables (the text between '{' and '}'), in the order they were read
		private List<string> variablesInsertions;
		// the order in which the two lists above are consumed: 'n' = next literal part, 'v' = next variable
		private List<char> nameTurns;
67
68
		/// <summary>
		/// Creates a downloader: stores the configuration, parses the naming convention and creates
		/// one subdirectory per <see cref="DataType"/> under the root data directory.
		/// </summary>
		/// <param name="rootDataDir">The root directory for downloaded data, e.g. $"..{sep}..{sep}..{sep}data{sep}auto"</param>
		/// <param name="website">The site the data is downloaded from, e.g. "http://openstore.zcu.cz/"</param>
		/// <param name="namingConvention">The naming convention of the downloaded archives; variables are written in curly braces</param>
		/// <exception cref="ArgumentNullException">Thrown when rootDataDir or namingConvention is null</exception>
		public DataDownloader(string rootDataDir, string website, string namingConvention)
		{
			// fail early with a descriptive exception instead of a NullReferenceException inside ParseNaming
			// or directories silently created at the file system root:
			if (rootDataDir == null)
				throw new ArgumentNullException(nameof(rootDataDir));
			if (namingConvention == null)
				throw new ArgumentNullException(nameof(namingConvention));

			// initialize all needed variables:
			site = website;

			ParseNaming(namingConvention);
			dataStr = "OD_ZCU_";

			RootDataDirectory = rootDataDir;
			OverwriteExisting = false;

			// must be created before CreateSubdirectories, which fills it:
			DataSubDirectories = new Dictionary<DataType, string>();
			CreateSubdirectories(); // todo should we do it like this?

			webClient = new WebClient();
		}
86
87 55d35561 Eliška Mourycová
		/// <summary>
		/// Makes sure a subdirectory of the root data directory exists for every data type
		/// and records its absolute path in DataSubDirectories.
		/// TBD if we want to do it this way
		/// </summary>
		private void CreateSubdirectories()
		{
			foreach (DataType dataType in (DataType[])Enum.GetValues(typeof(DataType)))
			{
				// the subdirectory is named after the enum member:
				string directoryPath = $"{RootDataDirectory}{sep}{dataType}";
				Directory.CreateDirectory(directoryPath);

				// an already recorded path is kept as it is:
				if (DataSubDirectories.ContainsKey(dataType))
					continue;

				DataSubDirectories.Add(dataType, Path.GetFullPath(directoryPath));
			}
		}
103
104
		/// <summary>
		/// Parses the naming convention to be later used for determining URLs for data download.
		/// Text between '{' and '}' is stored as a variable name, all other text as a literal part;
		/// the order of both kinds of parts is recorded in nameTurns. Empty parts are skipped.
		/// </summary>
		/// <param name="namingConvention">The configured naming convention</param>
		/// <exception cref="ArgumentNullException">Thrown when namingConvention is null</exception>
		private void ParseNaming(string namingConvention)
		{
			if (namingConvention == null)
				throw new ArgumentNullException(nameof(namingConvention));

			separatedFileName = new List<string>();
			variablesInsertions = new List<string>();
			nameTurns = new List<char>();

			string currPart = "";
			string currVar = "";
			bool readingNormal = true;
			foreach (char c in namingConvention)
			{
				if (c == '{')
				{
					// a variable starts here - close the literal part read so far:
					AddToNameParts(currPart);
					readingNormal = false;
					currPart = "";
				}
				else if (c == '}')
				{
					// the variable ends here - store its name:
					AddToVariables(currVar);
					readingNormal = true;
					currVar = "";
				}
				else
				{
					// normal char, append it to whatever is being read at the moment:
					if (readingNormal)
						currPart += c;
					else
						currVar += c;
				}
			}

			// add the rest if there is any (an unterminated '{' is treated as a variable):
			if (readingNormal)
				AddToNameParts(currPart);
			else
				AddToVariables(currVar);
		}
149
150 55d35561 Eliška Mourycová
		// Stores a literal part of the file name and notes that a literal ('n') comes next; empty strings are ignored
		private void AddToNameParts(string s)
		{
			if (s.Length == 0)
				return;

			separatedFileName.Add(s);
			nameTurns.Add('n');
		}
160
161 55d35561 Eliška Mourycová
		// Stores a variable name and notes that a variable ('v') comes next; empty strings are ignored
		private void AddToVariables(string s)
		{
			if (s.Length == 0)
				return;

			variablesInsertions.Add(s);
			nameTurns.Add('v');
		}
171
172 55d35561 Eliška Mourycová
		/// <summary>
		/// Builds the name of the downloaded file. Takes naming convention into account:
		/// literal parts are copied as they are, variables are replaced by the given values.
		/// </summary>
		/// <param name="type">The type of data</param>
		/// <param name="format">The data format</param>
		/// <param name="year">The year</param>
		/// <param name="month">The month (values below 10 are prefixed with "0")</param>
		/// <returns>The file name built according to the parsed naming convention</returns>
		/// <exception cref="FormatException">Thrown when the naming convention contains an unknown variable name</exception>
		private string BuildDownloadedName(DataType type, DataFormat format, int year, int month)
		{
			StringBuilder nameZip = new StringBuilder();

			// indices of the next unused literal part / variable:
			int partInd = 0;
			int varInd = 0;
			foreach (char turn in nameTurns)
			{
				if (turn == 'n')
				{
					nameZip.Append(separatedFileName[partInd]);
					partInd++;
				}
				else if (turn == 'v')
				{
					switch (variablesInsertions[varInd])
					{
						case "type":
							nameZip.Append(type.ToString());
							break;
						case "month":
							nameZip.Append(month < 10 ? "0" + month : "" + month);
							break;
						case "year":
							nameZip.Append("" + year);
							break;
						case "format":
							nameZip.Append(format.ToString());
							break;
						default:
							throw new FormatException("Config file error - naming conventions can only contain variables with following names: type, month, year, format");
					}
					varInd++;
				}
			}

			return nameZip.ToString();
		}
219
220 085453be Eliška Mourycová
221 34bf7aa2 Eliška Mourycová
		/// <summary>
		/// Downloads a specific archive and extracts it into the subdirectory of the given data type.
		/// Every entry of the archive is saved under the name "{month}-{year}.{format}".
		/// A failed download (WebException) is not reported; an empty list is returned in that case.
		/// </summary>
		/// <param name="type">The type of data</param>
		/// <param name="format">The format of the data</param>
		/// <param name="year">The year</param>
		/// <param name="month">The month</param>
		/// <returns>
		/// A list of full paths of the extracted files (should be only one). A file which already existed
		/// and was not overwritten is listed as well.
		/// </returns>
		private List<string> DownloadData(DataType type, DataFormat format, int year, int month)
		{
			// the list of all files potentially relevant to the caller
			List<string> extractedFiles = new List<string>();

			// Prepare the url string to be downloaded from:
			string monthStr = month < 10 ? "0" + month : "" + month;
			string monthYr = monthStr + "_" + year;

			// nameZip is also used as the local path of the temporarily stored archive
			string nameZip = BuildDownloadedName(type, format, year, month);
			string url = site + "/" + dataStr + monthYr + "/" + nameZip;

			string nameFolder = RootDataDirectory + sep + type + sep;

			try
			{
				// Download the zip file:
				webClient.DownloadFile(url, nameZip);

				// the using statement releases the archive (and its file handle) even if extraction throws:
				using (ZipArchive zipArchive = ZipFile.OpenRead(nameZip))
				{
					foreach (ZipArchiveEntry entry in zipArchive.Entries)
					{
						// get the relative path to the file:
						string newFileName = $"{month}-{year}.{format}";
						string extractedFile = nameFolder + newFileName;

						// add full path to the list:
						extractedFiles.Add(Path.GetFullPath(extractedFile));

						// save the file unless it already exists and must not be overwritten:
						if (OverwriteExisting || !File.Exists(extractedFile))
							entry.ExtractToFile(extractedFile, OverwriteExisting);
					}
				}
			}
			catch (WebException)
			{
				// download fails, if the specified url is invalid;
				// this is expected for months without data, so it is deliberately ignored
			}
			finally
			{
				// delete the downloaded zip file, whether or not its content was extracted successfully:
				if (File.Exists(nameZip))
					File.Delete(nameZip);
			}

			return extractedFiles;
		}
308
309
310
311
		/// <summary>
		/// Downloads selected type and time span of data in the desired format, returns a list of full paths to all successfully saved files.
		/// If some of the files already existed and were not overwritten, then the returned List contains paths to these files also.
		/// Besides the monthly archives, the archive with month 0 is requested once for every year of the time span.
		/// </summary>
		/// <param name="type">The requested data type</param>
		/// <param name="format">The data format</param>
		/// <param name="startDate">The start date</param>
		/// <param name="endDate">The end date</param>
		/// <returns>Full paths to the files obtained for the requested time span</returns>
		/// <exception cref="ArgumentException">Thrown when startDate is after endDate</exception>
		public List<string> DownloadData(DataType type, DataFormat format, Date startDate, Date endDate)
		{
			if (startDate > endDate)
				throw new ArgumentException("startDate must be the same as or before the endDate.");

			// initialize:
			List<string> savedFiles = new List<string>();

			Date currentDate = startDate;

			// the year whose 00 file has been requested most recently (valid only if yearlyFileRequested is true)
			bool yearlyFileRequested = false;
			int yearlyFileYear = 0;
			do
			{
				Console.WriteLine("current date: " + currentDate);
				int currentYear = (int)currentDate.Year;
				savedFiles.AddRange(DownloadData(type, format, currentYear, (int)currentDate.Month));

				// also try to find the 00 file for each year - exactly once, when the year is seen for the first time:
				if (!yearlyFileRequested || yearlyFileYear != currentYear)
				{
					savedFiles.AddRange(DownloadData(type, format, currentYear, 0));
					yearlyFileRequested = true;
					yearlyFileYear = currentYear;
				}

				// move on to the next month:
				currentDate = currentDate.IncreaseMonthByOne();

			} while (currentDate <= endDate);

			return savedFiles;
		}
371
372 7a998d66 Eliška Mourycová
		/// <summary>
		/// Not implemented yet.
		/// </summary>
		/// <exception cref="NotImplementedException">Always thrown</exception>
		public bool CheckForNewData() => throw new NotImplementedException();
376
377 085453be Eliška Mourycová
378 55d35561 Eliška Mourycová
		/// <summary>
		/// Retrieves all data files with dates falling within the specified range. If not all data for the specified range is found
		/// then returns also file/s with month 0 if exists.
		/// If startDate or endDate is null, all files of the directory are returned.
		/// Files whose names do not start with "month-year" (two integers separated by '-' and followed by '.') are skipped.
		/// </summary>
		/// <param name="subDirectory">The subdirectory to search</param>
		/// <param name="startDate">The start date</param>
		/// <param name="endDate">The end date</param>
		/// <returns>Paths to the relevant files as returned by Directory.GetFiles</returns>
		public List<string> GetData(string subDirectory, Date startDate, Date endDate)
		{
			if (startDate == null || endDate == null)
				return GetData(subDirectory);

			string[] files = Directory.GetFiles(subDirectory);
			// the files with month 0 and, at the same index, the years they belong to:
			List<string> found00Files = new List<string>();
			List<int> found00Years = new List<int>();
			List<string> relevantFiles = new List<string>();
			List<Date> requestedDates = new List<Date>();

			// prepare a list of requested dates:
			Date currentDate = startDate;
			do
			{
				requestedDates.Add(currentDate);
				currentDate = currentDate.IncreaseMonthByOne();

			} while (currentDate <= endDate);

			foreach (string file in files)
			{
				string currFileName = Path.GetFileName(file);
				Console.WriteLine("curr file: " + currFileName);

				int month;
				int year;
				if (!TryParseMonthAndYear(currFileName, out month, out year))
					continue; // not a data file saved by this class

				if (month == 0)
				{
					found00Files.Add(file);
					found00Years.Add(year);
					continue;
				}

				Date d = new Date((uint)month, (uint)year);

				if (d >= startDate && d <= endDate)
				{
					// we want to add this
					relevantFiles.Add(file);
					requestedDates.Remove(d);
				}
			}

			// dates not found in the directory now remain in requested dates;
			// a 00 file is added only if some date of its year is still missing:
			for (int i = 0; i < found00Files.Count; i++)
			{
				int year00 = found00Years[i];
				if (requestedDates.Exists(d => d.Year == year00))
					relevantFiles.Add(found00Files[i]);
			}

			return relevantFiles;
		}

		// Reads the month and the year from a file name of the form "month-year.extension"; returns false if the name does not match
		private static bool TryParseMonthAndYear(string fileName, out int month, out int year)
		{
			month = 0;
			year = 0;

			string[] splits = fileName.Split(new char[] { '-', '.' });
			return splits.Length >= 2
				&& int.TryParse(splits[0], out month)
				&& int.TryParse(splits[1], out year);
		}
460
461 4129ce12 Eliška Mourycová
		/// <summary>
		/// Returns the paths of all files located directly in the given directory
		/// </summary>
		/// <param name="subDirectory">The directory to list</param>
		/// <returns>A list of the paths returned by Directory.GetFiles</returns>
		private List<string> GetData(string subDirectory)
		{
			// the list is filled directly from the array of found paths:
			return new List<string>(Directory.GetFiles(subDirectory));
		}
476
477 d2d1c86a Eliška Mourycová
478
		#region UNUSED
479
		//public string GetDirectoryListingRegexForUrl(string url)
480
		//{
481
		//	if (url.Equals(site))
482
		//	{
483
		//		//return "\\\"([^\"]*)\\\"";
484
		//		return @"\bOD_ZCU_\w*\b";
485
		//		//return @"\A[OD_ZCU_]";
486
		//	}
487
		//	else return null;
488
		//}
489
		//public void ListDirectory()
490
		//{
491
		//	string url = site;
492
		//	HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
493
		//	using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
494
		//	{
495
		//		using (StreamReader reader = new StreamReader(response.GetResponseStream()))
496
		//		{
497
		//			string html = reader.ReadToEnd();
498
499
500
501
		//			Regex regex = new Regex(GetDirectoryListingRegexForUrl(url));
502
		//			MatchCollection matches = regex.Matches(html);
503
		//			Console.WriteLine(matches.Count);
504
505
		//			if (matches.Count > 0)
506
		//			{
507
		//				foreach (Match match in matches)
508
		//				{
509
		//					//if (match.Success)
510
		//					//{
511
		//						Console.WriteLine(match.ToString());
512
		//					//}
513
		//				}
514
		//			}
515
		//		}
516
		//		Console.ReadLine();
517
		//	}
518
		//}
519
520
		#endregion
521
	}
522
}