Projekt

Obecné

Profil

Stáhnout (14.7 KB) Statistiky
| Větev: | Tag: | Revize:
1 34bf7aa2 Eliška Mourycová
//
2
// Author: Eliska Mourycova
3
//
4
5
using System;
6 d2d1c86a Eliška Mourycová
using System.Collections.Generic;
7
using System.IO;
8
using System.IO.Compression;
9
using System.Net;
10
11 7a998d66 Eliška Mourycová
namespace ServerApp.DataDownload
12 d2d1c86a Eliška Mourycová
{
13 34bf7aa2 Eliška Mourycová
	/// <summary>
	/// All data types that are available (not every one of them is used in this project).
	/// The member names are Czech because they are inserted verbatim into file and directory names.
	/// TBD: They might be translated to English later.
	/// </summary>
	public enum DataType
	{
		POCASI,
		ENERGO,
		STROJE,
		EMAIL,
		OBSAZENI_MISTNOSTI,
		JIS,
		KOLOBEZKY,
		WIFI
	}
22 34bf7aa2 Eliška Mourycová
23
	/// <summary>
	/// All data formats the archives can be requested in.
	/// </summary>
	public enum DataFormat
	{
		XML,
		JSON,
		CSV
	}
30
31 34bf7aa2 Eliška Mourycová
	/// <summary>
32
	/// This class takes care of downloading data. Archives are downloaded from the website passed to the constructor (e.g. http://openstore.zcu.cz/).
33
	/// </summary>
34 d2d1c86a Eliška Mourycová
	public class DataDownloader
35
	{
36 34bf7aa2 Eliška Mourycová
		/// <summary>
		/// The root directory containing all downloaded data
		/// </summary>
		public string RootDataDirectory { get; }

		/// <summary>
		/// For a DataType key returns the full (absolute) path to the directory where this type of data is stored
		/// </summary>
		public Dictionary<DataType, string> DataSubDirectories { get; }

		/// <summary>
		/// Flag stating whether files which already exist should be overwritten when downloaded again
		/// </summary>
		public bool OverwriteExisting { get; set; }

		// the main site where the data can be downloaded from
		private readonly string site;

		// prefix of the remote directory name the archives are stored in
		private readonly string dataStr;

		// WebClient instance used for the actual download
		private readonly WebClient webClient;

		// a shortcut to writing Path.DirectorySeparatorChar
		private readonly char sep = Path.DirectorySeparatorChar;

		// lists used for parsing the file names; they are (re)created by ParseNaming, so they cannot be readonly:
		// literal parts of the naming convention, in order of appearance
		private List<string> separatedFileName;
		// names of the variables written in curly braces, in order of appearance
		private List<string> variablesInsertions;
		// order in which the two lists above are read: 'n' = literal part, 'v' = variable
		private List<char> nameTurns;
67
68
		/// <summary>
		/// Creates the downloader: parses the naming convention, stores the root data directory
		/// and creates one subdirectory per <see cref="DataType"/> inside it.
		/// </summary>
		/// <param name="rootDataDir">The root directory all downloaded data is stored in</param>
		/// <param name="website">The site the archives are downloaded from</param>
		/// <param name="namingConvention">The configured naming convention of the archives; variables are written in curly braces</param>
		/// <exception cref="ArgumentNullException">If rootDataDir or namingConvention is null</exception>
		public DataDownloader(string rootDataDir, string website, string namingConvention)
		{
			// a null root would silently turn into directories created at the file system root
			if (rootDataDir == null)
				throw new ArgumentNullException(nameof(rootDataDir));
			if (namingConvention == null)
				throw new ArgumentNullException(nameof(namingConvention));

			// initialize all needed variables:
			site = website;

			ParseNaming(namingConvention);
			dataStr = "OD_ZCU_";

			RootDataDirectory = rootDataDir;
			OverwriteExisting = false;

			// the dictionary has to exist before CreateSubdirectories fills it
			DataSubDirectories = new Dictionary<DataType, string>();
			CreateSubdirectories(); // todo should we do it like this?

			webClient = new WebClient();
		}
86
87 4b847de5 A-Konig
		/// <summary>
		/// Downloads the weather prediction as a json file into the "data" directory
		/// (relative to the current working directory) and returns the path to that file.
		/// </summary>
		/// <returns> Path to file </returns>
		public string DownloadWeatherPrediction()
		{
			// TODO either set this path as attribute or if parameter JsonParser needs an attribute that would be set through constructor
			string predictionSite = "http://wttr.in/Plzen,czechia?format=j1";

			// build the target path once so the downloaded and the returned path cannot diverge
			DateTime now = DateTime.Now;
			string predictionFile = $"data/{now.Year}{now.Month}{now.Day}.json";

			// DownloadFile does not create missing directories
			Directory.CreateDirectory("data");

			// a short-lived client of its own is used here; release it as soon as the download is done
			using (WebClient predictionClient = new WebClient())
			{
				predictionClient.DownloadFile(predictionSite, predictionFile);
			}

			return predictionFile;
		}
102
103 55d35561 Eliška Mourycová
		/// <summary>
		/// Makes sure a subdirectory of the root data directory exists for every data type
		/// and records its absolute path in DataSubDirectories.
		/// TBD if we want to do it this way
		/// </summary>
		private void CreateSubdirectories()
		{
			Array allTypes = Enum.GetValues(typeof(DataType));
			foreach (DataType dataType in allTypes)
			{
				string typeDirectory = RootDataDirectory + sep + dataType;
				Directory.CreateDirectory(typeDirectory);

				// an already recorded type keeps its record
				if (DataSubDirectories.ContainsKey(dataType))
					continue;

				DataSubDirectories.Add(dataType, Path.GetFullPath(typeDirectory));
			}
		}
119
120
		/// <summary>
		/// Parses the naming convention to be later used for determining URLs for data download.
		/// Text outside curly braces is stored as a literal name part, text inside curly braces as a variable name;
		/// the order of both kinds is recorded so that the name can be rebuilt later.
		/// </summary>
		/// <param name="namingConvention">The configured naming convention</param>
		/// <exception cref="ArgumentNullException">If namingConvention is null</exception>
		private void ParseNaming(string namingConvention)
		{
			if (namingConvention == null)
				throw new ArgumentNullException(nameof(namingConvention));

			separatedFileName = new List<string>();
			variablesInsertions = new List<string>();
			nameTurns = new List<char>();

			string currPart = "";
			string currVar = "";
			bool readingNormal = true;
			foreach (char c in namingConvention)
			{
				switch (c)
				{
					case '{':
						// a variable starts - close the literal part read so far
						AddToNameParts(currPart);
						currPart = "";
						readingNormal = false;
						break;

					case '}':
						// a variable ends - store its name
						AddToVariables(currVar);
						currVar = "";
						readingNormal = true;
						break;

					default:
						// normal char
						if (readingNormal)
							currPart += c;
						else
							currVar += c;
						break;
				}
			}

			// add the rest if there is any:
			if (readingNormal)
				AddToNameParts(currPart);
			else
				AddToVariables(currVar);
		}
165
166 55d35561 Eliška Mourycová
		// Stores a non-empty literal part of the file name and notes that a literal part comes next in the name
		private void AddToNameParts(string s)
		{
			// empty parts carry no information
			if (s.Length == 0)
				return;

			separatedFileName.Add(s);
			nameTurns.Add('n');
		}
176
177 55d35561 Eliška Mourycová
		// Stores a non-empty variable name and notes that a variable comes next in the name
		private void AddToVariables(string s)
		{
			// empty variable names are ignored
			if (s.Length == 0)
				return;

			variablesInsertions.Add(s);
			nameTurns.Add('v');
		}
187
188 55d35561 Eliška Mourycová
		/// <summary>
		/// Builds the name of the downloaded file. Takes naming convention into account:
		/// literal parts are copied, variables (type, month, year, format) are replaced by the given values.
		/// </summary>
		/// <param name="type">The type of data</param>
		/// <param name="format">The data format</param>
		/// <param name="year">The year</param>
		/// <param name="month">The month (written with at least two digits)</param>
		/// <returns>The name built according to the parsed naming convention</returns>
		/// <exception cref="FormatException">If the naming convention contains an unknown variable</exception>
		private string BuildDownloadedName(DataType type, DataFormat format, int year, int month)
		{
			string nameZip = "";

			// literal parts and variables live in separate lists; nameTurns says which list is read next
			int partInd = 0;
			int varInd = 0;
			foreach (char turn in nameTurns)
			{
				if (turn == 'n')
				{
					nameZip += separatedFileName[partInd];
					partInd++;
				}
				else if (turn == 'v')
				{
					string add;
					switch (variablesInsertions[varInd])
					{
						case "type":
							add = "" + type;
							break;
						case "month":
							add = month < 10 ? "0" + month : "" + month;
							break;
						case "year":
							add = "" + year;
							break;
						case "format":
							add = "" + format;
							break;
						default:
							throw new FormatException("Config file error - naming conventions can only contain variables with following names: type, month, year, format");
					}
					nameZip += add;
					varInd++;
				}
			}

			return nameZip;
		}
235
236 085453be Eliška Mourycová
237 34bf7aa2 Eliška Mourycová
		/// <summary>
		/// Downloads a specific archive, extracts it into the subdirectory of the given data type
		/// and deletes the downloaded archive afterwards.
		/// A failed download (WebException) is not reported; an empty list is returned in that case.
		/// </summary>
		/// <param name="type">The type of data</param>
		/// <param name="format">The format of the data</param>
		/// <param name="year">The year</param>
		/// <param name="month">The month</param>
		/// <returns>A list of all extracted file names (should be only one)</returns>
		private List<string> DownloadData(DataType type, DataFormat format, int year, int month)
		{
			// the list of all files potentially relevant to the caller
			List<string> extractedFiles = new List<string>();

			// Prepare the url string to be downloaded from:
			string monthStr = month < 10 ? "0" + month : "" + month;
			string yearStr = "" + year;
			string monthYr = monthStr + "_" + yearStr;

			string nameZip = BuildDownloadedName(type, format, year, month);
			string url = site + "/" + dataStr + monthYr + "/" + nameZip;

			string nameFolder = RootDataDirectory + sep + type + sep;

			try
			{
				// Download the zip file:
				webClient.DownloadFile(url, nameZip);

				// the archive is disposed of even if extracting throws
				using (ZipArchive zipArchive = ZipFile.OpenRead(nameZip))
				{
					// Go through all the files in the archive:
					foreach (ZipArchiveEntry entry in zipArchive.Entries)
					{
						// every entry is saved under the same month-year name
						string newFileName = $"{month}-{year}.{format}";
						string extractedFile = nameFolder + newFileName;

						// add full path to the list (also when the file already exists and is kept):
						extractedFiles.Add(Path.GetFullPath(extractedFile));

						// existing files are only touched when overwriting is desired
						if (OverwriteExisting || !File.Exists(extractedFile))
							entry.ExtractToFile(extractedFile, OverwriteExisting);
					}
				}
			}
			catch (WebException)
			{
				// download fails if the specified url is invalid - not every month has an archive, so this is not an error
			}
			finally
			{
				// delete the downloaded zip file no matter how far we got; nothing to do if it was never created
				if (File.Exists(nameZip))
					File.Delete(nameZip);
			}

			return extractedFiles;
		}
324
325
326
327
		/// <summary>
		/// Downloads selected type and time span of data in the desired format, returns a list of full paths to all successfully saved files.
		/// If some of the files already existed and were not overwritten, then the returned List contains paths to these files also.
		/// For every year touched by the time span the file with month 0 is requested as well (exactly once per year).
		/// </summary>
		/// <param name="type">The requested data type</param>
		/// <param name="format">The data format</param>
		/// <param name="startDate">The start date</param>
		/// <param name="endDate">The end date</param>
		/// <returns>Full paths to the files saved for the requested time span</returns>
		/// <exception cref="ArgumentException">If startDate is after endDate</exception>
		public List<string> DownloadData(DataType type, DataFormat format, Date startDate, Date endDate)
		{
			if (startDate > endDate)
				throw new ArgumentException("startDate must be the same as or before the endDate.");

			// initialize:
			List<string> savedFiles = new List<string>();

			// the year whose 00 file has been requested most recently (none so far)
			int? lastYearWith00File = null;

			Date currentDate = startDate;
			do
			{
				Console.WriteLine("current date: " + currentDate);

				int currentYear = (int)currentDate.Year;
				savedFiles.AddRange(DownloadData(type, format, currentYear, (int)currentDate.Month));

				// also try to find the 00 file for each year - once, when the year is seen for the first time:
				if (lastYearWith00File != currentYear)
				{
					savedFiles.AddRange(DownloadData(type, format, currentYear, 0));
					lastYearWith00File = currentYear;
				}

				// move on to the next month:
				currentDate = currentDate.IncreaseMonthByOne();

			} while (currentDate <= endDate);

			return savedFiles;
		}
387
388 7a998d66 Eliška Mourycová
		/// <summary>
		/// Not implemented yet - presumably meant to report whether new data is available for download (TODO confirm).
		/// </summary>
		/// <returns>Nothing so far, the method always throws</returns>
		/// <exception cref="NotImplementedException">Always</exception>
		public bool CheckForNewData()
		{
			throw new NotImplementedException();
		}
392
393 085453be Eliška Mourycová
394 55d35561 Eliška Mourycová
		/// <summary>
		/// Retrieves all data files with dates falling within the specified range. If not all data for the specified range is found
		/// then returns also file/s with month 0 if exists.
		/// Files whose names do not start with "month-year" (two integers) are skipped.
		/// If either date is null, all files in the directory are returned.
		/// </summary>
		/// <param name="subDirectory">The subdirectory to search</param>
		/// <param name="startDate">The start date</param>
		/// <param name="endDate">The end date</param>
		/// <returns>Paths to the files relevant for the requested range</returns>
		public List<string> GetData(string subDirectory, Date startDate, Date endDate)
		{
			if (startDate == null || endDate == null)
				return GetData(subDirectory);

			string[] files = Directory.GetFiles(subDirectory);
			// 00 files found in the directory: key = year, value = path
			List<KeyValuePair<int, string>> found00Files = new List<KeyValuePair<int, string>>();
			List<string> relevantFiles = new List<string>();
			List<Date> requestedDates = new List<Date>();

			// prepare a list of requested dates:
			Date currentDate = startDate;
			do
			{
				requestedDates.Add(currentDate);
				currentDate = currentDate.IncreaseMonthByOne();

			} while (currentDate <= endDate);

			foreach (string file in files)
			{
				string currFileName = Path.GetFileName(file);
				Console.WriteLine("curr file: " + currFileName);

				// anything that is not named month-year.* does not belong to the downloaded data
				int month;
				int year;
				if (!TryParseMonthAndYear(currFileName, out month, out year))
					continue;

				if (month == 0)
				{
					// decided later, once we know which requested dates are missing
					found00Files.Add(new KeyValuePair<int, string>(year, file));
					continue;
				}

				Date d = new Date((uint)month, (uint)year);

				if (d >= startDate && d <= endDate)
				{
					// we want to add this
					relevantFiles.Add(file);
					requestedDates.Remove(d);
				}
			}

			// dates not found in the directory now remain in requested dates;
			// a 00 file is relevant if at least one date of its year is still missing
			foreach (KeyValuePair<int, string> file00 in found00Files)
			{
				int year00 = file00.Key;
				if (requestedDates.Exists(d => d.Year == year00))
					relevantFiles.Add(file00.Value);
			}

			return relevantFiles;
		}

		// Reads month and year from a file name of the form month-year.extension; returns false if the name has a different form
		private static bool TryParseMonthAndYear(string fileName, out int month, out int year)
		{
			month = 0;
			year = 0;

			string[] splits = fileName.Split(new char[] { '-', '.' });
			if (splits.Length < 2)
				return false;

			return int.TryParse(splits[0], out month) && int.TryParse(splits[1], out year);
		}
476
477 4129ce12 Eliška Mourycová
		/// <summary>
		/// Lists the paths of all files located directly in the given directory.
		/// </summary>
		/// <param name="subDirectory">The directory to list</param>
		/// <returns>The paths returned by Directory.GetFiles, as a list</returns>
		private List<string> GetData(string subDirectory)
		{
			// the list constructor copies the array in its original order
			return new List<string>(Directory.GetFiles(subDirectory));
		}
492
493 d2d1c86a Eliška Mourycová
494
		#region UNUSED
495
		//public string GetDirectoryListingRegexForUrl(string url)
496
		//{
497
		//	if (url.Equals(site))
498
		//	{
499
		//		//return "\\\"([^\"]*)\\\"";
500
		//		return @"\bOD_ZCU_\w*\b";
501
		//		//return @"\A[OD_ZCU_]";
502
		//	}
503
		//	else return null;
504
		//}
505
		//public void ListDirectory()
506
		//{
507
		//	string url = site;
508
		//	HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
509
		//	using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
510
		//	{
511
		//		using (StreamReader reader = new StreamReader(response.GetResponseStream()))
512
		//		{
513
		//			string html = reader.ReadToEnd();
514
515
516
517
		//			Regex regex = new Regex(GetDirectoryListingRegexForUrl(url));
518
		//			MatchCollection matches = regex.Matches(html);
519
		//			Console.WriteLine(matches.Count);
520
521
		//			if (matches.Count > 0)
522
		//			{
523
		//				foreach (Match match in matches)
524
		//				{
525
		//					//if (match.Success)
526
		//					//{
527
		//						Console.WriteLine(match.ToString());
528
		//					//}
529
		//				}
530
		//			}
531
		//		}
532
		//		Console.ReadLine();
533
		//	}
534
		//}
535
536
		#endregion
537
	}
538
}