1 |
34bf7aa2
|
Eliška Mourycová
|
//
|
2 |
|
|
// Author: Eliska Mourycova
|
3 |
|
|
//
|
4 |
|
|
|
5 |
|
|
using System;
|
6 |
d2d1c86a
|
Eliška Mourycová
|
using System.Collections.Generic;
|
7 |
|
|
using System.IO;
|
8 |
|
|
using System.IO.Compression;
|
9 |
|
|
using System.Net;
|
10 |
|
|
|
11 |
|
|
namespace DataDownload
|
12 |
|
|
{
|
13 |
34bf7aa2
|
Eliška Mourycová
|
/// <summary>
/// All data types that are available for download (this project does not use every one of them).
/// The member names are kept in Czech because they are inserted verbatim into the
/// names of the downloaded files and folders.
/// TBD: They might be translated to English later.
/// </summary>
public enum DataType
{
    /// <summary>Czech for "weather".</summary>
    POCASI,

    /// <summary>Energy-related data.</summary>
    ENERGO,

    /// <summary>Czech for "machines".</summary>
    STROJE,

    /// <summary>E-mail data.</summary>
    EMAIL,

    /// <summary>Czech for "room occupancy".</summary>
    OBSAZENI_MISTNOSTI,

    /// <summary>The "jis" data.</summary>
    JIS,

    /// <summary>Czech for "scooters".</summary>
    KOLOBEZKY,

    /// <summary>Wi-Fi data.</summary>
    WIFI
}
|
22 |
34bf7aa2
|
Eliška Mourycová
|
|
23 |
|
|
/// <summary>
/// The file formats in which the data archives can be requested.
/// The member name is inserted verbatim into the archive file name.
/// </summary>
public enum DataFormat
{
    /// <summary>XML files.</summary>
    XML,

    /// <summary>JSON files.</summary>
    JSON,

    /// <summary>CSV files.</summary>
    CSV
}
|
30 |
|
|
|
31 |
34bf7aa2
|
Eliška Mourycová
|
/// <summary>
/// This class takes care of downloading of data. Download happens from http://openstore.zcu.cz/.
/// Every monthly archive is downloaded as a .zip into the current working directory,
/// extracted into a per-type subfolder of <see cref="DataDirectory"/> and then deleted.
/// </summary>
public class DataDownloader
{
    /// <summary>
    /// The root directory containing all downloaded data.
    /// It is a relative path (two levels up, then data/auto), so it is resolved
    /// against the current working directory of the process.
    /// </summary>
    public string DataDirectory { get; }

    /// <summary>
    /// Flag stating whether files which already exist should be overwritten when downloaded again.
    /// </summary>
    public bool OverwriteExisting { get; set; }

    // the main site where the data can be downloaded from
    private readonly string site;

    // the substring at the start of every file name
    private readonly string dataStr;

    // WebClient instance used for the actual download
    private readonly WebClient webClient;

    // a shortcut to writing Path.DirectorySeparatorChar
    private readonly char sep = Path.DirectorySeparatorChar;

    /// <summary>
    /// Creates a downloader with the default site, file-name prefix and data directory.
    /// Existing files are not overwritten unless <see cref="OverwriteExisting"/> is set afterwards.
    /// </summary>
    public DataDownloader()
    {
        // initialize all needed variables:

        Console.WriteLine(Directory.GetCurrentDirectory());
        site = "http://openstore.zcu.cz/";
        dataStr = "OD_ZCU_";

        DataDirectory = $"..{sep}..{sep}data{sep}auto";
        OverwriteExisting = false;

        webClient = new WebClient();
    }

    /// <summary>
    /// Downloads a specific archive (one data type, one format, one month) and extracts it.
    /// A failed download (<see cref="WebException"/>) is logged to the console and results in
    /// an empty list; any other exception (e.g. while reading the archive) propagates to the caller.
    /// The temporary .zip file is removed in every case.
    /// </summary>
    /// <param name="type">The type of data</param>
    /// <param name="format">The format of the data</param>
    /// <param name="year">The year</param>
    /// <param name="month">The month</param>
    /// <returns>
    /// Full paths of all files contained in the archive (should be only one), including files
    /// that already existed and were therefore not overwritten
    /// </returns>
    private List<string> DownloadData(DataType type, DataFormat format, int year, int month)
    {
        // the list of all files potentially relevant to the caller
        List<string> extractedFiles = new List<string>();

        // Prepare the url string to be downloaded from:
        string monthStr = month < 10 ? "0" + month : "" + month;
        string yearStr = "" + year;
        string monthYr = monthStr + "_" + yearStr;

        string nameZip = dataStr + type + "_" + monthYr + "_" + format + ".zip";
        // site may or may not end with a slash; trim it so the path never contains "//"
        string url = site.TrimEnd('/') + "/" + dataStr + monthYr + "/" + nameZip;
        string nameFolder = DataDirectory + sep + type + sep;

        try
        {
            Console.WriteLine("Downloading .zip to " + Path.GetFullPath(nameZip) + "...");

            // Download the zip file:
            webClient.DownloadFile(url, nameZip);

            // make sure the target folder exists even when this overload is used on its own
            Directory.CreateDirectory(nameFolder);

            // the using block releases the archive (and its file handle) even if extraction throws
            using (ZipArchive zipArchive = ZipFile.OpenRead(nameZip))
            {
                foreach (ZipArchiveEntry entry in zipArchive.Entries)
                {
                    // directory entries have an empty Name and there is nothing to extract for them
                    if (string.IsNullOrEmpty(entry.Name))
                    {
                        continue;
                    }

                    // get the relative path to the file:
                    string extractedFile = nameFolder + entry.Name;

                    // add full path to the list (also for files which already exist and are kept):
                    extractedFiles.Add(Path.GetFullPath(extractedFile));

                    // extract when overwriting is desired or when the file does not exist yet:
                    if (OverwriteExisting || !File.Exists(extractedFile))
                    {
                        entry.ExtractToFile(extractedFile, OverwriteExisting);
                    }
                }
            }

            Console.WriteLine("Extracted to " + Path.GetFullPath(nameFolder));
            Console.WriteLine("Deleting .zip from " + Path.GetFullPath(nameZip) + "...");
            Console.WriteLine("Finished downloading " + nameZip);
        }
        catch (System.Net.WebException we)
        {
            // download fails, if the specified url is invalid
            Console.WriteLine("Download from " + url + " failed.");
            Console.WriteLine(we.Message);
        }
        finally
        {
            // delete the downloaded zip file (also a partial one left by a failed download);
            // File.Delete does nothing when the file does not exist
            File.Delete(nameZip);
        }

        return extractedFiles;
    }

    /// <summary>
    /// Downloads selected type and time span of data in the desired format, returns a list of full paths to all successfully saved files.
    /// If some of the files already existed and were not overwritten, then the returned List contains paths to these files also.
    /// The time span is continuous: every month from startMonth/startYear up to endMonth/endYear
    /// (both inclusive) is downloaded, so spans crossing a year boundary work as expected.
    /// If the start lies after the end, nothing is downloaded and an empty list is returned.
    /// Month values are not validated; a month for which no archive exists is logged as a failed download.
    /// </summary>
    /// <param name="type">The type of data, e.g. the jis data</param>
    /// <param name="format">The desired format of data files, available are CSV, XML and JSON formats</param>
    /// <param name="startYear">The year of the first month to download, inclusive</param>
    /// <param name="endYear">The year of the last month to download, inclusive</param>
    /// <param name="startMonth">The first month (in startYear) to download, inclusive</param>
    /// <param name="endMonth">The last month (in endYear) to download, inclusive</param>
    /// <returns>A list of full paths to all successfully saved files</returns>
    public List<string> DownloadData(DataType type, DataFormat format, int startYear, int endYear, int startMonth, int endMonth)
    {
        List<string> savedFiles = new List<string>();
        Directory.CreateDirectory(DataDirectory + sep + type);

        int year = startYear;
        int month = startMonth;

        // walk month by month; December rolls over to January of the following year
        while (year < endYear || (year == endYear && month <= endMonth))
        {
            savedFiles.AddRange(DownloadData(type, format, year, month));

            month++;
            if (month > 12)
            {
                month = 1;
                year++;
            }
        }

        return savedFiles;
    }

    #region UNUSED
    //public string GetDirectoryListingRegexForUrl(string url)
    //{
    //	if (url.Equals(site))
    //	{
    //		//return "\\\"([^\"]*)\\\"";
    //		return @"\bOD_ZCU_\w*\b";
    //		//return @"\A[OD_ZCU_]";
    //	}
    //	else return null;
    //}
    //public void ListDirectory()
    //{
    //	string url = site;
    //	HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    //	using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    //	{
    //		using (StreamReader reader = new StreamReader(response.GetResponseStream()))
    //		{
    //			string html = reader.ReadToEnd();



    //			Regex regex = new Regex(GetDirectoryListingRegexForUrl(url));
    //			MatchCollection matches = regex.Matches(html);
    //			Console.WriteLine(matches.Count);

    //			if (matches.Count > 0)
    //			{
    //				foreach (Match match in matches)
    //				{
    //					//if (match.Success)
    //					//{
    //					Console.WriteLine(match.ToString());
    //					//}
    //				}
    //			}
    //		}
    //		Console.ReadLine();
    //	}
    //}

    #endregion
}
|
231 |
|
|
}
|