You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

load_data.rs 17KB

6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
5 years ago
6 years ago
5 years ago
5 years ago
5 years ago
6 years ago
6 years ago
5 years ago
6 years ago
5 years ago
5 years ago
6 years ago
5 years ago
6 years ago
5 years ago
6 years ago
5 years ago
6 years ago
6 years ago
6 years ago
5 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543
  1. extern crate serde_json;
  2. extern crate toml;
  3. use utils::de::fix_toml_dates;
  4. use utils::fs::{get_file_time, is_path_in_directory, read_file};
  5. use reqwest::{header, Client};
  6. use std::collections::hash_map::DefaultHasher;
  7. use std::fmt;
  8. use std::hash::{Hash, Hasher};
  9. use std::str::FromStr;
  10. use url::Url;
  11. use std::path::PathBuf;
  12. use std::sync::{Arc, Mutex};
  13. use csv::Reader;
  14. use std::collections::HashMap;
  15. use tera::{from_value, to_value, Error, Function as TeraFn, Map, Result, Value};
  16. static GET_DATA_ARGUMENT_ERROR_MESSAGE: &str =
  17. "`load_data`: requires EITHER a `path` or `url` argument";
  18. enum DataSource {
  19. Url(Url),
  20. Path(PathBuf),
  21. }
  22. #[derive(Debug)]
  23. enum OutputFormat {
  24. Toml,
  25. Json,
  26. Csv,
  27. Plain,
  28. }
  29. impl fmt::Display for OutputFormat {
  30. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  31. fmt::Debug::fmt(self, f)
  32. }
  33. }
  34. impl Hash for OutputFormat {
  35. fn hash<H: Hasher>(&self, state: &mut H) {
  36. self.to_string().hash(state);
  37. }
  38. }
  39. impl FromStr for OutputFormat {
  40. type Err = Error;
  41. fn from_str(output_format: &str) -> Result<Self> {
  42. match output_format {
  43. "toml" => Ok(OutputFormat::Toml),
  44. "csv" => Ok(OutputFormat::Csv),
  45. "json" => Ok(OutputFormat::Json),
  46. "plain" => Ok(OutputFormat::Plain),
  47. format => Err(format!("Unknown output format {}", format).into()),
  48. }
  49. }
  50. }
  51. impl OutputFormat {
  52. fn as_accept_header(&self) -> header::HeaderValue {
  53. header::HeaderValue::from_static(match self {
  54. OutputFormat::Json => "application/json",
  55. OutputFormat::Csv => "text/csv",
  56. OutputFormat::Toml => "application/toml",
  57. OutputFormat::Plain => "text/plain",
  58. })
  59. }
  60. }
  61. impl DataSource {
  62. fn from_args(
  63. path_arg: Option<String>,
  64. url_arg: Option<String>,
  65. content_path: &PathBuf,
  66. ) -> Result<Self> {
  67. if path_arg.is_some() && url_arg.is_some() {
  68. return Err(GET_DATA_ARGUMENT_ERROR_MESSAGE.into());
  69. }
  70. if let Some(path) = path_arg {
  71. let full_path = content_path.join(path);
  72. if !full_path.exists() {
  73. return Err(format!("{} doesn't exist", full_path.display()).into());
  74. }
  75. return Ok(DataSource::Path(full_path));
  76. }
  77. if let Some(url) = url_arg {
  78. return Url::parse(&url)
  79. .map(DataSource::Url)
  80. .map_err(|e| format!("Failed to parse {} as url: {}", url, e).into());
  81. }
  82. Err(GET_DATA_ARGUMENT_ERROR_MESSAGE.into())
  83. }
  84. fn get_cache_key(&self, format: &OutputFormat) -> u64 {
  85. let mut hasher = DefaultHasher::new();
  86. format.hash(&mut hasher);
  87. self.hash(&mut hasher);
  88. hasher.finish()
  89. }
  90. }
  91. impl Hash for DataSource {
  92. fn hash<H: Hasher>(&self, state: &mut H) {
  93. match self {
  94. DataSource::Url(url) => url.hash(state),
  95. DataSource::Path(path) => {
  96. path.hash(state);
  97. get_file_time(&path).expect("get file time").hash(state);
  98. }
  99. };
  100. }
  101. }
  102. fn get_data_source_from_args(
  103. content_path: &PathBuf,
  104. args: &HashMap<String, Value>,
  105. ) -> Result<DataSource> {
  106. let path_arg = optional_arg!(String, args.get("path"), GET_DATA_ARGUMENT_ERROR_MESSAGE);
  107. let url_arg = optional_arg!(String, args.get("url"), GET_DATA_ARGUMENT_ERROR_MESSAGE);
  108. DataSource::from_args(path_arg, url_arg, content_path)
  109. }
  110. fn read_data_file(base_path: &PathBuf, full_path: PathBuf) -> Result<String> {
  111. if !is_path_in_directory(&base_path, &full_path)
  112. .map_err(|e| format!("Failed to read data file {}: {}", full_path.display(), e))?
  113. {
  114. return Err(format!(
  115. "{} is not inside the base site directory {}",
  116. full_path.display(),
  117. base_path.display()
  118. )
  119. .into());
  120. }
  121. read_file(&full_path).map_err(|e| {
  122. format!("`load_data`: error {} loading file {}", full_path.to_str().unwrap(), e).into()
  123. })
  124. }
  125. fn get_output_format_from_args(
  126. args: &HashMap<String, Value>,
  127. data_source: &DataSource,
  128. ) -> Result<OutputFormat> {
  129. let format_arg = optional_arg!(
  130. String,
  131. args.get("format"),
  132. "`load_data`: `format` needs to be an argument with a string value, being one of the supported `load_data` file types (csv, json, toml, plain)"
  133. );
  134. if let Some(format) = format_arg {
  135. if format == "plain" {
  136. return Ok(OutputFormat::Plain);
  137. }
  138. return OutputFormat::from_str(&format);
  139. }
  140. let from_extension = if let DataSource::Path(path) = data_source {
  141. path.extension().map(|extension| extension.to_str().unwrap()).unwrap_or_else(|| "plain")
  142. } else {
  143. "plain"
  144. };
  145. // Always default to Plain if we don't know what it is
  146. OutputFormat::from_str(from_extension).or_else(|_| Ok(OutputFormat::Plain))
  147. }
  148. /// A Tera function to load data from a file or from a URL
  149. /// Currently the supported formats are json, toml, csv and plain text
  150. #[derive(Debug)]
  151. pub struct LoadData {
  152. base_path: PathBuf,
  153. client: Arc<Mutex<Client>>,
  154. result_cache: Arc<Mutex<HashMap<u64, Value>>>,
  155. }
  156. impl LoadData {
  157. pub fn new(base_path: PathBuf) -> Self {
  158. let client = Arc::new(Mutex::new(Client::builder().build().expect("reqwest client build")));
  159. let result_cache = Arc::new(Mutex::new(HashMap::new()));
  160. Self { base_path, client, result_cache }
  161. }
  162. }
  163. impl TeraFn for LoadData {
  164. fn call(&self, args: &HashMap<String, Value>) -> Result<Value> {
  165. let data_source = get_data_source_from_args(&self.base_path, &args)?;
  166. let file_format = get_output_format_from_args(&args, &data_source)?;
  167. let cache_key = data_source.get_cache_key(&file_format);
  168. let mut cache = self.result_cache.lock().expect("result cache lock");
  169. let response_client = self.client.lock().expect("response client lock");
  170. if let Some(cached_result) = cache.get(&cache_key) {
  171. return Ok(cached_result.clone());
  172. }
  173. let data = match data_source {
  174. DataSource::Path(path) => read_data_file(&self.base_path, path),
  175. DataSource::Url(url) => {
  176. let mut response = response_client
  177. .get(url.as_str())
  178. .header(header::ACCEPT, file_format.as_accept_header())
  179. .send()
  180. .and_then(|res| res.error_for_status())
  181. .map_err(|e| {
  182. format!(
  183. "Failed to request {}: {}",
  184. url,
  185. e.status().expect("response status")
  186. )
  187. })?;
  188. response
  189. .text()
  190. .map_err(|e| format!("Failed to parse response from {}: {:?}", url, e).into())
  191. }
  192. }?;
  193. let result_value: Result<Value> = match file_format {
  194. OutputFormat::Toml => load_toml(data),
  195. OutputFormat::Csv => load_csv(data),
  196. OutputFormat::Json => load_json(data),
  197. OutputFormat::Plain => to_value(data).map_err(|e| e.into()),
  198. };
  199. if let Ok(data_result) = &result_value {
  200. cache.insert(cache_key, data_result.clone());
  201. }
  202. result_value
  203. }
  204. }
  205. /// Parse a JSON string and convert it to a Tera Value
  206. fn load_json(json_data: String) -> Result<Value> {
  207. let json_content: Value =
  208. serde_json::from_str(json_data.as_str()).map_err(|e| format!("{:?}", e))?;
  209. Ok(json_content)
  210. }
  211. /// Parse a TOML string and convert it to a Tera Value
  212. fn load_toml(toml_data: String) -> Result<Value> {
  213. let toml_content: toml::Value = toml::from_str(&toml_data).map_err(|e| format!("{:?}", e))?;
  214. let toml_value = to_value(toml_content).expect("Got invalid JSON that was valid TOML somehow");
  215. match toml_value {
  216. Value::Object(m) => Ok(fix_toml_dates(m)),
  217. _ => unreachable!("Loaded something other than a TOML object"),
  218. }
  219. }
  220. /// Parse a CSV string and convert it to a Tera Value
  221. ///
  222. /// An example csv file `example.csv` could be:
  223. /// ```csv
  224. /// Number, Title
  225. /// 1,Gutenberg
  226. /// 2,Printing
  227. /// ```
  228. /// The json value output would be:
  229. /// ```json
  230. /// {
  231. /// "headers": ["Number", "Title"],
  232. /// "records": [
  233. /// ["1", "Gutenberg"],
  234. /// ["2", "Printing"]
  235. /// ],
  236. /// }
  237. /// ```
  238. fn load_csv(csv_data: String) -> Result<Value> {
  239. let mut reader = Reader::from_reader(csv_data.as_bytes());
  240. let mut csv_map = Map::new();
  241. {
  242. let hdrs = reader.headers().map_err(|e| {
  243. format!("'load_data': {} - unable to read CSV header line (line 1) for CSV file", e)
  244. })?;
  245. let headers_array = hdrs.iter().map(|v| Value::String(v.to_string())).collect();
  246. csv_map.insert(String::from("headers"), Value::Array(headers_array));
  247. }
  248. {
  249. let records = reader.records();
  250. let mut records_array: Vec<Value> = Vec::new();
  251. for result in records {
  252. let record = match result {
  253. Ok(r) => r,
  254. Err(e) => {
  255. return Err(tera::Error::chain(
  256. String::from("Error encountered when parsing csv records"),
  257. e,
  258. ));
  259. }
  260. };
  261. let mut elements_array: Vec<Value> = Vec::new();
  262. for e in record.into_iter() {
  263. elements_array.push(Value::String(String::from(e)));
  264. }
  265. records_array.push(Value::Array(elements_array));
  266. }
  267. csv_map.insert(String::from("records"), Value::Array(records_array));
  268. }
  269. let csv_value: Value = Value::Object(csv_map);
  270. to_value(csv_value).map_err(|err| err.into())
  271. }
  272. #[cfg(test)]
  273. mod tests {
  274. use super::{DataSource, LoadData, OutputFormat};
  275. use std::collections::HashMap;
  276. use std::path::PathBuf;
  277. use tera::{to_value, Function};
  278. fn get_test_file(filename: &str) -> PathBuf {
  279. let test_files = PathBuf::from("../utils/test-files").canonicalize().unwrap();
  280. return test_files.join(filename);
  281. }
  282. #[test]
  283. fn fails_when_missing_file() {
  284. let static_fn = LoadData::new(PathBuf::from("../utils"));
  285. let mut args = HashMap::new();
  286. args.insert("path".to_string(), to_value("../../../READMEE.md").unwrap());
  287. let result = static_fn.call(&args);
  288. assert!(result.is_err());
  289. assert!(result.unwrap_err().to_string().contains("READMEE.md doesn't exist"));
  290. }
  291. #[test]
  292. fn cant_load_outside_content_dir() {
  293. let static_fn = LoadData::new(PathBuf::from(PathBuf::from("../utils")));
  294. let mut args = HashMap::new();
  295. args.insert("path".to_string(), to_value("../../README.md").unwrap());
  296. args.insert("format".to_string(), to_value("plain").unwrap());
  297. let result = static_fn.call(&args);
  298. assert!(result.is_err());
  299. assert!(result
  300. .unwrap_err()
  301. .to_string()
  302. .contains("README.md is not inside the base site directory"));
  303. }
  304. #[test]
  305. fn calculates_cache_key_for_path() {
  306. // We can't test against a fixed value, due to the fact the cache key is built from the absolute path
  307. let cache_key =
  308. DataSource::Path(get_test_file("test.toml")).get_cache_key(&OutputFormat::Toml);
  309. let cache_key_2 =
  310. DataSource::Path(get_test_file("test.toml")).get_cache_key(&OutputFormat::Toml);
  311. assert_eq!(cache_key, cache_key_2);
  312. }
  313. #[test]
  314. fn calculates_cache_key_for_url() {
  315. let cache_key =
  316. DataSource::Url("https://api.github.com/repos/getzola/zola".parse().unwrap())
  317. .get_cache_key(&OutputFormat::Plain);
  318. assert_eq!(cache_key, 8916756616423791754);
  319. }
  320. #[test]
  321. fn different_cache_key_per_filename() {
  322. let toml_cache_key =
  323. DataSource::Path(get_test_file("test.toml")).get_cache_key(&OutputFormat::Toml);
  324. let json_cache_key =
  325. DataSource::Path(get_test_file("test.json")).get_cache_key(&OutputFormat::Toml);
  326. assert_ne!(toml_cache_key, json_cache_key);
  327. }
  328. #[test]
  329. fn different_cache_key_per_format() {
  330. let toml_cache_key =
  331. DataSource::Path(get_test_file("test.toml")).get_cache_key(&OutputFormat::Toml);
  332. let json_cache_key =
  333. DataSource::Path(get_test_file("test.toml")).get_cache_key(&OutputFormat::Json);
  334. assert_ne!(toml_cache_key, json_cache_key);
  335. }
  336. #[test]
  337. fn can_load_remote_data() {
  338. let static_fn = LoadData::new(PathBuf::new());
  339. let mut args = HashMap::new();
  340. args.insert("url".to_string(), to_value("https://httpbin.org/json").unwrap());
  341. args.insert("format".to_string(), to_value("json").unwrap());
  342. let result = static_fn.call(&args).unwrap();
  343. assert_eq!(
  344. result.get("slideshow").unwrap().get("title").unwrap(),
  345. &to_value("Sample Slide Show").unwrap()
  346. );
  347. }
  348. #[test]
  349. fn fails_when_request_404s() {
  350. let static_fn = LoadData::new(PathBuf::new());
  351. let mut args = HashMap::new();
  352. args.insert("url".to_string(), to_value("https://httpbin.org/status/404/").unwrap());
  353. args.insert("format".to_string(), to_value("json").unwrap());
  354. let result = static_fn.call(&args);
  355. assert!(result.is_err());
  356. assert_eq!(
  357. result.unwrap_err().to_string(),
  358. "Failed to request https://httpbin.org/status/404/: 404 Not Found"
  359. );
  360. }
  361. #[test]
  362. fn can_load_toml() {
  363. let static_fn = LoadData::new(PathBuf::from("../utils/test-files"));
  364. let mut args = HashMap::new();
  365. args.insert("path".to_string(), to_value("test.toml").unwrap());
  366. let result = static_fn.call(&args.clone()).unwrap();
  367. // TOML does not load in order
  368. assert_eq!(
  369. result,
  370. json!({
  371. "category": {
  372. "date": "1979-05-27T07:32:00Z",
  373. "lt1": "07:32:00",
  374. "key": "value"
  375. },
  376. })
  377. );
  378. }
  379. #[test]
  380. fn unknown_extension_defaults_to_plain() {
  381. let static_fn = LoadData::new(PathBuf::from("../utils/test-files"));
  382. let mut args = HashMap::new();
  383. args.insert("path".to_string(), to_value("test.css").unwrap());
  384. let result = static_fn.call(&args.clone()).unwrap();
  385. if cfg!(windows) {
  386. assert_eq!(result, ".hello {}\r\n",);
  387. } else {
  388. assert_eq!(result, ".hello {}\n",);
  389. };
  390. }
  391. #[test]
  392. fn can_override_known_extension_with_format() {
  393. let static_fn = LoadData::new(PathBuf::from("../utils/test-files"));
  394. let mut args = HashMap::new();
  395. args.insert("path".to_string(), to_value("test.csv").unwrap());
  396. args.insert("format".to_string(), to_value("plain").unwrap());
  397. let result = static_fn.call(&args.clone()).unwrap();
  398. if cfg!(windows) {
  399. assert_eq!(result, "Number,Title\r\n1,Gutenberg\r\n2,Printing",);
  400. } else {
  401. assert_eq!(result, "Number,Title\n1,Gutenberg\n2,Printing",);
  402. };
  403. }
  404. #[test]
  405. fn will_use_format_on_unknown_extension() {
  406. let static_fn = LoadData::new(PathBuf::from("../utils/test-files"));
  407. let mut args = HashMap::new();
  408. args.insert("path".to_string(), to_value("test.css").unwrap());
  409. args.insert("format".to_string(), to_value("plain").unwrap());
  410. let result = static_fn.call(&args.clone()).unwrap();
  411. if cfg!(windows) {
  412. assert_eq!(result, ".hello {}\r\n",);
  413. } else {
  414. assert_eq!(result, ".hello {}\n",);
  415. };
  416. }
  417. #[test]
  418. fn can_load_csv() {
  419. let static_fn = LoadData::new(PathBuf::from("../utils/test-files"));
  420. let mut args = HashMap::new();
  421. args.insert("path".to_string(), to_value("test.csv").unwrap());
  422. let result = static_fn.call(&args.clone()).unwrap();
  423. assert_eq!(
  424. result,
  425. json!({
  426. "headers": ["Number", "Title"],
  427. "records": [
  428. ["1", "Gutenberg"],
  429. ["2", "Printing"]
  430. ],
  431. })
  432. )
  433. }
  434. // Test points to bad csv file with uneven row lengths
  435. #[test]
  436. fn bad_csv_should_result_in_error() {
  437. let static_fn = LoadData::new(PathBuf::from("../utils/test-files"));
  438. let mut args = HashMap::new();
  439. args.insert("path".to_string(), to_value("uneven_rows.csv").unwrap());
  440. let result = static_fn.call(&args.clone());
  441. assert!(result.is_err());
  442. let error_kind = result.err().unwrap().kind;
  443. match error_kind {
  444. tera::ErrorKind::Msg(msg) => {
  445. if msg != String::from("Error encountered when parsing csv records") {
  446. panic!("Error message is wrong. Perhaps wrong error is being returned?");
  447. }
  448. }
  449. _ => panic!("Error encountered was not expected CSV error"),
  450. }
  451. }
  452. #[test]
  453. fn can_load_json() {
  454. let static_fn = LoadData::new(PathBuf::from("../utils/test-files"));
  455. let mut args = HashMap::new();
  456. args.insert("path".to_string(), to_value("test.json").unwrap());
  457. let result = static_fn.call(&args.clone()).unwrap();
  458. assert_eq!(
  459. result,
  460. json!({
  461. "key": "value",
  462. "array": [1, 2, 3],
  463. "subpackage": {
  464. "subkey": 5
  465. }
  466. })
  467. )
  468. }
  469. }