use std::collections::{HashMap, HashSet};
use std::str::FromStr;

use chrono::{DateTime, NaiveDateTime, TimeZone, Utc};
use elasticlunr::{Index, Language};
use lazy_static::lazy_static;

#[allow(unused_imports)]
use errors::{bail, Error, Result};
use library::{Library, Section};

pub const ELASTICLUNR_JS: &str = include_str!("elasticlunr.min.js");

lazy_static! {
    static ref AMMONIA: ammonia::Builder<'static> = {
        // Strip all markup when cleaning, and drop the contents of
        // <script> and <style> elements entirely.
        let mut clean_content = HashSet::new();
        clean_content.insert("script");
        clean_content.insert("style");
        let mut builder = ammonia::Builder::new();
        builder
            .tags(HashSet::new())
            .tag_attributes(HashMap::new())
            .generic_attributes(HashSet::new())
            .link_rel(None)
            .allowed_classes(HashMap::new())
            .clean_content_tags(clean_content);
        builder
    };
}
/// Returns the generated JSON index with all the documents of the site added,
/// using the given language.
///
/// Errors if the given language is not available in Elasticlunr.
///
/// TODO: is making `in_search_index` apply to subsections of a `false` section useful?
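///
/// A minimal usage sketch (assumes a populated `library`; not compiled as a
/// doc-test):
///
/// ```ignore
/// let index = build_index("en", &library)?;
/// let json = index.to_json(); // JSON payload to serve alongside ELASTICLUNR_JS
/// ```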
pub fn build_index(lang: &str, library: &Library) -> Result<Index> {
    let language = match Language::from_code(lang) {
        Some(l) => l,
        None => {
            bail!("Tried to build search index for language {} which is not supported", lang);
        }
    };

    let mut index = Index::with_language(language, &["title", "body"]);

    for section in library.sections_values() {
        if section.lang == lang {
            add_section_to_index(&mut index, section, library);
        }
    }

    Ok(index)
}
fn add_section_to_index(index: &mut Index, section: &Section, library: &Library) {
    if !section.meta.in_search_index {
        return;
    }

    // Don't index redirecting sections
    if section.meta.redirect_to.is_none() {
        index.add_doc(
            &section.permalink,
            &[
                &section.meta.title.clone().unwrap_or_default(),
                &AMMONIA.clean(&section.content).to_string(),
            ],
        );
    }

    for key in &section.pages {
        let page = library.get_page_by_key(*key);
        if !page.meta.in_search_index {
            continue;
        }

        index.add_doc(
            &page.permalink,
            &[
                &page.meta.title.clone().unwrap_or_default(),
                &AMMONIA.clean(&page.content).to_string(),
            ],
        );
    }
}
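/// Tries to map a language identifier to a `tantivy::tokenizer::Language` by
/// deserializing it through serde, reusing tantivy's own variant names:
/// e.g. "en" (ISO 639-1), "eng" (ISO 639-3) and "English" (a full variant
/// name) should all yield `Some(Language::English)`.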
#[cfg(feature = "tantivy-indexing")]
fn parse_language(lang: &str) -> Option<tantivy::tokenizer::Language> {
    use serde_derive::Deserialize;

    #[derive(Deserialize)]
    struct Lang {
        pub language: tantivy::tokenizer::Language,
    }

    // Expecting a two-character code, but will try other forms as a fallback.
    match lang.len() {
        2 => isolang::Language::from_639_1(&lang.to_lowercase()).and_then(|parsed| {
            let json = format!("{{\"language\":\"{}\"}}", parsed.to_name());
            serde_json::from_str::<Lang>(&json).ok().map(|Lang { language }| language)
        }),
        3 => isolang::Language::from_639_3(&lang.to_lowercase()).and_then(|parsed| {
            // The variant name must be quoted to be valid JSON.
            serde_json::from_str::<tantivy::tokenizer::Language>(&format!("\"{}\"", parsed.to_name())).ok()
        }),
        // Apparently not a code, so try it directly as a full language name.
        _ => serde_json::from_str::<tantivy::tokenizer::Language>(&format!("\"{}\"", lang)).ok(),
    }
}
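/// Resolves a page's date to `DateTime<Utc>`, preferring the raw string form
/// (which may carry a timezone, e.g. "2018-10-08T10:00:00+02:00") and falling
/// back to treating the pre-parsed `NaiveDateTime` as UTC.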
#[cfg(feature = "tantivy-indexing")]
fn parse_dt_assume_utc(
    datetime_string: &Option<String>,
    naive_datetime: &Option<NaiveDateTime>,
) -> Option<DateTime<Utc>> {
    // Start here because the string form may carry a timezone.
    if let Some(s) = datetime_string.as_ref() {
        if let Ok(utc) = DateTime::from_str(s.as_str()) {
            return Some(utc);
        }
    }

    // Otherwise, if we have the NaiveDateTime, we'll assume it's UTC. Would not
    // do this if the stakes were higher!
    if let Some(naive) = naive_datetime {
        return Some(Utc.from_utc_datetime(naive));
    }

    None
}
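/// Taxonomy names may contain `-`, which we don't want in a tantivy schema
/// field name, so e.g. "book-authors" becomes "book_authors".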
#[cfg(feature = "tantivy-indexing")]
fn normalize_taxonomy_name(s: &str) -> String {
    s.replace("-", "_")
}
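/// Builds a tantivy search index for every section and page in `lang` that is
/// marked `in_search_index`, writes it into `index_dir`, and returns the
/// number of documents indexed.
///
/// A minimal usage sketch (hypothetical output directory, created up front
/// because `Index::create_in_dir` expects it to exist; not compiled as a
/// doc-test):
///
/// ```ignore
/// std::fs::create_dir_all("public/tantivy-index")?;
/// let n_indexed = build_tantivy_index("en", &library, "public/tantivy-index")?;
/// println!("indexed {} documents", n_indexed);
/// ```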
#[cfg(feature = "tantivy-indexing")]
pub fn build_tantivy_index<P: AsRef<std::path::Path>>(
    lang: &str,
    library: &Library,
    index_dir: P,
) -> Result<usize> {
    use tantivy::{doc, schema::*, tokenizer::*, Document, Index};

    let parsed_lang: Language = parse_language(lang)
        .ok_or_else(|| Error::from(format!("failed to parse language: '{}'", lang)))?;

    // Tantivy ships an "en_stem" tokenizer out of the box; stemmers for other
    // languages are registered below under the same naming scheme.
    let tokenizer_name: String = match parsed_lang {
        Language::English => "en_stem".to_string(),
        other => format!("{:?}_stem", other).to_lowercase(),
    };

    let text_indexing_options = TextFieldIndexing::default()
        .set_index_option(IndexRecordOption::WithFreqsAndPositions)
        .set_tokenizer(&tokenizer_name);

    let text_options = TextOptions::default()
        .set_indexing_options(text_indexing_options)
        .set_stored();

    struct IndexContent<'a> {
        pub title: &'a str,
        pub description: &'a str,
        pub permalink: &'a str,
        pub body: String,
        pub datetime: Option<DateTime<Utc>>,
        pub taxonomies: &'a HashMap<String, Vec<String>>,
    }

    let mut seen: HashSet<String> = Default::default(); // unique permalinks already indexed
    let mut all_taxonomies: HashSet<String> = Default::default(); // every taxonomy used anywhere, so we can add them to the schema
    let mut index_pages: Vec<IndexContent> = Vec::new();
    let mut n_indexed = 0;

    let empty_taxonomies: HashMap<String, Vec<String>> = Default::default();
    for section in library.sections_values() {
        // Reason for the macro: Section and Page are different types but share these attributes.
        macro_rules! extract_content {
            ($page:ident) => {{
                let already_indexed = seen.contains(&$page.permalink);
                if !already_indexed && $page.meta.in_search_index && $page.lang == lang {
                    seen.insert($page.permalink.clone()); // mark as indexed
                    n_indexed += 1;
                    let cleaned_body: String = AMMONIA.clean(&$page.content).to_string();
                    Some(IndexContent {
                        title: $page.meta.title.as_ref().map(|x| x.as_str()).unwrap_or(""),
                        description: $page.meta.description.as_ref().map(|x| x.as_str()).unwrap_or(""),
                        permalink: $page.permalink.as_str(),
                        body: cleaned_body,
                        // page-only fields, left blank for sections
                        datetime: None,
                        taxonomies: &empty_taxonomies,
                    })
                } else {
                    None
                }
            }};
        }

        if section.meta.redirect_to.is_none() {
            if let Some(content) = extract_content!(section) {
                index_pages.push(content);
            }
        }

        for key in &section.pages {
            let page = library.get_page_by_key(*key);
            if let Some(mut index_content) = extract_content!(page) {
                all_taxonomies.extend(page.meta.taxonomies.keys().map(|x| normalize_taxonomy_name(x)));
                index_content.taxonomies = &page.meta.taxonomies;
                index_content.datetime = parse_dt_assume_utc(&page.meta.date, &page.meta.datetime);
                index_pages.push(index_content);
            }
        }
    }
    let mut schema = SchemaBuilder::new();
    let mut fields: HashMap<String, Field> = Default::default();
    for text_field_name in &["title", "body", "description"] {
        fields.insert(text_field_name.to_string(), schema.add_text_field(text_field_name, text_options.clone()));
    }
    fields.insert("permalink".to_string(), schema.add_text_field("permalink", STORED));
    fields.insert("datetime".to_string(), schema.add_date_field("datetime", STORED | INDEXED));

    // Any taxonomy whose name would collide with a built-in field is skipped.
    let reserved_field_names: HashSet<String> = fields.keys().map(|s| s.to_string()).collect();
    for taxonomy_name in all_taxonomies.difference(&reserved_field_names) {
        fields.insert(taxonomy_name.to_string(), schema.add_text_field(taxonomy_name.as_str(), text_options.clone()));
    }

    let schema = schema.build();
    let index = Index::create_in_dir(&index_dir, schema.clone())
        .map_err(|e| Error::from(format!("creating tantivy index failed: {}", e)))?;

    // Register a stemming tokenizer for non-English languages if needed;
    // "en_stem" is already built in.
    if index.tokenizers().get(&tokenizer_name).is_none() {
        let tokenizer = TextAnalyzer::from(SimpleTokenizer)
            .filter(RemoveLongFilter::limit(40))
            .filter(LowerCaser)
            .filter(Stemmer::new(parsed_lang));
        index.tokenizers().register(&tokenizer_name, tokenizer);
    }

    let mut wtr = index.writer(1024 * 1024 * 256) // 256 MB writer heap
        .map_err(|e| Error::from(format!("creating tantivy index writer failed: {}", e)))?;
    // now, let's index!
    for page in index_pages {
        let mut document: Document = doc!(
            fields["title"] => page.title,
            fields["description"] => page.description,
            fields["permalink"] => page.permalink,
            fields["body"] => page.body,
        );

        if let Some(utc) = page.datetime {
            document.add_date(fields["datetime"], &utc);
        }

        for (taxonomy, terms) in page.taxonomies.iter().filter(|(k, _)| !reserved_field_names.contains(k.as_str())) {
            let normalized_taxonomy = normalize_taxonomy_name(taxonomy);
            for term in terms.iter() {
                document.add_text(fields[&normalized_taxonomy], term);
            }
        }

        wtr.add_document(document);
    }
    // wtr.prepare_commit().map_err(|e| Error::from(format!("tantivy IndexWriter::prepare_commit failed: {}", e)))?;
    wtr.commit().map_err(|e| Error::from(format!("tantivy IndexWriter::commit failed: {}", e)))?;
    wtr.wait_merging_threads().map_err(|e| Error::from(format!("tantivy IndexWriter::wait_merging_threads failed: {}", e)))?;

    drop(index);

    Ok(n_indexed)
}
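
// A hedged sketch of the read side: how an index produced by
// `build_tantivy_index` might be opened and queried. The directory path and
// query string are hypothetical, and the test is `#[ignore]`d since it needs
// a previously built index on disk; it only illustrates the querying API.
#[cfg(all(test, feature = "tantivy-indexing"))]
mod tantivy_query_sketch {
    use tantivy::{collector::TopDocs, query::QueryParser, Index};

    #[test]
    #[ignore]
    fn query_built_index() {
        let index = Index::open_in_dir("public/tantivy-index").expect("open index"); // hypothetical path
        let reader = index.reader().expect("index reader");
        let searcher = reader.searcher();
        let schema = index.schema();
        let title = schema.get_field("title").expect("title field");
        let body = schema.get_field("body").expect("body field");
        let query = QueryParser::for_index(&index, vec![title, body])
            .parse_query("static site")
            .expect("parse query");
        // Print the ten best matches as JSON documents.
        for (_score, addr) in searcher.search(&query, &TopDocs::with_limit(10)).expect("search") {
            let doc = searcher.doc(addr).expect("fetch doc");
            println!("{}", schema.to_json(&doc));
        }
    }
}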