You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

295 lines
10KB

  1. #[allow(unused_imports)]
  2. use std::str::FromStr;
  3. use std::collections::{HashMap, HashSet};
  4. use elasticlunr::{Index, Language};
  5. use lazy_static::lazy_static;
  6. #[cfg(feature = "tantivy-indexing")]
  7. use chrono::{DateTime, Utc, NaiveDateTime, TimeZone};
  8. #[allow(unused_imports)]
  9. use errors::{bail, Result, Error};
  10. use library::{Library, Section};
/// Contents of the `elasticlunr.min.js` file that sits next to this source file,
/// embedded into the binary at compile time through `include_str!`.
pub const ELASTICLUNR_JS: &str = include_str!("elasticlunr.min.js");
  12. lazy_static! {
  13. static ref AMMONIA: ammonia::Builder<'static> = {
  14. let mut clean_content = HashSet::new();
  15. clean_content.insert("script");
  16. clean_content.insert("style");
  17. let mut builder = ammonia::Builder::new();
  18. builder
  19. .tags(HashSet::new())
  20. .tag_attributes(HashMap::new())
  21. .generic_attributes(HashSet::new())
  22. .link_rel(None)
  23. .allowed_classes(HashMap::new())
  24. .clean_content_tags(clean_content);
  25. builder
  26. };
  27. }
  28. /// Returns the generated JSON index with all the documents of the site added using
  29. /// the language given
  30. /// Errors if the language given is not available in Elasticlunr
  31. /// TODO: is making `in_search_index` apply to subsections of a `false` section useful?
  32. pub fn build_index(lang: &str, library: &Library) -> Result<Index> {
  33. let language = match Language::from_code(lang) {
  34. Some(l) => l,
  35. None => {
  36. bail!("Tried to build search index for language {} which is not supported", lang);
  37. }
  38. };
  39. let mut index = Index::with_language(language, &["title", "body"]);
  40. for section in library.sections_values() {
  41. if section.lang == lang {
  42. add_section_to_index(&mut index, section, library);
  43. }
  44. }
  45. Ok(index)
  46. }
  47. fn add_section_to_index(index: &mut Index, section: &Section, library: &Library) {
  48. if !section.meta.in_search_index {
  49. return;
  50. }
  51. // Don't index redirecting sections
  52. if section.meta.redirect_to.is_none() {
  53. index.add_doc(
  54. &section.permalink,
  55. &[
  56. &section.meta.title.clone().unwrap_or_default(),
  57. &AMMONIA.clean(&section.content).to_string(),
  58. ],
  59. );
  60. }
  61. for key in &section.pages {
  62. let page = library.get_page_by_key(*key);
  63. if !page.meta.in_search_index {
  64. continue;
  65. }
  66. index.add_doc(
  67. &page.permalink,
  68. &[
  69. &page.meta.title.clone().unwrap_or_default(),
  70. &AMMONIA.clean(&page.content).to_string(),
  71. ],
  72. );
  73. }
  74. }
  75. #[cfg(feature = "tantivy-indexing")]
  76. fn parse_language(lang: &str) -> Option<tantivy::tokenizer::Language> {
  77. use serde_derive::Deserialize;
  78. #[derive(Deserialize)]
  79. struct Lang {
  80. pub language: tantivy::tokenizer::Language,
  81. }
  82. // expecting two-character code, but will try other forms as fallback
  83. match lang.len() {
  84. 2 => isolang::Language::from_639_1(&lang.to_lowercase())
  85. .and_then(|parsed| {
  86. let json = format!("{{\"language\":\"{}\"}}", parsed.to_name());
  87. serde_json::from_str::<Lang>(&json).ok().map(|Lang { language }| language)
  88. }),
  89. 3 => isolang::Language::from_639_3(&lang.to_lowercase())
  90. .and_then(|parsed| {
  91. serde_json::from_str::<tantivy::tokenizer::Language>(parsed.to_name()).ok()
  92. }),
  93. // apparently not a code, so this is best available option
  94. _ => serde_json::from_str::<tantivy::tokenizer::Language>(&format!("{{\"language\":\"{}\"}}", lang)).ok()
  95. }
  96. }
  97. #[cfg(feature = "tantivy-indexing")]
  98. fn parse_dt_assume_utc(datetime_string: &Option<String>, naive_datetime: &Option<NaiveDateTime>) -> Option<DateTime<Utc>> {
  99. // start here because it will potentially have timezone in the string
  100. if let Some(s) = datetime_string.as_ref() {
  101. if let Ok(utc) = DateTime::from_str(s.as_str()) {
  102. return Some(utc)
  103. }
  104. }
  105. // otherwise, if we have the NaiveDateTime, we'll assume it's UTC. would not do this if the
  106. // stakes were higher!
  107. if let Some(naive) = naive_datetime {
  108. return Some(Utc.from_utc_datetime(&naive))
  109. }
  110. None
  111. }
  112. #[cfg(feature = "tantivy-indexing")]
  113. fn normalize_taxonomy_name(s: &str) -> String {
  114. s.replace("-", "_")
  115. }
#[cfg(feature = "tantivy-indexing")]
/// Builds a tantivy index in `index_dir` from the content of `library` written in `lang`.
///
/// Steps, in order:
/// 1. `lang` is resolved with `parse_language`; the tokenizer name is `en_stem` for
///    English and the lower-cased debug name of the language plus `_stem` otherwise.
/// 2. All sections and their pages are walked. An item is collected when it has not
///    been seen yet (by permalink), has `in_search_index` set and its `lang` equals
///    `lang`. Sections with `redirect_to` set are not collected themselves.
/// 3. The schema gets the text fields `title`, `body`, `description`, a stored-only
///    `permalink`, a stored and indexed `datetime` date field, and one text field per
///    normalized taxonomy name that does not collide with those names.
/// 4. The collected items are written as documents and committed.
///
/// Returns the number of collected items.
///
/// # Errors
/// Fails when the language cannot be parsed, when the index or its writer cannot be
/// created, or when committing / waiting for the merge threads fails.
pub fn build_tantivy_index<P: AsRef<std::path::Path>>(
    lang: &str,
    library: &Library,
    index_dir: P,
) -> Result<usize> {
    // NOTE: these function-local imports bring tantivy's `Index` (and, through the
    // glob, its `Language`) into scope for this body only.
    use tantivy::{schema::*, tokenizer::*, Index, Document};
    use tantivy::doc;
    let parsed_lang: Language = parse_language(lang)
        .ok_or_else(|| { Error::from(format!("failed to parse language: '{}'", lang)) })?;
    let tokenizer_name: String = match parsed_lang {
        Language::English => "en_stem".to_string(),
        other => format!("{:?}_stem", other).to_lowercase(),
    };
    // options shared by every text field that should be searchable and stored
    let text_indexing_options = TextFieldIndexing::default()
        .set_index_option(IndexRecordOption::WithFreqsAndPositions)
        .set_tokenizer(&tokenizer_name);
    let text_options = TextOptions::default()
        .set_indexing_options(text_indexing_options)
        .set_stored();
    // Everything needed to build one document; borrows from `library` except for
    // the cleaned body, which is a freshly allocated string.
    struct IndexContent<'a> {
        pub title: &'a str,
        pub description: &'a str,
        pub permalink: &'a str,
        pub body: String,
        pub datetime: Option<DateTime<Utc>>,
        pub taxonomies: &'a HashMap<String, Vec<String>>,
    }
    let mut seen: HashSet<String> = Default::default(); // unique permalinks already indexed
    let mut all_taxonomies: HashSet<String> = Default::default(); // remember any taxonomy used anywhere so we can add to schema
    let mut index_pages: Vec<IndexContent> = Vec::new();
    let mut n_indexed = 0;
    // placeholder the macro below points `taxonomies` at; pages overwrite it afterwards
    let empty_taxonomies: HashMap<String, Vec<String>> = Default::default();
    for section in library.sections_values() {
        //eprintln!("section: {:?}, section.pages: {:?}", section, section.pages);
        // reason for macro: Section/Page are different types but have same attributes
        // The macro evaluates to `Some(IndexContent)` when the item should be indexed
        // (and, as a side effect, records its permalink in `seen` and bumps
        // `n_indexed`), or to `None` otherwise.
        macro_rules! extract_content {
            ($page:ident) => {{
                let already_indexed = seen.contains(&$page.permalink);
                if ! already_indexed && $page.meta.in_search_index && $page.lang == lang {
                    seen.insert($page.permalink.clone()); // mark as indexed
                    n_indexed += 1;
                    let cleaned_body: String = AMMONIA.clean(&$page.content).to_string();
                    //eprintln!("indexing {}", $page.permalink.as_str());
                    Some(IndexContent {
                        title: $page.meta.title.as_ref().map(|x| x.as_str()).unwrap_or(""),
                        description: $page.meta.description.as_ref().map(|x| x.as_str()).unwrap_or(""),
                        permalink: $page.permalink.as_str(),
                        body: cleaned_body,
                        // page-only fields, leave blank
                        datetime: None,
                        taxonomies: &empty_taxonomies,
                    })
                } else {
                    //eprintln!("not indexing {}", $page.permalink.as_str());
                    None
                }
            }}
        }
        // redirecting sections are not indexed themselves; their pages still are
        if section.meta.redirect_to.is_none() {
            if let Some(content) = extract_content!(section) {
                index_pages.push(content);
            }
        }
        for key in &section.pages {
            let page = library.get_page_by_key(*key);
            match extract_content!(page) {
                Some(mut index_content) => {
                    // fill in the page-only fields the macro left blank
                    all_taxonomies.extend(page.meta.taxonomies.keys().map(|x| normalize_taxonomy_name(x)));
                    index_content.taxonomies = &page.meta.taxonomies;
                    index_content.datetime = parse_dt_assume_utc(&page.meta.date, &page.meta.datetime);
                    index_pages.push(index_content);
                }
                None => {}
            }
        }
    }
    // `fields` maps a field name to its schema handle so documents can be filled by name
    let mut schema = SchemaBuilder::new();
    let mut fields: HashMap<String, Field> = Default::default();
    for text_field_name in &["title", "body", "description"] {
        fields.insert(text_field_name.to_string(), schema.add_text_field(text_field_name, text_options.clone()));
    }
    fields.insert("permalink".to_string(), schema.add_text_field("permalink", STORED));
    fields.insert("datetime".to_string(), schema.add_date_field("datetime", STORED | INDEXED));
    // names taken by the built-in fields above; a taxonomy with one of these names
    // does not get a field of its own
    let reserved_field_names: HashSet<String> = fields.keys().map(|s| s.to_string()).collect();
    for taxonomy_name in all_taxonomies.difference(&reserved_field_names) {
        fields.insert(taxonomy_name.to_string(), schema.add_text_field(taxonomy_name.as_str(), text_options.clone()));
    }
    let schema = schema.build();
    let index = Index::create_in_dir(&index_dir, schema.clone())
        .map_err(|e| { Error::from(format!("creating tantivy index failed: {}", e)) })?;
    // take care of non-English stemmers if needed
    // (only registers when the index has no tokenizer under `tokenizer_name` yet)
    if index.tokenizers().get(&tokenizer_name).is_none() {
        let tokenizer = TextAnalyzer::from(SimpleTokenizer)
            .filter(RemoveLongFilter::limit(40))
            .filter(LowerCaser)
            .filter(Stemmer::new(parsed_lang));
        index.tokenizers().register(&tokenizer_name, tokenizer);
    }
    // NOTE(review): the argument is presumably the writer's memory budget in bytes
    // (256 MiB) - confirm against the pinned tantivy version.
    let mut wtr = index.writer(1024 * 1024 * 256)
        .map_err(|e| { Error::from(format!("creating tantivy index writer failed: {}", e)) })?;
    // now, let's index!
    for page in index_pages {
        let mut document: Document = doc!(
            fields["title"] => page.title,
            fields["description"] => page.description,
            fields["permalink"] => page.permalink,
            fields["body"] => page.body,
        );
        // sections never carry a datetime; pages only when one could be parsed
        if let Some(utc) = page.datetime {
            document.add_date(fields["datetime"], &utc);
        }
        // taxonomies whose name equals a built-in field name have no field of their
        // own (see the schema loop above), so they are skipped here as well
        for (taxonomy, terms) in page.taxonomies.iter().filter(|(k, _)| ! reserved_field_names.contains(k.as_str())) {
            let normalized_taxonomy = normalize_taxonomy_name(taxonomy);
            for term in terms.iter() {
                document.add_text(fields[&normalized_taxonomy], term);
            }
        }
        wtr.add_document(document);
    }
    //wtr.prepare_commit().map_err(|e| { Error::from(format!("tantivy IndexWriter::commit failed: {}", e)) })?;
    wtr.commit().map_err(|e| { Error::from(format!("tantivy IndexWriter::commit failed: {}", e)) })?;
    wtr.wait_merging_threads().map_err(|e| { Error::from(format!("tantivy IndexWriter::wait_merging_threads failed: {}", e)) })?;
    // explicitly release the index handle before reporting the count
    drop(index);
    Ok(n_indexed)
}