You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

230 lines
7.8KB

  1. use std::collections::{HashMap, HashSet};
  2. use elasticlunr::{Index, Language};
  3. use lazy_static::lazy_static;
  4. use errors::{bail, Result, Error};
  5. use library::{Library, Section};
  6. pub const ELASTICLUNR_JS: &str = include_str!("elasticlunr.min.js");
  7. lazy_static! {
  8. static ref AMMONIA: ammonia::Builder<'static> = {
  9. let mut clean_content = HashSet::new();
  10. clean_content.insert("script");
  11. clean_content.insert("style");
  12. let mut builder = ammonia::Builder::new();
  13. builder
  14. .tags(HashSet::new())
  15. .tag_attributes(HashMap::new())
  16. .generic_attributes(HashSet::new())
  17. .link_rel(None)
  18. .allowed_classes(HashMap::new())
  19. .clean_content_tags(clean_content);
  20. builder
  21. };
  22. }
  23. /// Returns the generated JSON index with all the documents of the site added using
  24. /// the language given
  25. /// Errors if the language given is not available in Elasticlunr
  26. /// TODO: is making `in_search_index` apply to subsections of a `false` section useful?
  27. pub fn build_index(lang: &str, library: &Library) -> Result<String> {
  28. let language = match Language::from_code(lang) {
  29. Some(l) => l,
  30. None => {
  31. bail!("Tried to build search index for language {} which is not supported", lang);
  32. }
  33. };
  34. let mut index = Index::with_language(language, &["title", "body"]);
  35. for section in library.sections_values() {
  36. if section.lang == lang {
  37. add_section_to_index(&mut index, section, library);
  38. }
  39. }
  40. Ok(index.to_json())
  41. }
  42. fn add_section_to_index(index: &mut Index, section: &Section, library: &Library) {
  43. if !section.meta.in_search_index {
  44. return;
  45. }
  46. // Don't index redirecting sections
  47. if section.meta.redirect_to.is_none() {
  48. index.add_doc(
  49. &section.permalink,
  50. &[
  51. &section.meta.title.clone().unwrap_or_default(),
  52. &AMMONIA.clean(&section.content).to_string(),
  53. ],
  54. );
  55. }
  56. for key in &section.pages {
  57. let page = library.get_page_by_key(*key);
  58. if !page.meta.in_search_index {
  59. continue;
  60. }
  61. index.add_doc(
  62. &page.permalink,
  63. &[
  64. &page.meta.title.clone().unwrap_or_default(),
  65. &AMMONIA.clean(&page.content).to_string(),
  66. ],
  67. );
  68. }
  69. }
  70. #[cfg(feature = "tantivy-indexing")]
  71. fn parse_language(lang: &str) -> Option<tantivy::tokenizer::Language> {
  72. use serde_derive::Deserialize;
  73. #[derive(Deserialize)]
  74. struct Lang {
  75. pub language: tantivy::tokenizer::Language,
  76. }
  77. // expecting two-character code, but will try other forms as fallback
  78. match lang.len() {
  79. 2 => isolang::Language::from_639_1(&lang.to_lowercase())
  80. .and_then(|parsed| {
  81. let json = format!("{{\"language\":\"{}\"}}", parsed.to_name());
  82. serde_json::from_str::<Lang>(&json).ok().map(|Lang { language }| language)
  83. }),
  84. 3 => isolang::Language::from_639_3(&lang.to_lowercase())
  85. .and_then(|parsed| {
  86. serde_json::from_str::<tantivy::tokenizer::Language>(parsed.to_name()).ok()
  87. }),
  88. // apparently not a code, so this is best available option
  89. _ => serde_json::from_str::<tantivy::tokenizer::Language>(lang).ok()
  90. }
  91. }
  92. #[cfg(feature = "tantivy-indexing")]
  93. pub fn build_tantivy_index(
  94. lang: &str,
  95. library: &Library,
  96. output_dir: &str,
  97. //skip_section_pages: bool,
  98. ) -> Result<()> {
  99. use tantivy::{schema::*, tokenizer::*, Index, Document};
  100. use tantivy::doc;
  101. let parsed_lang: Language = parse_language(lang)
  102. .ok_or_else(|| { Error::from(format!("failed to parse language: '{}'", lang)) })?;
  103. let tokenizer_name: String = match parsed_lang {
  104. Language::English => "en_stem".to_string(),
  105. other => format!("{:?}_stem", parsed_lang).to_lowercase(),
  106. };
  107. let mut text_indexing_options = TextFieldIndexing::default()
  108. .set_index_option(IndexRecordOption::WithFreqsAndPositions)
  109. .set_tokenizer(&tokenizer_name);
  110. let text_options = TextOptions::default()
  111. .set_indexing_options(text_indexing_options)
  112. .set_stored();
  113. let mut schema = SchemaBuilder::new();
  114. let title = schema.add_text_field("title", text_options.clone());
  115. let body = schema.add_text_field("body", text_options.clone());
  116. let permalink = schema.add_text_field("permalink", STORED);
  117. let schema = schema.build();
  118. let index_dir = std::path::Path::new(output_dir).join("tantivy-index");
  119. //utils::fs::ensure_directory_exists(&index_dir)?;
  120. let mut index = Index::create_in_dir(&index_dir, schema.clone())
  121. .map_err(|e| { Error::from(format!("creating tantivy index failed: {}", e)) })?;
  122. if index.tokenizers().get(&tokenizer_name).is_none() { // if non-english, we need to register stemmer
  123. let tokenizer = TextAnalyzer::from(SimpleTokenizer)
  124. .filter(RemoveLongFilter::limit(40))
  125. .filter(LowerCaser)
  126. .filter(Stemmer::new(parsed_lang));
  127. index.tokenizers().register(&tokenizer_name, tokenizer);
  128. }
  129. //let mut wtr = index.writer_with_num_threads(num_cpus::get_physical(), 1024 * 1024 * 256)
  130. //let mut wtr = index.writer_with_num_threads(4, 1024 * 1024 * 256)
  131. let mut wtr = index.writer(1024 * 1024 * 256)
  132. .map_err(|e| { Error::from(format!("creating tantivy index writer failed: {}", e)) })?;
  133. //index_writer.set_merge_policy(Box::new(NoMergePolicy));
  134. //let mut sections_it = library.sections_values().iter().filter(|s| s.lang == lang && s.meta.in_search_index);
  135. let mut seen: HashSet<String> = Default::default();
  136. let mut n_indexed = 0;
  137. //let group_size = 100_000;
  138. for section in library.sections_values() {
  139. if section.lang != lang { continue }
  140. if ! section.meta.in_search_index { continue }
  141. // Don't index redirecting sections
  142. //if section.meta.redirect_to.is_none() {
  143. // index.add_doc(
  144. // &section.permalink,
  145. // &[
  146. // &section.meta.title.clone().unwrap_or_default(),
  147. // &AMMONIA.clean(&section.content).to_string(),
  148. // ],
  149. // );
  150. //}
  151. for key in &section.pages {
  152. let page = library.get_page_by_key(*key);
  153. if !page.meta.in_search_index { continue; }
  154. if seen.contains(&page.permalink) { continue }
  155. seen.insert(page.permalink.clone());
  156. //let mut doc = Document::default();
  157. //doc.add(FieldValue::new(title, Value::from(page.meta.title.as_ref().map(|x| x.as_str()).unwrap_or(""))));
  158. let cleaned_body: String = AMMONIA.clean(&page.content).to_string();
  159. //doc.add(FieldValue::new(body, Value::from(cleaned_body.as_str())));
  160. //doc.add(FieldValue::new(permalink, Value::from(page.permalink.as_str())));
  161. let opstamp = wtr.add_document(doc!(
  162. title => page.meta.title.as_ref().map(|x| x.as_str()).unwrap_or(""),
  163. body => cleaned_body.as_str(),
  164. permalink => page.permalink.as_str(),
  165. ));
  166. println!("added {:?} {}", opstamp, page.permalink);
  167. n_indexed += 1;
  168. //if n_indexed % group_size == 0 { }
  169. }
  170. }
  171. wtr.prepare_commit().map_err(|e| { Error::from(format!("tantivy IndexWriter::commit failed: {}", e)) })?;
  172. let commit_opstamp = wtr.commit().map_err(|e| { Error::from(format!("tantivy IndexWriter::commit failed: {}", e)) })?;
  173. wtr.wait_merging_threads().map_err(|e| { Error::from(format!("tantivy IndexWriter::wait_merging_threads failed: {}", e)) })?;
  174. drop(index);
  175. println!("finished indexing {} pages", n_indexed);
  176. Ok(())
  177. }