You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

lib.rs 6.8KB

6 years ago
5 years ago
6 years ago
6 years ago
6 years ago
5 years ago
6 years ago
6 years ago
6 years ago
6 years ago
5 years ago
6 years ago
6 years ago
5 years ago
6 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198
  1. use std::collections::{HashMap, HashSet};
  2. use elasticlunr::{Index, Language};
  3. use lazy_static::lazy_static;
  4. #[allow(unused_imports)]
  5. use errors::{bail, Result, Error};
  6. use library::{Library, Section};
  7. pub const ELASTICLUNR_JS: &str = include_str!("elasticlunr.min.js");
  8. lazy_static! {
  9. static ref AMMONIA: ammonia::Builder<'static> = {
  10. let mut clean_content = HashSet::new();
  11. clean_content.insert("script");
  12. clean_content.insert("style");
  13. let mut builder = ammonia::Builder::new();
  14. builder
  15. .tags(HashSet::new())
  16. .tag_attributes(HashMap::new())
  17. .generic_attributes(HashSet::new())
  18. .link_rel(None)
  19. .allowed_classes(HashMap::new())
  20. .clean_content_tags(clean_content);
  21. builder
  22. };
  23. }
  24. /// Returns the generated JSON index with all the documents of the site added using
  25. /// the language given
  26. /// Errors if the language given is not available in Elasticlunr
  27. /// TODO: is making `in_search_index` apply to subsections of a `false` section useful?
  28. pub fn build_index(lang: &str, library: &Library) -> Result<Index> {
  29. let language = match Language::from_code(lang) {
  30. Some(l) => l,
  31. None => {
  32. bail!("Tried to build search index for language {} which is not supported", lang);
  33. }
  34. };
  35. let mut index = Index::with_language(language, &["title", "body"]);
  36. for section in library.sections_values() {
  37. if section.lang == lang {
  38. add_section_to_index(&mut index, section, library);
  39. }
  40. }
  41. Ok(index)
  42. }
  43. fn add_section_to_index(index: &mut Index, section: &Section, library: &Library) {
  44. if !section.meta.in_search_index {
  45. return;
  46. }
  47. // Don't index redirecting sections
  48. if section.meta.redirect_to.is_none() {
  49. index.add_doc(
  50. &section.permalink,
  51. &[
  52. &section.meta.title.clone().unwrap_or_default(),
  53. &AMMONIA.clean(&section.content).to_string(),
  54. ],
  55. );
  56. }
  57. for key in &section.pages {
  58. let page = library.get_page_by_key(*key);
  59. if !page.meta.in_search_index {
  60. continue;
  61. }
  62. index.add_doc(
  63. &page.permalink,
  64. &[
  65. &page.meta.title.clone().unwrap_or_default(),
  66. &AMMONIA.clean(&page.content).to_string(),
  67. ],
  68. );
  69. }
  70. }
  71. #[cfg(feature = "tantivy-indexing")]
  72. fn parse_language(lang: &str) -> Option<tantivy::tokenizer::Language> {
  73. use serde_derive::Deserialize;
  74. #[derive(Deserialize)]
  75. struct Lang {
  76. pub language: tantivy::tokenizer::Language,
  77. }
  78. // expecting two-character code, but will try other forms as fallback
  79. match lang.len() {
  80. 2 => isolang::Language::from_639_1(&lang.to_lowercase())
  81. .and_then(|parsed| {
  82. let json = format!("{{\"language\":\"{}\"}}", parsed.to_name());
  83. serde_json::from_str::<Lang>(&json).ok().map(|Lang { language }| language)
  84. }),
  85. 3 => isolang::Language::from_639_3(&lang.to_lowercase())
  86. .and_then(|parsed| {
  87. serde_json::from_str::<tantivy::tokenizer::Language>(parsed.to_name()).ok()
  88. }),
  89. // apparently not a code, so this is best available option
  90. _ => serde_json::from_str::<tantivy::tokenizer::Language>(lang).ok()
  91. }
  92. }
  93. #[cfg(feature = "tantivy-indexing")]
  94. pub fn build_tantivy_index<P: AsRef<std::path::Path>>(
  95. lang: &str,
  96. library: &Library,
  97. index_dir: P,
  98. ) -> Result<usize> {
  99. use tantivy::{schema::*, tokenizer::*, Index, Document};
  100. use tantivy::doc;
  101. let parsed_lang: Language = parse_language(lang)
  102. .ok_or_else(|| { Error::from(format!("failed to parse language: '{}'", lang)) })?;
  103. let tokenizer_name: String = match parsed_lang {
  104. Language::English => "en_stem".to_string(),
  105. other => format!("{:?}_stem", other).to_lowercase(),
  106. };
  107. let text_indexing_options = TextFieldIndexing::default()
  108. .set_index_option(IndexRecordOption::WithFreqsAndPositions)
  109. .set_tokenizer(&tokenizer_name);
  110. let text_options = TextOptions::default()
  111. .set_indexing_options(text_indexing_options)
  112. .set_stored();
  113. let mut schema = SchemaBuilder::new();
  114. let title = schema.add_text_field("title", text_options.clone());
  115. let body = schema.add_text_field("body", text_options.clone());
  116. let permalink = schema.add_text_field("permalink", STORED);
  117. let schema = schema.build();
  118. let index = Index::create_in_dir(&index_dir, schema.clone())
  119. .map_err(|e| { Error::from(format!("creating tantivy index failed: {}", e)) })?;
  120. if index.tokenizers().get(&tokenizer_name).is_none() { // if non-english, we need to register stemmer
  121. let tokenizer = TextAnalyzer::from(SimpleTokenizer)
  122. .filter(RemoveLongFilter::limit(40))
  123. .filter(LowerCaser)
  124. .filter(Stemmer::new(parsed_lang));
  125. index.tokenizers().register(&tokenizer_name, tokenizer);
  126. }
  127. let mut wtr = index.writer(1024 * 1024 * 256)
  128. .map_err(|e| { Error::from(format!("creating tantivy index writer failed: {}", e)) })?;
  129. let mut seen: HashSet<String> = Default::default();
  130. let mut n_indexed = 0;
  131. for section in library.sections_values() {
  132. // reason for macro: Section/Page are different types but have same attributes
  133. macro_rules! index_page {
  134. ($page:ident) => {{
  135. let already_indexed = seen.contains(&$page.permalink);
  136. if ! already_indexed && $page.meta.in_search_index && $page.lang == lang {
  137. seen.insert($page.permalink.clone()); // mark ask indexed
  138. let cleaned_body: String = AMMONIA.clean(&$page.content).to_string();
  139. let page_doc: Document = doc!(
  140. title => $page.meta.title.as_ref().map(|x| x.as_str()).unwrap_or(""),
  141. body => cleaned_body.as_str(),
  142. permalink => $page.permalink.as_str(),
  143. );
  144. wtr.add_document(page_doc);
  145. n_indexed += 1;
  146. }
  147. }}
  148. }
  149. if section.meta.redirect_to.is_none() {
  150. index_page!(section);
  151. }
  152. for key in &section.pages {
  153. let page = library.get_page_by_key(*key);
  154. index_page!(page);
  155. }
  156. }
  157. //wtr.prepare_commit().map_err(|e| { Error::from(format!("tantivy IndexWriter::commit failed: {}", e)) })?;
  158. wtr.commit().map_err(|e| { Error::from(format!("tantivy IndexWriter::commit failed: {}", e)) })?;
  159. wtr.wait_merging_threads().map_err(|e| { Error::from(format!("tantivy IndexWriter::wait_merging_threads failed: {}", e)) })?;
  160. drop(index);
  161. Ok(n_indexed)
  162. }