@@ -34,12 +34,14 @@ ws = "0.9" | |||||
ctrlc = "3" | ctrlc = "3" | ||||
open = "1.2" | open = "1.2" | ||||
globset = "0.4" | globset = "0.4" | ||||
tantivy = { version = "0.12", optional = true } | |||||
site = { path = "components/site" } | site = { path = "components/site" } | ||||
errors = { path = "components/errors" } | errors = { path = "components/errors" } | ||||
front_matter = { path = "components/front_matter" } | front_matter = { path = "components/front_matter" } | ||||
utils = { path = "components/utils" } | utils = { path = "components/utils" } | ||||
rebuild = { path = "components/rebuild" } | rebuild = { path = "components/rebuild" } | ||||
search = { path = "components/search" } | |||||
[workspace] | [workspace] | ||||
members = [ | members = [ | ||||
@@ -57,6 +59,9 @@ members = [ | |||||
"components/library", | "components/library", | ||||
] | ] | ||||
[features] | |||||
tantivy-indexing = ["tantivy", "search/tantivy-indexing"] | |||||
[profile.release] | [profile.release] | ||||
lto = true | lto = true | ||||
codegen-units = 1 | codegen-units = 1 |
@@ -8,6 +8,16 @@ edition = "2018" | |||||
elasticlunr-rs = "2" | elasticlunr-rs = "2" | ||||
ammonia = "3" | ammonia = "3" | ||||
lazy_static = "1" | lazy_static = "1" | ||||
tantivy = { version = "0.12", optional = true } | |||||
isolang = { version = "1.0", optional = true } | |||||
serde = { version = "1.0", optional = true } | |||||
serde_derive = { version = "1.0", optional = true } | |||||
serde_json = { version = "1.0", optional = true } | |||||
num_cpus = { version = "1.12", optional = true } | |||||
errors = { path = "../errors" } | errors = { path = "../errors" } | ||||
library = { path = "../library" } | library = { path = "../library" } | ||||
[features] | |||||
default = [] | |||||
tantivy-indexing = ["isolang", "serde", "serde_derive", "serde_json", "num_cpus", "tantivy"] |
@@ -3,7 +3,7 @@ use std::collections::{HashMap, HashSet}; | |||||
use elasticlunr::{Index, Language}; | use elasticlunr::{Index, Language}; | ||||
use lazy_static::lazy_static; | use lazy_static::lazy_static; | ||||
use errors::{bail, Result}; | |||||
use errors::{bail, Result, Error}; | |||||
use library::{Library, Section}; | use library::{Library, Section}; | ||||
pub const ELASTICLUNR_JS: &str = include_str!("elasticlunr.min.js"); | pub const ELASTICLUNR_JS: &str = include_str!("elasticlunr.min.js"); | ||||
@@ -79,3 +79,150 @@ fn add_section_to_index(index: &mut Index, section: &Section, library: &Library) | |||||
); | ); | ||||
} | } | ||||
} | } | ||||
#[cfg(feature = "tantivy-indexing")]
/// Resolves a user-supplied language identifier to a tantivy stemmer language.
///
/// Accepts an ISO 639-1 two-letter code (preferred), an ISO 639-3 three-letter
/// code, or a full English language name (e.g. "English") as a fallback.
/// Returns `None` when the input cannot be mapped to a language tantivy
/// supports.
fn parse_language(lang: &str) -> Option<tantivy::tokenizer::Language> {
    // `tantivy::tokenizer::Language` derives serde::Deserialize, so a variant
    // can be parsed from a JSON string literal such as `"German"`. The name
    // must be wrapped in quotes to form valid JSON: the previous code only did
    // this in the two-letter branch (via a throwaway wrapper struct); the
    // other branches fed a bare word to serde_json and therefore always failed.
    fn from_name(name: &str) -> Option<tantivy::tokenizer::Language> {
        serde_json::from_str(&format!("\"{}\"", name)).ok()
    }

    // Expecting a two-character code, but try other forms as fallback.
    match lang.len() {
        2 => isolang::Language::from_639_1(&lang.to_lowercase())
            .and_then(|parsed| from_name(parsed.to_name())),
        3 => isolang::Language::from_639_3(&lang.to_lowercase())
            .and_then(|parsed| from_name(parsed.to_name())),
        // Apparently not a code, so treat the input as a language name.
        _ => from_name(lang),
    }
}
#[cfg(feature = "tantivy-indexing")]
/// Builds a tantivy search index for all indexable pages of `library` written
/// in `lang`, storing it under `<output_dir>/tantivy-index`.
///
/// `lang` may be an ISO 639-1/639-3 code or an English language name (see
/// `parse_language`). Sections and pages with `in_search_index = false`, and
/// sections in other languages, are skipped. Currently only the page title and
/// permalink are indexed; the body field is disabled.
///
/// Errors if the language cannot be resolved or any tantivy operation fails.
pub fn build_tantivy_index(lang: &str, library: &Library, output_dir: &str) -> Result<()> {
    use tantivy::{doc, schema::*, tokenizer::*, Index};

    let parsed_lang: Language = parse_language(lang)
        .ok_or_else(|| Error::from(format!("failed to parse language: '{}'", lang)))?;

    // tantivy registers "en_stem" out of the box; other languages follow the
    // same "<lang>_stem" naming scheme but must be registered manually below.
    let tokenizer_name: String = match parsed_lang {
        Language::English => "en_stem".to_string(),
        _ => format!("{:?}_stem", parsed_lang).to_lowercase(),
    };

    let text_indexing_options = TextFieldIndexing::default()
        .set_index_option(IndexRecordOption::WithFreqsAndPositions)
        .set_tokenizer(&tokenizer_name);
    let text_options = TextOptions::default()
        .set_indexing_options(text_indexing_options)
        .set_stored();

    let mut schema = SchemaBuilder::new();
    let title = schema.add_text_field("title", text_options.clone());
    let permalink = schema.add_text_field("permalink", STORED);
    let schema = schema.build();

    // NOTE(review): the caller is expected to have created `output_dir`
    // already; `Index::create_in_dir` fails if the directory is missing.
    let index_dir = std::path::Path::new(output_dir).join("tantivy-index");
    let index = Index::create_in_dir(&index_dir, schema.clone())
        .map_err(|e| Error::from(format!("creating tantivy index failed: {}", e)))?;

    // If non-English, register the stemmer under the name the schema's
    // tokenizer configuration refers to.
    if index.tokenizers().get(&tokenizer_name).is_none() {
        let tokenizer = TextAnalyzer::from(SimpleTokenizer)
            .filter(RemoveLongFilter::limit(40))
            .filter(LowerCaser)
            .filter(Stemmer::new(parsed_lang));
        index.tokenizers().register(&tokenizer_name, tokenizer);
    }

    // 256 MiB writer heap.
    let mut wtr = index
        .writer(1024 * 1024 * 256)
        .map_err(|e| Error::from(format!("creating tantivy index writer failed: {}", e)))?;

    for section in library.sections_values() {
        if section.lang != lang || !section.meta.in_search_index {
            continue;
        }
        // Previous version wrapped this in `for _ in 0..16`, adding every page
        // to the index 16 times — a stress-test leftover, now removed.
        for key in &section.pages {
            let page = library.get_page_by_key(*key);
            if !page.meta.in_search_index {
                continue;
            }
            // Body indexing is intentionally disabled for now; when enabled,
            // sanitize with `AMMONIA.clean(&page.content)` before adding.
            wtr.add_document(doc!(
                title => page.meta.title.as_ref().map(|x| x.as_str()).unwrap_or(""),
                permalink => page.permalink.as_str(),
            ));
        }
    }

    // A plain commit is sufficient; the earlier explicit `prepare_commit`
    // whose result was discarded before committing again was redundant.
    wtr.commit()
        .map_err(|e| Error::from(format!("tantivy IndexWriter::commit failed: {}", e)))?;
    wtr.wait_merging_threads()
        .map_err(|e| Error::from(format!("tantivy IndexWriter::wait_merging_threads failed: {}", e)))?;

    Ok(())
}
@@ -54,7 +54,7 @@ impl Site { | |||||
config.load_extra_syntaxes(path)?; | config.load_extra_syntaxes(path)?; | ||||
let tpl_glob = | let tpl_glob = | ||||
format!("{}/{}", path.to_string_lossy().replace("\\", "/"), "templates/**/*.*ml"); | |||||
format!("{}/{}", path.to_string_lossy().replace("\\", "/"), "templates/**/*.*ml"); // " | |||||
// Only parsing as we might be extending templates from themes and that would error | // Only parsing as we might be extending templates from themes and that would error | ||||
// as we haven't loaded them yet | // as we haven't loaded them yet | ||||
let mut tera = | let mut tera = | ||||
@@ -71,7 +71,7 @@ impl Site { | |||||
let theme_tpl_glob = format!( | let theme_tpl_glob = format!( | ||||
"{}/{}", | "{}/{}", | ||||
path.to_string_lossy().replace("\\", "/"), | |||||
path.to_string_lossy().replace("\\", "/"), // " | |||||
format!("themes/{}/templates/**/*.*ml", theme) | format!("themes/{}/templates/**/*.*ml", theme) | ||||
); | ); | ||||
let mut tera_theme = Tera::parse(&theme_tpl_glob) | let mut tera_theme = Tera::parse(&theme_tpl_glob) | ||||
@@ -165,7 +165,7 @@ impl Site { | |||||
/// Reads all .md files in the `content` directory and create pages/sections | /// Reads all .md files in the `content` directory and create pages/sections | ||||
/// out of them | /// out of them | ||||
pub fn load(&mut self) -> Result<()> { | pub fn load(&mut self) -> Result<()> { | ||||
let base_path = self.base_path.to_string_lossy().replace("\\", "/"); | |||||
let base_path = self.base_path.to_string_lossy().replace("\\", "/"); // " | |||||
let content_glob = format!("{}/{}", base_path, "content/**/*.md"); | let content_glob = format!("{}/{}", base_path, "content/**/*.md"); | ||||
let (section_entries, page_entries): (Vec<_>, Vec<_>) = glob(&content_glob) | let (section_entries, page_entries): (Vec<_>, Vec<_>) = glob(&content_glob) | ||||
@@ -98,26 +98,36 @@ pub fn build_cli() -> App<'static, 'static> { | |||||
]), | ]), | ||||
SubCommand::with_name("index") | SubCommand::with_name("index") | ||||
.about("Create a search index as a stand-alone task, and with additional options") | .about("Create a search index as a stand-alone task, and with additional options") | ||||
.args(&[ | |||||
Arg::with_name("index_type") | |||||
.long("index-type") | |||||
.short("t") | |||||
.takes_value(true) | |||||
.possible_values(&["elasticlunr", "tantivy"]) | |||||
.required(true) | |||||
.help("what kind of search index to build"), | |||||
Arg::with_name("output_dir") | |||||
.short("o") | |||||
.long("output-dir") | |||||
.default_value("public") | |||||
.takes_value(true) | |||||
.help("Outputs the generated search index files into the provided dir. \ | |||||
Note: Tantivy indexing produces a directory instead of a file, \ | |||||
which will be located at output-dir/tantivy-index"), | |||||
Arg::with_name("drafts") | |||||
.long("drafts") | |||||
.args({ | |||||
let drafts = Arg::with_name("drafts") .long("drafts") | |||||
.takes_value(false) | .takes_value(false) | ||||
.help("Include drafts when loading the site"), | |||||
]), | |||||
.help("Include drafts when loading the site"); | |||||
#[cfg(feature = "tantivy-indexing")] | |||||
{ | |||||
let index_type = Arg::with_name("index_type") | |||||
.long("index-type") | |||||
.short("t") | |||||
.takes_value(true) | |||||
.possible_values(&["elasticlunr", "tantivy"]) | |||||
.required(true) | |||||
.help("what kind of search index to build"); | |||||
let output_dir = Arg::with_name("output_dir") | |||||
.short("o") | |||||
.long("output-dir") | |||||
.default_value("public") | |||||
.takes_value(true) | |||||
.help("Outputs the generated search index files into the provided dir. \ | |||||
Note: Tantivy indexing produces a directory instead of a file, \ | |||||
which will be located at output-dir/tantivy-index"); | |||||
&[drafts, index_type, output_dir] | |||||
} | |||||
#[cfg(not(feature = "tantivy-indexing"))] | |||||
{ | |||||
&[drafts] | |||||
} | |||||
}), | |||||
]) | ]) | ||||
} | } |
@@ -30,16 +30,28 @@ pub fn index( | |||||
// somehow impacting the search indexing? doesn't seem like | // somehow impacting the search indexing? doesn't seem like | ||||
// it could, but maybe | // it could, but maybe | ||||
match index_type { | |||||
"elasticlunr" => { | |||||
site.build_search_index()?; | |||||
#[cfg(feature = "tantivy-indexing")] | |||||
{ | |||||
match index_type { | |||||
"elasticlunr" => { | |||||
site.build_search_index()?; | |||||
} | |||||
"tantivy" => { | |||||
//if ! Path::new(output_dir).exists() { | |||||
// std::fs::create_dir_all(output_dir)?; | |||||
//} | |||||
let index_dir = Path::new(output_dir).join("tantivy-index"); | |||||
utils::fs::ensure_directory_exists(&index_dir)?; | |||||
let lang = &site.config.default_language; | |||||
let library = site.library.read().unwrap(); // unwrap originally in Site::build_search_index, just parroting here, no idea if safe | |||||
search::build_tantivy_index(lang, &library, output_dir)?; | |||||
} | |||||
_ => unreachable!() | |||||
} | } | ||||
"tantivy" => { | |||||
unimplemented!() | |||||
} | |||||
_ => unreachable!() | |||||
} | } | ||||
Ok(()) | Ok(()) | ||||