Browse Source

tantivy indexing, doesn't crash, but index produced is corrupt somehow

index-subcmd
Jonathan Strong 1 year ago
parent
commit
a043f641ea
6 changed files with 217 additions and 33 deletions
  1. +5
    -0
      Cargo.toml
  2. +10
    -0
      components/search/Cargo.toml
  3. +148
    -1
      components/search/src/lib.rs
  4. +3
    -3
      components/site/src/lib.rs
  5. +30
    -20
      src/cli.rs
  6. +21
    -9
      src/cmd/index.rs

+ 5
- 0
Cargo.toml View File

@@ -34,12 +34,14 @@ ws = "0.9"
ctrlc = "3"
open = "1.2"
globset = "0.4"
tantivy = { version = "0.12", optional = true }

site = { path = "components/site" }
errors = { path = "components/errors" }
front_matter = { path = "components/front_matter" }
utils = { path = "components/utils" }
rebuild = { path = "components/rebuild" }
search = { path = "components/search" }

[workspace]
members = [
@@ -57,6 +59,9 @@ members = [
"components/library",
]

[features]
tantivy-indexing = ["tantivy", "search/tantivy-indexing"]

[profile.release]
lto = true
codegen-units = 1

+ 10
- 0
components/search/Cargo.toml View File

@@ -8,6 +8,16 @@ edition = "2018"
elasticlunr-rs = "2"
ammonia = "3"
lazy_static = "1"
tantivy = { version = "0.12", optional = true }
isolang = { version = "1.0", optional = true }
serde = { version = "1.0", optional = true }
serde_derive = { version = "1.0", optional = true }
serde_json = { version = "1.0", optional = true }
num_cpus = { version = "1.12", optional = true }

errors = { path = "../errors" }
library = { path = "../library" }

[features]
default = []
tantivy-indexing = ["isolang", "serde", "serde_derive", "serde_json", "num_cpus", "tantivy"]

+ 148
- 1
components/search/src/lib.rs View File

@@ -3,7 +3,7 @@ use std::collections::{HashMap, HashSet};
use elasticlunr::{Index, Language};
use lazy_static::lazy_static;

use errors::{bail, Result};
use errors::{bail, Result, Error};
use library::{Library, Section};

pub const ELASTICLUNR_JS: &str = include_str!("elasticlunr.min.js");
@@ -79,3 +79,150 @@ fn add_section_to_index(index: &mut Index, section: &Section, library: &Library)
);
}
}


#[cfg(feature = "tantivy-indexing")]
/// Resolves a language identifier to the tantivy stemmer language it names.
///
/// The identifier is interpreted according to its byte length:
///
/// * 2 bytes: treated as an ISO 639-1 code (case-insensitive), e.g. `"en"`.
/// * 3 bytes: treated as an ISO 639-3 code (case-insensitive), e.g. `"eng"`.
/// * anything else: treated as a language name, e.g. `"English"`.
///
/// Codes are first turned into a language name via `isolang`; the name is then
/// matched against `tantivy::tokenizer::Language` through its serde
/// `Deserialize` implementation.
///
/// Returns `None` when the code is unknown to `isolang` or when the resulting
/// name does not deserialize into a tantivy language.
fn parse_language(lang: &str) -> Option<tantivy::tokenizer::Language> {
    /// Deserializes a language name into the tantivy enum.
    ///
    /// The name is wrapped in a JSON string *value* rather than spliced into JSON
    /// text: a bare name such as `English` is not valid JSON, and building the
    /// value directly also avoids any escaping concerns for arbitrary input.
    fn from_name(name: &str) -> Option<tantivy::tokenizer::Language> {
        serde_json::from_value(serde_json::Value::String(name.to_owned())).ok()
    }

    // Expecting a two-character code, but other forms are tried as a fallback.
    match lang.len() {
        2 => isolang::Language::from_639_1(&lang.to_lowercase())
            .and_then(|parsed| from_name(parsed.to_name())),

        3 => isolang::Language::from_639_3(&lang.to_lowercase())
            .and_then(|parsed| from_name(parsed.to_name())),

        // Apparently not a code, so interpreting it as a name is the best available option.
        _ => from_name(lang),
    }
}

#[cfg(feature = "tantivy-indexing")]
/// Builds a tantivy search index for every searchable page written in `lang`.
///
/// The index is created in `<output_dir>/tantivy-index` with two fields:
///
/// * `title`: stored and indexed with frequencies and positions, using a
///   stemming tokenizer for `lang`;
/// * `permalink`: stored only.
///
/// Sections whose language differs from `lang`, or that are excluded from the
/// search index, are skipped together with their pages; individual pages can
/// opt out through their own `in_search_index` flag. Page bodies are not
/// indexed yet.
///
/// NOTE(review): `Index::create_in_dir` is handed the directory as-is, so the
/// caller is presumably expected to have created it beforehand — confirm.
///
/// NOTE(review): a non-English stemming tokenizer is registered on this `Index`
/// instance only; readers opening the index later presumably have to register a
/// tokenizer under the same name — confirm against the tantivy documentation.
///
/// # Errors
///
/// Returns an error when `lang` cannot be mapped to a tantivy stemmer
/// language, or when creating the index, creating its writer, committing, or
/// waiting for the merge threads fails.
pub fn build_tantivy_index(
    lang: &str,
    library: &Library,
    output_dir: &str,
) -> Result<()> {
    use tantivy::doc;
    use tantivy::{schema::*, tokenizer::*, Index};

    let parsed_lang: tantivy::tokenizer::Language = parse_language(lang)
        .ok_or_else(|| Error::from(format!("failed to parse language: '{}'", lang)))?;

    // "en_stem" is the name looked up for English; every other language gets a
    // name derived from its variant, e.g. `French` -> "french_stem".
    let tokenizer_name: String = match parsed_lang {
        tantivy::tokenizer::Language::English => "en_stem".to_string(),
        other => format!("{:?}_stem", other).to_lowercase(),
    };

    let text_indexing_options = TextFieldIndexing::default()
        .set_index_option(IndexRecordOption::WithFreqsAndPositions)
        .set_tokenizer(&tokenizer_name);

    let text_options = TextOptions::default()
        .set_indexing_options(text_indexing_options)
        .set_stored();

    let mut schema = SchemaBuilder::new();
    let title = schema.add_text_field("title", text_options);
    let permalink = schema.add_text_field("permalink", STORED);
    let schema = schema.build();

    let index_dir = std::path::Path::new(output_dir).join("tantivy-index");

    let index = Index::create_in_dir(&index_dir, schema)
        .map_err(|e| Error::from(format!("creating tantivy index failed: {}", e)))?;

    // Register a stemming tokenizer only when the index does not already know
    // one under this name (expected for non-English languages).
    if index.tokenizers().get(&tokenizer_name).is_none() {
        let tokenizer = TextAnalyzer::from(SimpleTokenizer)
            .filter(RemoveLongFilter::limit(40))
            .filter(LowerCaser)
            .filter(Stemmer::new(parsed_lang));
        index.tokenizers().register(&tokenizer_name, tokenizer);
    }

    // 256 MiB indexing buffer.
    let mut wtr = index
        .writer(1024 * 1024 * 256)
        .map_err(|e| Error::from(format!("creating tantivy index writer failed: {}", e)))?;

    for section in library.sections_values() {
        if section.lang != lang || !section.meta.in_search_index {
            continue;
        }

        // Each page is added exactly once; duplicate documents would inflate
        // the index and skew term frequencies.
        for key in &section.pages {
            let page = library.get_page_by_key(*key);

            if !page.meta.in_search_index {
                continue;
            }

            let opstamp = wtr.add_document(doc!(
                title => page.meta.title.as_ref().map(String::as_str).unwrap_or(""),
                permalink => page.permalink.as_str(),
            ));
            println!("added {:?} {}", opstamp, page.permalink);
        }
    }

    // A single `commit()` is enough: a separate `prepare_commit()` whose result
    // is dropped only repeats the prepare step.
    let commit_opstamp = wtr
        .commit()
        .map_err(|e| Error::from(format!("tantivy IndexWriter::commit failed: {}", e)))?;
    println!("committed {:?}", commit_opstamp);

    // Let pending segment merges finish before returning.
    wtr.wait_merging_threads().map_err(|e| {
        Error::from(format!("tantivy IndexWriter::wait_merging_threads failed: {}", e))
    })?;

    Ok(())
}



+ 3
- 3
components/site/src/lib.rs View File

@@ -54,7 +54,7 @@ impl Site {
config.load_extra_syntaxes(path)?;

let tpl_glob =
format!("{}/{}", path.to_string_lossy().replace("\\", "/"), "templates/**/*.*ml");
format!("{}/{}", path.to_string_lossy().replace("\\", "/"), "templates/**/*.*ml"); // "
// Only parsing as we might be extending templates from themes and that would error
// as we haven't loaded them yet
let mut tera =
@@ -71,7 +71,7 @@ impl Site {

let theme_tpl_glob = format!(
"{}/{}",
path.to_string_lossy().replace("\\", "/"),
path.to_string_lossy().replace("\\", "/"), // "
format!("themes/{}/templates/**/*.*ml", theme)
);
let mut tera_theme = Tera::parse(&theme_tpl_glob)
@@ -165,7 +165,7 @@ impl Site {
/// Reads all .md files in the `content` directory and create pages/sections
/// out of them
pub fn load(&mut self) -> Result<()> {
let base_path = self.base_path.to_string_lossy().replace("\\", "/");
let base_path = self.base_path.to_string_lossy().replace("\\", "/"); // "
let content_glob = format!("{}/{}", base_path, "content/**/*.md");

let (section_entries, page_entries): (Vec<_>, Vec<_>) = glob(&content_glob)


+ 30
- 20
src/cli.rs View File

@@ -98,26 +98,36 @@ pub fn build_cli() -> App<'static, 'static> {
]),
SubCommand::with_name("index")
.about("Create a search index as a stand-alone task, and with additional options")
.args(&[
Arg::with_name("index_type")
.long("index-type")
.short("t")
.takes_value(true)
.possible_values(&["elasticlunr", "tantivy"])
.required(true)
.help("what kind of search index to build"),
Arg::with_name("output_dir")
.short("o")
.long("output-dir")
.default_value("public")
.takes_value(true)
.help("Outputs the generated search index files into the provided dir. \
Note: Tantivy indexing produces a directory instead of a file, \
which will be located at output-dir/tantivy-index"),
Arg::with_name("drafts")
.long("drafts")
.args({
let drafts = Arg::with_name("drafts") .long("drafts")
.takes_value(false)
.help("Include drafts when loading the site"),
]),
.help("Include drafts when loading the site");

#[cfg(feature = "tantivy-indexing")]
{

let index_type = Arg::with_name("index_type")
.long("index-type")
.short("t")
.takes_value(true)
.possible_values(&["elasticlunr", "tantivy"])
.required(true)
.help("what kind of search index to build");
let output_dir = Arg::with_name("output_dir")
.short("o")
.long("output-dir")
.default_value("public")
.takes_value(true)
.help("Outputs the generated search index files into the provided dir. \
Note: Tantivy indexing produces a directory instead of a file, \
which will be located at output-dir/tantivy-index");
&[drafts, index_type, output_dir]
}

#[cfg(not(feature = "tantivy-indexing"))]
{
&[drafts]
}
}),
])
}

+ 21
- 9
src/cmd/index.rs View File

@@ -30,16 +30,28 @@ pub fn index(
// somehow impacting the search indexing? doesn't seem like
// it could, but maybe

match index_type {
"elasticlunr" => {
site.build_search_index()?;
#[cfg(feature = "tantivy-indexing")]
{
match index_type {
"elasticlunr" => {
site.build_search_index()?;
}

"tantivy" => {
//if ! Path::new(output_dir).exists() {
// std::fs::create_dir_all(output_dir)?;
//}
let index_dir = Path::new(output_dir).join("tantivy-index");
utils::fs::ensure_directory_exists(&index_dir)?;

let lang = &site.config.default_language;
let library = site.library.read().unwrap(); // unwrap originally in Site::build_search_index, just parroting here, no idea if safe

search::build_tantivy_index(lang, &library, output_dir)?;
}

_ => unreachable!()
}

"tantivy" => {
unimplemented!()
}

_ => unreachable!()
}

Ok(())


Loading…
Cancel
Save