tantivy indexing, doesn't crash, but index produced is corrupt somehow

5 years ago · a043f641ea
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -34,12 +34,14 @@ ws = "0.9"
 ctrlc = "3"
 open = "1.2"
 globset = "0.4"
 tantivy = { version = "0.12", optional = true }

 site = { path = "components/site" }
 errors = { path = "components/errors" }
 front_matter = { path = "components/front_matter" }
 utils = { path = "components/utils" }
 rebuild = { path = "components/rebuild" }
 search = { path = "components/search" }

 [workspace]
 members = [
@@ -57,6 +59,9 @@ members = [
    "components/library",
 ]

 [features]
 tantivy-indexing = ["tantivy", "search/tantivy-indexing"]

 [profile.release]
 lto = true
 codegen-units = 1
--- a/components/search/Cargo.toml
+++ b/components/search/Cargo.toml
@@ -8,6 +8,16 @@ edition = "2018"
 elasticlunr-rs = "2"
 ammonia = "3"
 lazy_static = "1"
 tantivy = { version = "0.12", optional = true }
 isolang = { version = "1.0", optional = true }
 serde = { version = "1.0", optional = true }
 serde_derive = { version = "1.0", optional = true }
 serde_json = { version = "1.0", optional = true }
 num_cpus = { version = "1.12", optional = true }

 errors = { path = "../errors" }
 library = { path = "../library" }

 [features]
 default = []
 tantivy-indexing = ["isolang", "serde", "serde_derive", "serde_json", "num_cpus", "tantivy"]
--- a/components/search/src/lib.rs
+++ b/components/search/src/lib.rs
@@ -3,7 +3,7 @@ use std::collections::{HashMap, HashSet};
 use elasticlunr::{Index, Language};
 use lazy_static::lazy_static;

 use errors::{bail, Result};
 use errors::{bail, Result, Error};
 use library::{Library, Section};

 pub const ELASTICLUNR_JS: &str = include_str!("elasticlunr.min.js");
@@ -79,3 +79,150 @@ fn add_section_to_index(index: &mut Index, section: &Section, library: &Library)
        );
    }
 }


 /// Resolves a language identifier to the matching `tantivy::tokenizer::Language`.
 ///
 /// `lang` is interpreted by its length:
 /// * 2 bytes: an ISO 639-1 code (e.g. `en`), looked up case-insensitively;
 /// * 3 bytes: an ISO 639-3 code (e.g. `eng`), looked up case-insensitively;
 /// * anything else: treated as the language name itself (e.g. `English`).
 ///
 /// Returns `None` when the code is unknown to `isolang`, or when the resulting
 /// name does not deserialize into a `tantivy::tokenizer::Language` variant.
 #[cfg(feature = "tantivy-indexing")]
 fn parse_language(lang: &str) -> Option<tantivy::tokenizer::Language> {
    /// Converts a language name into the tantivy enum through its serde impl.
    ///
    /// The name is handed over as a JSON *string value*. Feeding the bare name to
    /// `serde_json::from_str` can never succeed because an unquoted word is not
    /// valid JSON, which previously made the 3-letter and fallback paths always
    /// return `None`.
    fn from_name(name: &str) -> Option<tantivy::tokenizer::Language> {
        serde_json::from_value(serde_json::Value::String(name.to_owned())).ok()
    }

    // expecting two-character code, but will try other forms as fallback
    match lang.len() {
        2 => isolang::Language::from_639_1(&lang.to_lowercase())
            .and_then(|parsed| from_name(parsed.to_name())),

        3 => isolang::Language::from_639_3(&lang.to_lowercase())
            .and_then(|parsed| from_name(parsed.to_name())),

        // apparently not a code, so this is best available option; surrounding
        // double quotes are tolerated so an already JSON-quoted name still parses
        _ => from_name(lang.trim_matches('"')),
    }
 }

 /// Builds a tantivy search index for every searchable page written in `lang`.
 ///
 /// The index is created in `<output_dir>/tantivy-index` and holds one document
 /// per page with two stored fields: `title` (tokenized and stemmed for `lang`)
 /// and `permalink` (stored only, not searchable).
 ///
 /// Sections whose language differs from `lang`, and sections or pages with
 /// `in_search_index` disabled, are skipped.
 ///
 /// # Errors
 ///
 /// Returns an error when `lang` cannot be mapped to a tantivy stemmer language,
 /// when the index directory or the index cannot be created, or when creating
 /// the writer, committing, or waiting for the merge threads fails.
 #[cfg(feature = "tantivy-indexing")]
 pub fn build_tantivy_index(
    lang: &str,
    library: &Library,
    output_dir: &str,
 ) -> Result<()> {
    use tantivy::doc;
    use tantivy::{schema::*, tokenizer::*, Index};

    let parsed_lang: Language = parse_language(lang)
        .ok_or_else(|| Error::from(format!("failed to parse language: '{}'", lang)))?;

    // English reuses the `en_stem` name; every other language gets a
    // `<language>_stem` name that is registered on the index further down.
    let tokenizer_name: String = match parsed_lang {
        Language::English => "en_stem".to_string(),
        _ => format!("{:?}_stem", parsed_lang).to_lowercase(),
    };

    let text_indexing_options = TextFieldIndexing::default()
        .set_index_option(IndexRecordOption::WithFreqsAndPositions)
        .set_tokenizer(&tokenizer_name);

    let text_options = TextOptions::default()
        .set_indexing_options(text_indexing_options)
        .set_stored();

    let mut schema = SchemaBuilder::new();

    // NOTE: page bodies are not indexed yet; only the title is searchable.
    let title = schema.add_text_field("title", text_options);
    let permalink = schema.add_text_field("permalink", STORED);

    let schema = schema.build();

    let index_dir = std::path::Path::new(output_dir).join("tantivy-index");

    // Make sure the target directory exists so the function does not depend on
    // the caller having created it beforehand.
    std::fs::create_dir_all(&index_dir).map_err(|e| {
        Error::from(format!("creating tantivy index directory failed: {}", e))
    })?;

    let index = Index::create_in_dir(&index_dir, schema)
        .map_err(|e| Error::from(format!("creating tantivy index failed: {}", e)))?;

    // if non-english, we need to register stemmer
    if index.tokenizers().get(&tokenizer_name).is_none() {
        let tokenizer = TextAnalyzer::from(SimpleTokenizer)
            .filter(RemoveLongFilter::limit(40))
            .filter(LowerCaser)
            .filter(Stemmer::new(parsed_lang));
        index.tokenizers().register(&tokenizer_name, tokenizer);
    }

    // 256 MiB passed to the writer as its heap size.
    let mut wtr = index
        .writer(1024 * 1024 * 256)
        .map_err(|e| Error::from(format!("creating tantivy index writer failed: {}", e)))?;

    for section in library.sections_values() {
        if section.lang != lang || !section.meta.in_search_index {
            continue;
        }

        // NOTE: sections themselves are not added as documents, only their pages.
        // Each page is added exactly once; the previous 16x repetition loop was
        // stress-test residue that filled the index with duplicate documents.
        for key in &section.pages {
            let page = library.get_page_by_key(*key);

            if !page.meta.in_search_index {
                continue;
            }

            let opstamp = wtr.add_document(doc!(
                title => page.meta.title.as_ref().map(|x| x.as_str()).unwrap_or(""),
                permalink => page.permalink.as_str(),
            ));
            println!("added {:?} {}", opstamp, page.permalink);
        }
    }

    // `commit` prepares and publishes the commit in one step, so the separate
    // `prepare_commit` whose result was dropped unused has been removed.
    let commit_opstamp = wtr
        .commit()
        .map_err(|e| Error::from(format!("tantivy IndexWriter::commit failed: {}", e)))?;
    println!("committed {:?}", commit_opstamp);

    // Consumes the writer and blocks until the merge threads have finished.
    wtr.wait_merging_threads().map_err(|e| {
        Error::from(format!("tantivy IndexWriter::wait_merging_threads failed: {}", e))
    })?;
    drop(index);

    Ok(())
 }


--- a/components/site/src/lib.rs
+++ b/components/site/src/lib.rs
@@ -54,7 +54,7 @@ impl Site {
        config.load_extra_syntaxes(path)?;

        let tpl_glob =
            format!("{}/{}", path.to_string_lossy().replace("\\", "/"), "templates/**/*.*ml");
            format!("{}/{}", path.to_string_lossy().replace("\\", "/"), "templates/**/*.*ml"); // "
        // Only parsing as we might be extending templates from themes and that would error
        // as we haven't loaded them yet
        let mut tera =
@@ -71,7 +71,7 @@ impl Site {

            let theme_tpl_glob = format!(
                "{}/{}",
                path.to_string_lossy().replace("\\", "/"),
                path.to_string_lossy().replace("\\", "/"),      // "
                format!("themes/{}/templates/**/*.*ml", theme)
            );
            let mut tera_theme = Tera::parse(&theme_tpl_glob)
@@ -165,7 +165,7 @@ impl Site {
    /// Reads all .md files in the `content` directory and create pages/sections
    /// out of them
    pub fn load(&mut self) -> Result<()> {
        let base_path = self.base_path.to_string_lossy().replace("\\", "/");
        let base_path = self.base_path.to_string_lossy().replace("\\", "/"); // "
        let content_glob = format!("{}/{}", base_path, "content/**/*.md");

        let (section_entries, page_entries): (Vec<_>, Vec<_>) = glob(&content_glob)
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -98,26 +98,36 @@ pub fn build_cli() -> App<'static, 'static> {
                ]),
            SubCommand::with_name("index")
                .about("Create a search index as a stand-alone task, and with additional options")
                .args(&[
                    Arg::with_name("index_type")
                        .long("index-type")
                        .short("t")
                        .takes_value(true)
                        .possible_values(&["elasticlunr", "tantivy"])
                        .required(true)
                        .help("what kind of search index to build"),
                    Arg::with_name("output_dir")
                        .short("o")
                        .long("output-dir")
                        .default_value("public")
                        .takes_value(true)
                        .help("Outputs the generated search index files into the provided dir. \
                               Note: Tantivy indexing produces a directory instead of a file, \
                               which will be located at output-dir/tantivy-index"),
                    Arg::with_name("drafts")
                        .long("drafts")
                .args({
                    let drafts = Arg::with_name("drafts") .long("drafts")
                        .takes_value(false)
                        .help("Include drafts when loading the site"),
                ]),
                        .help("Include drafts when loading the site");

                    #[cfg(feature = "tantivy-indexing")]
                    {

                        let index_type = Arg::with_name("index_type")
                            .long("index-type")
                            .short("t")
                            .takes_value(true)
                            .possible_values(&["elasticlunr", "tantivy"])
                            .required(true)
                            .help("what kind of search index to build");
                        let output_dir = Arg::with_name("output_dir")
                            .short("o")
                            .long("output-dir")
                            .default_value("public")
                            .takes_value(true)
                            .help("Outputs the generated search index files into the provided dir. \
                                   Note: Tantivy indexing produces a directory instead of a file, \
                                   which will be located at output-dir/tantivy-index");
                        &[drafts, index_type, output_dir]
                    }

                    #[cfg(not(feature = "tantivy-indexing"))]
                    {
                        &[drafts]
                    }
               }),
        ])
 }
--- a/src/cmd/index.rs
+++ b/src/cmd/index.rs
@@ -30,16 +30,28 @@ pub fn index(
    // somehow impacting the search indexing? doesn't seem like
    // it could, but maybe

    match index_type {
        "elasticlunr" => {
            site.build_search_index()?;
    #[cfg(feature = "tantivy-indexing")]
    {
        match index_type {
            "elasticlunr" => {
                site.build_search_index()?;
            }

            "tantivy" => {
                //if ! Path::new(output_dir).exists() {
                //    std::fs::create_dir_all(output_dir)?;
                //}
                let index_dir = Path::new(output_dir).join("tantivy-index");
                utils::fs::ensure_directory_exists(&index_dir)?;

                let lang = &site.config.default_language;
                let library = site.library.read().unwrap(); // unwrap originally in Site::build_search_index, just parroting here, no idea if safe

                search::build_tantivy_index(lang, &library, output_dir)?;
            }

            _ => unreachable!()
        }

        "tantivy" => {
            unimplemented!()
        }

        _ => unreachable!()
    }

    Ok(())