Browse Source

clean up the new indexing code by quite a lot

index-subcmd
Jonathan Strong 6 months ago
parent
commit
43a88fb79e
5 changed files with 99 additions and 90 deletions
  1. +33
    -64
      components/search/src/lib.rs
  2. +14
    -4
      components/site/src/lib.rs
  3. +27
    -13
      src/cmd/index.rs
  4. +10
    -0
      src/console.rs
  5. +15
    -9
      src/main.rs

+ 33
- 64
components/search/src/lib.rs View File

@@ -3,6 +3,7 @@ use std::collections::{HashMap, HashSet};
use elasticlunr::{Index, Language};
use lazy_static::lazy_static;

#[allow(unused_imports)]
use errors::{bail, Result, Error};
use library::{Library, Section};

@@ -29,7 +30,7 @@ lazy_static! {
/// the language given
/// Errors if the language given is not available in Elasticlunr
/// TODO: is making `in_search_index` apply to subsections of a `false` section useful?
pub fn build_index(lang: &str, library: &Library) -> Result<String> {
pub fn build_index(lang: &str, library: &Library) -> Result<Index> {
let language = match Language::from_code(lang) {
Some(l) => l,
None => {
@@ -45,7 +46,7 @@ pub fn build_index(lang: &str, library: &Library) -> Result<String> {
}
}

Ok(index.to_json())
Ok(index)
}

fn add_section_to_index(index: &mut Index, section: &Section, library: &Library) {
@@ -108,13 +109,11 @@ fn parse_language(lang: &str) -> Option<tantivy::tokenizer::Language> {
}

#[cfg(feature = "tantivy-indexing")]
pub fn build_tantivy_index(
pub fn build_tantivy_index<P: AsRef<std::path::Path>>(
lang: &str,
library: &Library,
output_dir: &str,
//skip_section_pages: bool,

) -> Result<()> {
index_dir: P,
) -> Result<usize> {

use tantivy::{schema::*, tokenizer::*, Index, Document};
use tantivy::doc;
@@ -124,10 +123,10 @@ pub fn build_tantivy_index(

let tokenizer_name: String = match parsed_lang {
Language::English => "en_stem".to_string(),
other => format!("{:?}_stem", parsed_lang).to_lowercase(),
other => format!("{:?}_stem", other).to_lowercase(),
};

let mut text_indexing_options = TextFieldIndexing::default()
let text_indexing_options = TextFieldIndexing::default()
.set_index_option(IndexRecordOption::WithFreqsAndPositions)
.set_tokenizer(&tokenizer_name);

@@ -143,13 +142,9 @@ pub fn build_tantivy_index(

let schema = schema.build();

let index_dir = std::path::Path::new(output_dir).join("tantivy-index");
//utils::fs::ensure_directory_exists(&index_dir)?;

let mut index = Index::create_in_dir(&index_dir, schema.clone())
let index = Index::create_in_dir(&index_dir, schema.clone())
.map_err(|e| { Error::from(format!("creating tantivy index failed: {}", e)) })?;


if index.tokenizers().get(&tokenizer_name).is_none() { // if non-english, we need to register stemmer
let tokenizer = TextAnalyzer::from(SimpleTokenizer)
.filter(RemoveLongFilter::limit(40))
@@ -158,72 +153,46 @@ pub fn build_tantivy_index(
index.tokenizers().register(&tokenizer_name, tokenizer);
}

//let mut wtr = index.writer_with_num_threads(num_cpus::get_physical(), 1024 * 1024 * 256)
//let mut wtr = index.writer_with_num_threads(4, 1024 * 1024 * 256)
let mut wtr = index.writer(1024 * 1024 * 256)
.map_err(|e| { Error::from(format!("creating tantivy index writer failed: {}", e)) })?;

//index_writer.set_merge_policy(Box::new(NoMergePolicy));

//let mut sections_it = library.sections_values().iter().filter(|s| s.lang == lang && s.meta.in_search_index);

let mut seen: HashSet<String> = Default::default();
let mut n_indexed = 0;
//let group_size = 100_000;

for section in library.sections_values() {
if section.lang != lang { continue }

if ! section.meta.in_search_index { continue }
// A macro rather than a function: it is invoked with both a `Section` and a
// `Page`, which are different types but share the fields read here.
macro_rules! index_page {
    ($page:ident) => {{
        // Only index items that opted into search and match the requested language,
        // and never index the same permalink twice.
        let wanted = $page.meta.in_search_index && $page.lang == lang;
        if wanted && !seen.contains(&$page.permalink) {
            // mark as indexed
            seen.insert($page.permalink.clone());
            // A missing title is indexed as the empty string.
            let title_text: &str = $page.meta.title.as_ref().map(|t| t.as_str()).unwrap_or("");
            // The content is passed through the AMMONIA cleaner before being indexed.
            let sanitized: String = AMMONIA.clean(&$page.content).to_string();
            let entry: Document = doc!(
                title => title_text,
                body => sanitized.as_str(),
                permalink => $page.permalink.as_str(),
            );
            wtr.add_document(entry);
            n_indexed += 1;
        }
    }}
}

// Don't index redirecting sections
//if section.meta.redirect_to.is_none() {
// index.add_doc(
// &section.permalink,
// &[
// &section.meta.title.clone().unwrap_or_default(),
// &AMMONIA.clean(&section.content).to_string(),
// ],
// );
//}
if section.meta.redirect_to.is_none() {
index_page!(section);
}

for key in &section.pages {
let page = library.get_page_by_key(*key);

if !page.meta.in_search_index { continue; }

if seen.contains(&page.permalink) { continue }

seen.insert(page.permalink.clone());

//let mut doc = Document::default();
//doc.add(FieldValue::new(title, Value::from(page.meta.title.as_ref().map(|x| x.as_str()).unwrap_or(""))));

let cleaned_body: String = AMMONIA.clean(&page.content).to_string();
//doc.add(FieldValue::new(body, Value::from(cleaned_body.as_str())));

//doc.add(FieldValue::new(permalink, Value::from(page.permalink.as_str())));

let opstamp = wtr.add_document(doc!(
title => page.meta.title.as_ref().map(|x| x.as_str()).unwrap_or(""),
body => cleaned_body.as_str(),
permalink => page.permalink.as_str(),
));
println!("added {:?} {}", opstamp, page.permalink);

n_indexed += 1;

//if n_indexed % group_size == 0 { }
index_page!(page);
}
}

wtr.prepare_commit().map_err(|e| { Error::from(format!("tantivy IndexWriter::commit failed: {}", e)) })?;
let commit_opstamp = wtr.commit().map_err(|e| { Error::from(format!("tantivy IndexWriter::commit failed: {}", e)) })?;
//wtr.prepare_commit().map_err(|e| { Error::from(format!("tantivy IndexWriter::commit failed: {}", e)) })?;
wtr.commit().map_err(|e| { Error::from(format!("tantivy IndexWriter::commit failed: {}", e)) })?;
wtr.wait_merging_threads().map_err(|e| { Error::from(format!("tantivy IndexWriter::wait_merging_threads failed: {}", e)) })?;
drop(index);
println!("finished indexing {} pages", n_indexed);

Ok(())
Ok(n_indexed)
}



+ 14
- 4
components/site/src/lib.rs View File

@@ -778,24 +778,34 @@ impl Site {
Ok(())
}

pub fn build_search_index(&self) -> Result<()> {
pub fn build_search_index(&self) -> Result<usize> {
let mut n_indexed = 0;
ensure_directory_exists(&self.output_path)?;
// index first
let index = search::build_index(&self.config.default_language, &self.library.read().unwrap())
.map_err(|e| Error::from(format!("creating elasticlunr index failed: {}", e)))?;
n_indexed += index.document_store.len();

create_file(
&self.output_path.join(&format!("search_index.{}.js", self.config.default_language)),
&format!(
"window.searchIndex = {};",
search::build_index(&self.config.default_language, &self.library.read().unwrap())?
index.to_json(),
),
)?;

for language in &self.config.languages {
if language.code != self.config.default_language && language.search {

// Build the index for *this* language. The file written below is named after
// `language.code`, and this branch only runs for non-default languages, so
// passing the default language here would duplicate the default index.
let index = search::build_index(&language.code, &self.library.read().unwrap())
    .map_err(|e| Error::from(format!("creating elasticlunr index failed: {}", e)))?;
// Count the documents of this language's index towards the reported total.
n_indexed += index.document_store.len();

create_file(
&self.output_path.join(&format!("search_index.{}.js", &language.code)),
&format!(
"window.searchIndex = {};",
search::build_index(&language.code, &self.library.read().unwrap())?
index.to_json()
),
)?;
}
@@ -804,7 +814,7 @@ impl Site {
// then elasticlunr.min.js
create_file(&self.output_path.join("elasticlunr.min.js"), search::ELASTICLUNR_JS)?;

Ok(())
Ok(n_indexed)
}

pub fn compile_sass(&self, base_path: &Path) -> Result<()> {


+ 27
- 13
src/cmd/index.rs View File

@@ -1,7 +1,8 @@
use std::path::Path;
use errors::Result;
use std::time::*;
use errors::{Result, Error};
use site::Site;
use crate::console;

//use crate::console;

@@ -11,6 +12,7 @@ pub fn index(
base_url: Option<&str>,
output_dir: &str,
include_drafts: bool,
#[cfg(feature = "tantivy-indexing")]
index_type: &str,
) -> Result<()> {
let mut site = Site::new(root_dir, config_file)?;
@@ -25,34 +27,46 @@ pub fn index(
site.include_drafts();
}
site.load()?;
console::notify_site_size(&site);
console::warn_about_ignored_pages(&site);

// TODO: could skipping the theme and/or sass prep end up
// somehow impacting the search indexing? doesn't seem like
// it could, but maybe
// Runs `site.build_search_index()`, timing the call, and reports the number of
// indexed pages together with the elapsed time. A failure is re-wrapped with an
// elasticlunr-specific message.
let do_elastic_index = || -> Result<()> {
    let started = Instant::now();
    let n_indexed = match site.build_search_index() {
        Ok(count) => count,
        Err(e) => {
            return Err(Error::from(format!("creating elasticlunr index failed: {}", e)));
        }
    };
    console::report_n_pages_indexed(n_indexed, started.elapsed());
    Ok(())
};

#[cfg(feature = "tantivy-indexing")]
{
match index_type {
"elasticlunr" => {
site.build_search_index()?;
}
"elasticlunr" => do_elastic_index()?,

"tantivy" => {
//if ! Path::new(output_dir).exists() {
// std::fs::create_dir_all(output_dir)?;
//}
let index_dir = Path::new(output_dir).join("tantivy-index");
if index_dir.exists() {
std::fs::remove_dir_all(&index_dir)?;
}
utils::fs::ensure_directory_exists(&index_dir)?;

let lang = &site.config.default_language;
let library = site.library.read().unwrap(); // unwrap originally in Site::build_search_index, just parroting here, no idea if safe

search::build_tantivy_index(lang, &library, output_dir)?;
let indexing_start = Instant::now();
let n_pages_indexed = search::build_tantivy_index(lang, &library, &index_dir)?;
let indexing_took = Instant::now() - indexing_start;
console::report_n_pages_indexed(n_pages_indexed, indexing_took);
}

_ => unreachable!()
}
}

#[cfg(not(feature = "tantivy-indexing"))]
{
do_elastic_index()?;
}

Ok(())
}

+ 10
- 0
src/console.rs View File

@@ -97,6 +97,16 @@ pub fn warn_about_ignored_pages(site: &Site) {
}
}

/// Print how many pages were indexed and how long the indexing took.
///
/// Durations under one second are reported in whole milliseconds (so the output
/// reads `12ms` rather than an unrounded fractional value); longer durations are
/// reported in seconds rounded to 1 decimal.
pub fn report_n_pages_indexed(n_pages_indexed: usize, took: std::time::Duration) {
    // Whole milliseconds, truncated.
    let duration_ms = took.as_millis();
    if duration_ms < 1000 {
        success(&format!("Search: indexed {} pages in {}ms.\n", n_pages_indexed, duration_ms));
    } else {
        let duration_sec = took.as_secs_f64();
        success(&format!(
            "Search: indexed {} pages in {:.1}s.\n",
            n_pages_indexed,
            (duration_sec * 10.0).round() / 10.0
        ));
    }
}

/// Print the time elapsed rounded to 1 decimal
pub fn report_elapsed_time(instant: Instant) {
let duration_ms = Duration::from_std(instant.elapsed()).unwrap().num_milliseconds() as f64;


+ 15
- 9
src/main.rs View File

@@ -115,16 +115,22 @@ fn main() {
console::info("Building search index...");
let start = Instant::now();
let output_dir = matches.value_of("output_dir").unwrap();
let index_type = matches.value_of("index_type").unwrap();
match cmd::index(
&root_dir,
config_file,
matches.value_of("base_url"),
output_dir,
matches.is_present("drafts"),
index_type,
) {
let show_drafts = matches.is_present("drafts");

let indexing_result = {
#[cfg(feature = "tantivy-indexing")]
{
let index_type = matches.value_of("index_type").unwrap();
cmd::index( &root_dir, config_file, matches.value_of("base_url"), output_dir, show_drafts, index_type)
}

#[cfg(not(feature = "tantivy-indexing"))]
cmd::index( &root_dir, config_file, matches.value_of("base_url"), output_dir, show_drafts)
};

match indexing_result {
Ok(()) => console::report_elapsed_time(start),

Err(e) => {
console::unravel_errors("Failed to build search index", &e);
::std::process::exit(1);


Loading…
Cancel
Save