Browse Source

upgrade tantivy to v0.12, get it working (other than bench - todo, behind a feature gate meanwhile)

develop
Jonathan Strong 4 years ago
parent
commit
8df82a4b02
8 changed files with 736 additions and 673 deletions
  1. +556
    -625
      Cargo.lock
  2. +5
    -2
      Cargo.toml
  3. +12
    -10
      src/commands/merge.rs
  4. +49
    -0
      src/commands/mod.rs
  5. +6
    -6
      src/commands/new.rs
  6. +54
    -5
      src/commands/search.rs
  7. +2
    -4
      src/commands/serve.rs
  8. +52
    -21
      src/main.rs

+ 556
- 625
Cargo.lock
File diff suppressed because it is too large
View File


+ 5
- 2
Cargo.toml View File

@@ -1,6 +1,6 @@
[package] [package]
name = "tantivy-cli" name = "tantivy-cli"
version = "0.10.0"
version = "0.12.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"] authors = ["Paul Masurel <paul.masurel@gmail.com>"]


description = """Command line interface for Tantivy, a search engine library.""" description = """Command line interface for Tantivy, a search engine library."""
@@ -31,12 +31,15 @@ byteorder = "0.5"
log = "0.3" log = "0.3"
futures = "0.1" futures = "0.1"
env_logger = "0.3" env_logger = "0.3"
tantivy = "0.10"
tantivy = "0.12"


[[bin]] [[bin]]
name = "tantivy" name = "tantivy"
path = "src/main.rs" path = "src/main.rs"


[features]
bench = []



[profile.release] [profile.release]
opt-level = 3 opt-level = 3


+ 12
- 10
src/commands/merge.rs View File

@@ -1,13 +1,13 @@
extern crate tantivy;
//extern crate tantivy;


use clap::ArgMatches; use clap::ArgMatches;
use futures::Future;
//use futures::Future;
use std::path::PathBuf; use std::path::PathBuf;
use tantivy::{Index, SegmentMeta}; use tantivy::{Index, SegmentMeta};


const HEAP_SIZE: usize = 300_000_000; const HEAP_SIZE: usize = 300_000_000;


fn error_msg(err: tantivy::Error) -> String {
fn error_msg(err: tantivy::TantivyError) -> String {
format!("Merge failed : {:?}", err) format!("Merge failed : {:?}", err)
} }


@@ -21,16 +21,18 @@ pub fn run_merge_cli(argmatch: &ArgMatches) -> Result<(), String> {
fn run_merge(path: PathBuf) -> tantivy::Result<()> { fn run_merge(path: PathBuf) -> tantivy::Result<()> {
let index = Index::open_in_dir(&path)?; let index = Index::open_in_dir(&path)?;
let segments = index.searchable_segment_ids()?; let segments = index.searchable_segment_ids()?;
let segment_meta: SegmentMeta = index
.writer(HEAP_SIZE)?
.merge(&segments)?
.wait()
.expect("Merge failed");
let segment_meta: SegmentMeta =
super::run(
index
.writer(HEAP_SIZE)?
.merge(&segments)
).expect("Merge failed");
//.map_err(|_| tantivy::Error::ErrorInThread(String::from("Merge got cancelled"))); //.map_err(|_| tantivy::Error::ErrorInThread(String::from("Merge got cancelled")));
println!("Merge finished with segment meta {:?}", segment_meta); println!("Merge finished with segment meta {:?}", segment_meta);
println!("Garbage collect irrelevant segments."); println!("Garbage collect irrelevant segments.");
Index::open_in_dir(&path)?
let gc_fut = Index::open_in_dir(&path)?
.writer_with_num_threads(1, 40_000_000)? .writer_with_num_threads(1, 40_000_000)?
.garbage_collect_files()?;
.garbage_collect_files();
super::run(gc_fut)?;
Ok(()) Ok(())
} }

+ 49
- 0
src/commands/mod.rs View File

@@ -1,3 +1,7 @@
use std::sync::{Arc, Condvar, Mutex};
use std::task::{Context, Poll, RawWaker, RawWakerVTable, Waker};

#[cfg(feature = "bench")]
mod bench; mod bench;
mod index; mod index;
mod merge; mod merge;
@@ -5,9 +9,54 @@ mod new;
mod search; mod search;
mod serve; mod serve;


#[cfg(feature = "bench")]
pub use self::bench::run_bench_cli; pub use self::bench::run_bench_cli;
pub use self::index::run_index_cli; pub use self::index::run_index_cli;
pub use self::merge::run_merge_cli; pub use self::merge::run_merge_cli;
pub use self::new::run_new_cli; pub use self::new::run_new_cli;
pub use self::search::run_search_cli; pub use self::search::run_search_cli;
pub use self::serve::run_serve_cli; pub use self::serve::run_serve_cli;

// stolen from [extreme](https://github.com/spacejam/extreme)

#[derive(Default)]
struct Park(Mutex<bool>, Condvar);

fn unpark(park: &Park) {
*park.0.lock().unwrap() = true;
park.1.notify_one();
}

static VTABLE: RawWakerVTable = RawWakerVTable::new(
|clone_me| unsafe {
let arc = Arc::from_raw(clone_me as *const Park);
std::mem::forget(arc.clone());
RawWaker::new(Arc::into_raw(arc) as *const (), &VTABLE)
},
|wake_me| unsafe { unpark(&Arc::from_raw(wake_me as *const Park)) },
|wake_by_ref_me| unsafe { unpark(&*(wake_by_ref_me as *const Park)) },
|drop_me| unsafe { drop(Arc::from_raw(drop_me as *const Park)) },
);

/// Run a `Future`.
pub fn run<F: std::future::Future>(mut f: F) -> F::Output {
let mut f = unsafe { std::pin::Pin::new_unchecked(&mut f) };
let park = Arc::new(Park::default());
let sender = Arc::into_raw(park.clone());
let raw_waker = RawWaker::new(sender as *const _, &VTABLE);
let waker = unsafe { Waker::from_raw(raw_waker) };
let mut cx = Context::from_waker(&waker);

loop {
match f.as_mut().poll(&mut cx) {
Poll::Pending => {
let mut runnable = park.0.lock().unwrap();
while !*runnable {
runnable = park.1.wait(runnable).unwrap();
}
*runnable = false;
}
Poll::Ready(val) => return val,
}
}
}

+ 6
- 6
src/commands/new.rs View File

@@ -87,7 +87,7 @@ fn prompt_field_type(msg: &str, codes: Vec<&str>) -> tantivy::schema::Type {
"TEXT" => Type::Str, "TEXT" => Type::Str,
"U64" => Type::U64, "U64" => Type::U64,
"I64" => Type::I64, "I64" => Type::I64,
// "F64" => Type::F64,
"F64" => Type::F64,
"DATE" => Type::Date, "DATE" => Type::Date,
"FACET" => Type::HierarchicalFacet, "FACET" => Type::HierarchicalFacet,
"BYTES" => Type::Bytes, "BYTES" => Type::Bytes,
@@ -149,9 +149,9 @@ fn ask_add_num_field_with_options(
Type::U64 => { Type::U64 => {
schema_builder.add_u64_field(field_name, int_options); schema_builder.add_u64_field(field_name, int_options);
} }
// Type::F64 => {
// schema_builder.add_f64_field(field_name, int_options);
// }
Type::F64 => {
schema_builder.add_f64_field(field_name, int_options);
}
Type::I64 => { Type::I64 => {
schema_builder.add_i64_field(field_name, int_options); schema_builder.add_i64_field(field_name, int_options);
} }
@@ -177,8 +177,8 @@ fn ask_add_field(schema_builder: &mut SchemaBuilder) {
Type::Str => { Type::Str => {
ask_add_field_text(&field_name, schema_builder); ask_add_field_text(&field_name, schema_builder);
} }
Type::U64 | Type::Date | Type::I64 => {
// Type::U64 | Type::F64 | Type::Date | Type::I64 => {
//Type::U64 | Type::Date | Type::I64 => {
Type::U64 | Type::F64 | Type::Date | Type::I64 => {
ask_add_num_field_with_options(&field_name, field_type, schema_builder); ask_add_num_field_with_options(&field_name, field_type, schema_builder);
} }
Type::HierarchicalFacet => { Type::HierarchicalFacet => {


+ 54
- 5
src/commands/search.rs View File

@@ -1,5 +1,6 @@
use clap::ArgMatches; use clap::ArgMatches;
use serde_json; use serde_json;
use std::str::FromStr;
use std::convert::From; use std::convert::From;
use std::path::Path; use std::path::Path;
use std::path::PathBuf; use std::path::PathBuf;
@@ -8,11 +9,24 @@ use tantivy::query::QueryParser;
use tantivy::schema::Field; use tantivy::schema::Field;
use tantivy::schema::FieldType; use tantivy::schema::FieldType;
use tantivy::Index; use tantivy::Index;
use tantivy::collector::{Count, TopDocs};
use tantivy::Document;


pub fn run_search_cli(matches: &ArgMatches) -> Result<(), String> { pub fn run_search_cli(matches: &ArgMatches) -> Result<(), String> {
let index_directory = PathBuf::from(matches.value_of("index").unwrap()); let index_directory = PathBuf::from(matches.value_of("index").unwrap());
let query = matches.value_of("query").unwrap(); let query = matches.value_of("query").unwrap();
run_search(&index_directory, &query).map_err(|e| format!("{:?}", e))
match matches.value_of("num_hits") {
Some(num_hits_str) => {
let num_hits: usize = FromStr::from_str(num_hits_str)
.map_err(|e| { format!("Failed to parse --num_hits (got '{}', expected integer): {}", num_hits_str, e) })?;
run_top_search(&index_directory, &query, num_hits)
.map_err(|e| format!("{:?}", e))
}

None => {
run_search(&index_directory, &query).map_err(|e| format!("{:?}", e))
}
}
} }


fn run_search(directory: &Path, query: &str) -> tantivy::Result<()> { fn run_search(directory: &Path, query: &str) -> tantivy::Result<()> {
@@ -20,15 +34,13 @@ fn run_search(directory: &Path, query: &str) -> tantivy::Result<()> {
let schema = index.schema(); let schema = index.schema();
let default_fields: Vec<Field> = schema let default_fields: Vec<Field> = schema
.fields() .fields()
.iter()
.enumerate()
.filter(|&(_, ref field_entry)| match *field_entry.field_type() { .filter(|&(_, ref field_entry)| match *field_entry.field_type() {
FieldType::Str(ref text_field_options) => { FieldType::Str(ref text_field_options) => {
text_field_options.get_indexing_options().is_some() text_field_options.get_indexing_options().is_some()
} }
_ => false, _ => false,
}) })
.map(|(i, _)| Field(i as u32))
.map(|(field, _)| field)
.collect(); .collect();
let query_parser = QueryParser::new(schema.clone(), default_fields, index.tokenizers().clone()); let query_parser = QueryParser::new(schema.clone(), default_fields, index.tokenizers().clone());
let query = query_parser.parse_query(query)?; let query = query_parser.parse_query(query)?;
@@ -36,7 +48,7 @@ fn run_search(directory: &Path, query: &str) -> tantivy::Result<()> {
let weight = query.weight(&searcher, false)?; let weight = query.weight(&searcher, false)?;
let schema = index.schema(); let schema = index.schema();
for segment_reader in searcher.segment_readers() { for segment_reader in searcher.segment_readers() {
let mut scorer = weight.scorer(segment_reader)?;
let mut scorer = weight.scorer(segment_reader, 1.0)?;
let store_reader = segment_reader.get_store_reader(); let store_reader = segment_reader.get_store_reader();
while scorer.advance() { while scorer.advance() {
let doc_id = scorer.doc(); let doc_id = scorer.doc();
@@ -47,3 +59,40 @@ fn run_search(directory: &Path, query: &str) -> tantivy::Result<()> {
} }
Ok(()) Ok(())
} }

fn run_top_search(directory: &Path, query: &str, num_hits: usize) -> tantivy::Result<()> {
let index = Index::open_in_dir(directory)?;
let schema = index.schema();
let default_fields: Vec<Field> = schema
.fields()
.filter(|(_, field_entry)| {
match field_entry.field_type() {
FieldType::Str(ref text_field_options) => {
text_field_options.get_indexing_options().is_some()
}
_ => false,
}
})
.map(|(field, _)| field)
.collect();
let query_parser = QueryParser::new(schema.clone(), default_fields, index.tokenizers().clone());
let query = query_parser.parse_query(query)?;
let searcher = index.reader()?.searcher();
let (top_docs, num_hits) = searcher.search(&query, &(TopDocs::with_limit(num_hits), Count))?;
let mut out = String::with_capacity(1024);
top_docs
.iter()
.take(num_hits)
.for_each(|(_score, doc_address)| {
let doc: Document = searcher.doc(*doc_address).unwrap();
let named_doc = schema.to_named_doc(&doc);
let json_doc: String = serde_json::to_string(&named_doc).unwrap();
out.push_str(&format!("{}\n", json_doc));
});
print!("{}", out);
Ok(())
}





+ 2
- 4
src/commands/serve.rs View File

@@ -75,7 +75,7 @@ impl IndexServer {
let index = Index::open_in_dir(path).unwrap(); let index = Index::open_in_dir(path).unwrap();
index.tokenizers().register( index.tokenizers().register(
"commoncrawl", "commoncrawl",
SimpleTokenizer
TextAnalyzer::from(SimpleTokenizer)
.filter(RemoveLongFilter::limit(40)) .filter(RemoveLongFilter::limit(40))
.filter(LowerCaser) .filter(LowerCaser)
.filter(AlphaNumOnlyFilter) .filter(AlphaNumOnlyFilter)
@@ -84,15 +84,13 @@ impl IndexServer {
let schema = index.schema(); let schema = index.schema();
let default_fields: Vec<Field> = schema let default_fields: Vec<Field> = schema
.fields() .fields()
.iter()
.enumerate()
.filter(|&(_, ref field_entry)| match *field_entry.field_type() { .filter(|&(_, ref field_entry)| match *field_entry.field_type() {
FieldType::Str(ref text_field_options) => { FieldType::Str(ref text_field_options) => {
text_field_options.get_indexing_options().is_some() text_field_options.get_indexing_options().is_some()
} }
_ => false, _ => false,
}) })
.map(|(i, _)| Field(i as u32))
.map(|(field, _)| field)
.collect(); .collect();
let query_parser = let query_parser =
QueryParser::new(schema.clone(), default_fields, index.tokenizers().clone()); QueryParser::new(schema.clone(), default_fields, index.tokenizers().clone());


+ 52
- 21
src/main.rs View File

@@ -37,7 +37,8 @@ fn main() {
.help("Tantivy index directory filepath") .help("Tantivy index directory filepath")
.required(true); .required(true);


let cli_options = App::new("Tantivy")
#[allow(unused_mut)]
let mut cli_menu = App::new("Tantivy")
.setting(AppSettings::SubcommandRequiredElseHelp) .setting(AppSettings::SubcommandRequiredElseHelp)
.version(env!("CARGO_PKG_VERSION")) .version(env!("CARGO_PKG_VERSION"))
.author("Paul Masurel <paul.masurel@gmail.com>") .author("Paul Masurel <paul.masurel@gmail.com>")
@@ -99,43 +100,73 @@ fn main() {
.value_name("query") .value_name("query")
.help("Query") .help("Query")
.required(true)) .required(true))
)
.subcommand(
SubCommand::with_name("bench")
.about("Run a benchmark on your index")
.arg(index_arg.clone())
.arg(Arg::with_name("queries")
.short("q")
.long("queries")
.value_name("queries")
.help("File containing queries (one per line) to run in the benchmark.")
.required(true))
.arg(Arg::with_name("num_repeat")
.arg(Arg::with_name("num_hits")
.short("n") .short("n")
.long("num_repeat")
.value_name("num_repeat")
.help("Number of times to repeat the benchmark.")
.default_value("1"))
.long("num_hits")
.value_name("num_hits")
.help("Limit number of search results to top <num_hits> hits")
.takes_value(true)
.required(false))
) )
.subcommand( .subcommand(
SubCommand::with_name("merge") SubCommand::with_name("merge")
.about("Merge all the segments of an index") .about("Merge all the segments of an index")
.arg(index_arg.clone()) .arg(index_arg.clone())
)
.get_matches();
);

#[cfg(feature = "bench")]
{
cli_menu = cli_menu
.subcommand(
SubCommand::with_name("bench")
.about("Run a benchmark on your index")
.arg(index_arg.clone())
.arg(Arg::with_name("queries")
.short("q")
.long("queries")
.value_name("queries")
.help("File containing queries (one per line) to run in the benchmark.")
.required(true))
.arg(Arg::with_name("num_repeat")
.short("n")
.long("num_repeat")
.value_name("num_repeat")
.help("Number of times to repeat the benchmark.")
.default_value("1"))
)
}

let cli_options = cli_menu.get_matches();


let (subcommand, some_options) = cli_options.subcommand(); let (subcommand, some_options) = cli_options.subcommand();
let options = some_options.unwrap(); let options = some_options.unwrap();

let run_cli = match subcommand { let run_cli = match subcommand {
"new" => run_new_cli, "new" => run_new_cli,
"index" => run_index_cli, "index" => run_index_cli,
"serve" => run_serve_cli, "serve" => run_serve_cli,
"search" => run_search_cli, "search" => run_search_cli,
"merge" => run_merge_cli, "merge" => run_merge_cli,
"bench" => run_bench_cli,
_ => panic!("Subcommand {} is unknown", subcommand),
//"bench" => run_bench_cli,
#[allow(unused)]
other => {
#[cfg(feature = "bench")]
{
if other == "bench" {
run_bench_cli
} else {
panic!("Subcommand {} is unknown", subcommand)
}
}

#[cfg(not(feature = "bench"))]
{
panic!("Subcommand {} is unknown", subcommand)
}
}
}; };



if let Err(ref e) = run_cli(options) { if let Err(ref e) = run_cli(options) {
let stderr = &mut std::io::stderr(); let stderr = &mut std::io::stderr();
let errmsg = "Error writing ot stderr"; let errmsg = "Error writing ot stderr";


Loading…
Cancel
Save