@@ -1,6 +1,6 @@ | |||
[package] | |||
name = "tantivy-cli" | |||
version = "0.10.0" | |||
version = "0.12.0" | |||
authors = ["Paul Masurel <paul.masurel@gmail.com>"] | |||
description = """Command line interface for Tantivy, a search engine library.""" | |||
@@ -31,12 +31,15 @@ byteorder = "0.5" | |||
log = "0.3" | |||
futures = "0.1" | |||
env_logger = "0.3" | |||
tantivy = "0.10" | |||
tantivy = "0.12" | |||
[[bin]] | |||
name = "tantivy" | |||
path = "src/main.rs" | |||
[features] | |||
bench = [] | |||
[profile.release] | |||
opt-level = 3 | |||
@@ -1,13 +1,13 @@ | |||
extern crate tantivy; | |||
//extern crate tantivy; | |||
use clap::ArgMatches; | |||
use futures::Future; | |||
//use futures::Future; | |||
use std::path::PathBuf; | |||
use tantivy::{Index, SegmentMeta}; | |||
const HEAP_SIZE: usize = 300_000_000; | |||
fn error_msg(err: tantivy::Error) -> String { | |||
fn error_msg(err: tantivy::TantivyError) -> String { | |||
format!("Merge failed : {:?}", err) | |||
} | |||
@@ -21,16 +21,18 @@ pub fn run_merge_cli(argmatch: &ArgMatches) -> Result<(), String> { | |||
fn run_merge(path: PathBuf) -> tantivy::Result<()> { | |||
let index = Index::open_in_dir(&path)?; | |||
let segments = index.searchable_segment_ids()?; | |||
let segment_meta: SegmentMeta = index | |||
.writer(HEAP_SIZE)? | |||
.merge(&segments)? | |||
.wait() | |||
.expect("Merge failed"); | |||
let segment_meta: SegmentMeta = | |||
super::run( | |||
index | |||
.writer(HEAP_SIZE)? | |||
.merge(&segments) | |||
).expect("Merge failed"); | |||
//.map_err(|_| tantivy::Error::ErrorInThread(String::from("Merge got cancelled"))); | |||
println!("Merge finished with segment meta {:?}", segment_meta); | |||
println!("Garbage collect irrelevant segments."); | |||
Index::open_in_dir(&path)? | |||
let gc_fut = Index::open_in_dir(&path)? | |||
.writer_with_num_threads(1, 40_000_000)? | |||
.garbage_collect_files()?; | |||
.garbage_collect_files(); | |||
super::run(gc_fut)?; | |||
Ok(()) | |||
} |
@@ -1,3 +1,7 @@ | |||
use std::sync::{Arc, Condvar, Mutex}; | |||
use std::task::{Context, Poll, RawWaker, RawWakerVTable, Waker}; | |||
#[cfg(feature = "bench")] | |||
mod bench; | |||
mod index; | |||
mod merge; | |||
@@ -5,9 +9,54 @@ mod new; | |||
mod search; | |||
mod serve; | |||
#[cfg(feature = "bench")] | |||
pub use self::bench::run_bench_cli; | |||
pub use self::index::run_index_cli; | |||
pub use self::merge::run_merge_cli; | |||
pub use self::new::run_new_cli; | |||
pub use self::search::run_search_cli; | |||
pub use self::serve::run_serve_cli; | |||
// stolen from [extreme](https://github.com/spacejam/extreme) | |||
#[derive(Default)] | |||
struct Park(Mutex<bool>, Condvar); | |||
fn unpark(park: &Park) { | |||
*park.0.lock().unwrap() = true; | |||
park.1.notify_one(); | |||
} | |||
static VTABLE: RawWakerVTable = RawWakerVTable::new( | |||
|clone_me| unsafe { | |||
let arc = Arc::from_raw(clone_me as *const Park); | |||
std::mem::forget(arc.clone()); | |||
RawWaker::new(Arc::into_raw(arc) as *const (), &VTABLE) | |||
}, | |||
|wake_me| unsafe { unpark(&Arc::from_raw(wake_me as *const Park)) }, | |||
|wake_by_ref_me| unsafe { unpark(&*(wake_by_ref_me as *const Park)) }, | |||
|drop_me| unsafe { drop(Arc::from_raw(drop_me as *const Park)) }, | |||
); | |||
/// Run a `Future`. | |||
pub fn run<F: std::future::Future>(mut f: F) -> F::Output { | |||
let mut f = unsafe { std::pin::Pin::new_unchecked(&mut f) }; | |||
let park = Arc::new(Park::default()); | |||
let sender = Arc::into_raw(park.clone()); | |||
let raw_waker = RawWaker::new(sender as *const _, &VTABLE); | |||
let waker = unsafe { Waker::from_raw(raw_waker) }; | |||
let mut cx = Context::from_waker(&waker); | |||
loop { | |||
match f.as_mut().poll(&mut cx) { | |||
Poll::Pending => { | |||
let mut runnable = park.0.lock().unwrap(); | |||
while !*runnable { | |||
runnable = park.1.wait(runnable).unwrap(); | |||
} | |||
*runnable = false; | |||
} | |||
Poll::Ready(val) => return val, | |||
} | |||
} | |||
} |
@@ -87,7 +87,7 @@ fn prompt_field_type(msg: &str, codes: Vec<&str>) -> tantivy::schema::Type { | |||
"TEXT" => Type::Str, | |||
"U64" => Type::U64, | |||
"I64" => Type::I64, | |||
// "F64" => Type::F64, | |||
"F64" => Type::F64, | |||
"DATE" => Type::Date, | |||
"FACET" => Type::HierarchicalFacet, | |||
"BYTES" => Type::Bytes, | |||
@@ -149,9 +149,9 @@ fn ask_add_num_field_with_options( | |||
Type::U64 => { | |||
schema_builder.add_u64_field(field_name, int_options); | |||
} | |||
// Type::F64 => { | |||
// schema_builder.add_f64_field(field_name, int_options); | |||
// } | |||
Type::F64 => { | |||
schema_builder.add_f64_field(field_name, int_options); | |||
} | |||
Type::I64 => { | |||
schema_builder.add_i64_field(field_name, int_options); | |||
} | |||
@@ -177,8 +177,8 @@ fn ask_add_field(schema_builder: &mut SchemaBuilder) { | |||
Type::Str => { | |||
ask_add_field_text(&field_name, schema_builder); | |||
} | |||
Type::U64 | Type::Date | Type::I64 => { | |||
// Type::U64 | Type::F64 | Type::Date | Type::I64 => { | |||
//Type::U64 | Type::Date | Type::I64 => { | |||
Type::U64 | Type::F64 | Type::Date | Type::I64 => { | |||
ask_add_num_field_with_options(&field_name, field_type, schema_builder); | |||
} | |||
Type::HierarchicalFacet => { | |||
@@ -1,5 +1,6 @@ | |||
use clap::ArgMatches; | |||
use serde_json; | |||
use std::str::FromStr; | |||
use std::convert::From; | |||
use std::path::Path; | |||
use std::path::PathBuf; | |||
@@ -8,11 +9,24 @@ use tantivy::query::QueryParser; | |||
use tantivy::schema::Field; | |||
use tantivy::schema::FieldType; | |||
use tantivy::Index; | |||
use tantivy::collector::{Count, TopDocs}; | |||
use tantivy::Document; | |||
pub fn run_search_cli(matches: &ArgMatches) -> Result<(), String> { | |||
let index_directory = PathBuf::from(matches.value_of("index").unwrap()); | |||
let query = matches.value_of("query").unwrap(); | |||
run_search(&index_directory, &query).map_err(|e| format!("{:?}", e)) | |||
match matches.value_of("num_hits") { | |||
Some(num_hits_str) => { | |||
let num_hits: usize = FromStr::from_str(num_hits_str) | |||
.map_err(|e| { format!("Failed to parse --num_hits (got '{}', expected integer): {}", num_hits_str, e) })?; | |||
run_top_search(&index_directory, &query, num_hits) | |||
.map_err(|e| format!("{:?}", e)) | |||
} | |||
None => { | |||
run_search(&index_directory, &query).map_err(|e| format!("{:?}", e)) | |||
} | |||
} | |||
} | |||
fn run_search(directory: &Path, query: &str) -> tantivy::Result<()> { | |||
@@ -20,15 +34,13 @@ fn run_search(directory: &Path, query: &str) -> tantivy::Result<()> { | |||
let schema = index.schema(); | |||
let default_fields: Vec<Field> = schema | |||
.fields() | |||
.iter() | |||
.enumerate() | |||
.filter(|&(_, ref field_entry)| match *field_entry.field_type() { | |||
FieldType::Str(ref text_field_options) => { | |||
text_field_options.get_indexing_options().is_some() | |||
} | |||
_ => false, | |||
}) | |||
.map(|(i, _)| Field(i as u32)) | |||
.map(|(field, _)| field) | |||
.collect(); | |||
let query_parser = QueryParser::new(schema.clone(), default_fields, index.tokenizers().clone()); | |||
let query = query_parser.parse_query(query)?; | |||
@@ -36,7 +48,7 @@ fn run_search(directory: &Path, query: &str) -> tantivy::Result<()> { | |||
let weight = query.weight(&searcher, false)?; | |||
let schema = index.schema(); | |||
for segment_reader in searcher.segment_readers() { | |||
let mut scorer = weight.scorer(segment_reader)?; | |||
let mut scorer = weight.scorer(segment_reader, 1.0)?; | |||
let store_reader = segment_reader.get_store_reader(); | |||
while scorer.advance() { | |||
let doc_id = scorer.doc(); | |||
@@ -47,3 +59,40 @@ fn run_search(directory: &Path, query: &str) -> tantivy::Result<()> { | |||
} | |||
Ok(()) | |||
} | |||
fn run_top_search(directory: &Path, query: &str, num_hits: usize) -> tantivy::Result<()> { | |||
let index = Index::open_in_dir(directory)?; | |||
let schema = index.schema(); | |||
let default_fields: Vec<Field> = schema | |||
.fields() | |||
.filter(|(_, field_entry)| { | |||
match field_entry.field_type() { | |||
FieldType::Str(ref text_field_options) => { | |||
text_field_options.get_indexing_options().is_some() | |||
} | |||
_ => false, | |||
} | |||
}) | |||
.map(|(field, _)| field) | |||
.collect(); | |||
let query_parser = QueryParser::new(schema.clone(), default_fields, index.tokenizers().clone()); | |||
let query = query_parser.parse_query(query)?; | |||
let searcher = index.reader()?.searcher(); | |||
let (top_docs, num_hits) = searcher.search(&query, &(TopDocs::with_limit(num_hits), Count))?; | |||
let mut out = String::with_capacity(1024); | |||
top_docs | |||
.iter() | |||
.take(num_hits) | |||
.for_each(|(_score, doc_address)| { | |||
let doc: Document = searcher.doc(*doc_address).unwrap(); | |||
let named_doc = schema.to_named_doc(&doc); | |||
let json_doc: String = serde_json::to_string(&named_doc).unwrap(); | |||
out.push_str(&format!("{}\n", json_doc)); | |||
}); | |||
print!("{}", out); | |||
Ok(()) | |||
} | |||
@@ -75,7 +75,7 @@ impl IndexServer { | |||
let index = Index::open_in_dir(path).unwrap(); | |||
index.tokenizers().register( | |||
"commoncrawl", | |||
SimpleTokenizer | |||
TextAnalyzer::from(SimpleTokenizer) | |||
.filter(RemoveLongFilter::limit(40)) | |||
.filter(LowerCaser) | |||
.filter(AlphaNumOnlyFilter) | |||
@@ -84,15 +84,13 @@ impl IndexServer { | |||
let schema = index.schema(); | |||
let default_fields: Vec<Field> = schema | |||
.fields() | |||
.iter() | |||
.enumerate() | |||
.filter(|&(_, ref field_entry)| match *field_entry.field_type() { | |||
FieldType::Str(ref text_field_options) => { | |||
text_field_options.get_indexing_options().is_some() | |||
} | |||
_ => false, | |||
}) | |||
.map(|(i, _)| Field(i as u32)) | |||
.map(|(field, _)| field) | |||
.collect(); | |||
let query_parser = | |||
QueryParser::new(schema.clone(), default_fields, index.tokenizers().clone()); | |||
@@ -37,7 +37,8 @@ fn main() { | |||
.help("Tantivy index directory filepath") | |||
.required(true); | |||
let cli_options = App::new("Tantivy") | |||
#[allow(unused_mut)] | |||
let mut cli_menu = App::new("Tantivy") | |||
.setting(AppSettings::SubcommandRequiredElseHelp) | |||
.version(env!("CARGO_PKG_VERSION")) | |||
.author("Paul Masurel <paul.masurel@gmail.com>") | |||
@@ -99,43 +100,73 @@ fn main() { | |||
.value_name("query") | |||
.help("Query") | |||
.required(true)) | |||
) | |||
.subcommand( | |||
SubCommand::with_name("bench") | |||
.about("Run a benchmark on your index") | |||
.arg(index_arg.clone()) | |||
.arg(Arg::with_name("queries") | |||
.short("q") | |||
.long("queries") | |||
.value_name("queries") | |||
.help("File containing queries (one per line) to run in the benchmark.") | |||
.required(true)) | |||
.arg(Arg::with_name("num_repeat") | |||
.arg(Arg::with_name("num_hits") | |||
.short("n") | |||
.long("num_repeat") | |||
.value_name("num_repeat") | |||
.help("Number of times to repeat the benchmark.") | |||
.default_value("1")) | |||
.long("num_hits") | |||
.value_name("num_hits") | |||
.help("Limit number of search results to top <num_hits> hits") | |||
.takes_value(true) | |||
.required(false)) | |||
) | |||
.subcommand( | |||
SubCommand::with_name("merge") | |||
.about("Merge all the segments of an index") | |||
.arg(index_arg.clone()) | |||
) | |||
.get_matches(); | |||
); | |||
#[cfg(feature = "bench")] | |||
{ | |||
cli_menu = cli_menu | |||
.subcommand( | |||
SubCommand::with_name("bench") | |||
.about("Run a benchmark on your index") | |||
.arg(index_arg.clone()) | |||
.arg(Arg::with_name("queries") | |||
.short("q") | |||
.long("queries") | |||
.value_name("queries") | |||
.help("File containing queries (one per line) to run in the benchmark.") | |||
.required(true)) | |||
.arg(Arg::with_name("num_repeat") | |||
.short("n") | |||
.long("num_repeat") | |||
.value_name("num_repeat") | |||
.help("Number of times to repeat the benchmark.") | |||
.default_value("1")) | |||
) | |||
} | |||
let cli_options = cli_menu.get_matches(); | |||
let (subcommand, some_options) = cli_options.subcommand(); | |||
let options = some_options.unwrap(); | |||
let run_cli = match subcommand { | |||
"new" => run_new_cli, | |||
"index" => run_index_cli, | |||
"serve" => run_serve_cli, | |||
"search" => run_search_cli, | |||
"merge" => run_merge_cli, | |||
"bench" => run_bench_cli, | |||
_ => panic!("Subcommand {} is unknown", subcommand), | |||
//"bench" => run_bench_cli, | |||
#[allow(unused)] | |||
other => { | |||
#[cfg(feature = "bench")] | |||
{ | |||
if other == "bench" { | |||
run_bench_cli | |||
} else { | |||
panic!("Subcommand {} is unknown", subcommand) | |||
} | |||
} | |||
#[cfg(not(feature = "bench"))] | |||
{ | |||
panic!("Subcommand {} is unknown", subcommand) | |||
} | |||
} | |||
}; | |||
if let Err(ref e) = run_cli(options) { | |||
let stderr = &mut std::io::stderr(); | |||
let errmsg = "Error writing ot stderr"; | |||