@@ -1,6 +1,6 @@ | |||||
[package] | [package] | ||||
name = "tantivy-cli" | name = "tantivy-cli" | ||||
version = "0.10.0" | |||||
version = "0.12.0" | |||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"] | authors = ["Paul Masurel <paul.masurel@gmail.com>"] | ||||
description = """Command line interface for Tantivy, a search engine library.""" | description = """Command line interface for Tantivy, a search engine library.""" | ||||
@@ -31,12 +31,15 @@ byteorder = "0.5" | |||||
log = "0.3" | log = "0.3" | ||||
futures = "0.1" | futures = "0.1" | ||||
env_logger = "0.3" | env_logger = "0.3" | ||||
tantivy = "0.10" | |||||
tantivy = "0.12" | |||||
[[bin]] | [[bin]] | ||||
name = "tantivy" | name = "tantivy" | ||||
path = "src/main.rs" | path = "src/main.rs" | ||||
[features] | |||||
bench = [] | |||||
[profile.release] | [profile.release] | ||||
opt-level = 3 | opt-level = 3 | ||||
@@ -1,13 +1,13 @@ | |||||
extern crate tantivy; | |||||
//extern crate tantivy; | |||||
use clap::ArgMatches; | use clap::ArgMatches; | ||||
use futures::Future; | |||||
//use futures::Future; | |||||
use std::path::PathBuf; | use std::path::PathBuf; | ||||
use tantivy::{Index, SegmentMeta}; | use tantivy::{Index, SegmentMeta}; | ||||
const HEAP_SIZE: usize = 300_000_000; | const HEAP_SIZE: usize = 300_000_000; | ||||
fn error_msg(err: tantivy::Error) -> String { | |||||
fn error_msg(err: tantivy::TantivyError) -> String { | |||||
format!("Merge failed : {:?}", err) | format!("Merge failed : {:?}", err) | ||||
} | } | ||||
@@ -21,16 +21,18 @@ pub fn run_merge_cli(argmatch: &ArgMatches) -> Result<(), String> { | |||||
fn run_merge(path: PathBuf) -> tantivy::Result<()> { | fn run_merge(path: PathBuf) -> tantivy::Result<()> { | ||||
let index = Index::open_in_dir(&path)?; | let index = Index::open_in_dir(&path)?; | ||||
let segments = index.searchable_segment_ids()?; | let segments = index.searchable_segment_ids()?; | ||||
let segment_meta: SegmentMeta = index | |||||
.writer(HEAP_SIZE)? | |||||
.merge(&segments)? | |||||
.wait() | |||||
.expect("Merge failed"); | |||||
let segment_meta: SegmentMeta = | |||||
super::run( | |||||
index | |||||
.writer(HEAP_SIZE)? | |||||
.merge(&segments) | |||||
).expect("Merge failed"); | |||||
//.map_err(|_| tantivy::Error::ErrorInThread(String::from("Merge got cancelled"))); | //.map_err(|_| tantivy::Error::ErrorInThread(String::from("Merge got cancelled"))); | ||||
println!("Merge finished with segment meta {:?}", segment_meta); | println!("Merge finished with segment meta {:?}", segment_meta); | ||||
println!("Garbage collect irrelevant segments."); | println!("Garbage collect irrelevant segments."); | ||||
Index::open_in_dir(&path)? | |||||
let gc_fut = Index::open_in_dir(&path)? | |||||
.writer_with_num_threads(1, 40_000_000)? | .writer_with_num_threads(1, 40_000_000)? | ||||
.garbage_collect_files()?; | |||||
.garbage_collect_files(); | |||||
super::run(gc_fut)?; | |||||
Ok(()) | Ok(()) | ||||
} | } |
@@ -1,3 +1,7 @@ | |||||
use std::sync::{Arc, Condvar, Mutex}; | |||||
use std::task::{Context, Poll, RawWaker, RawWakerVTable, Waker}; | |||||
#[cfg(feature = "bench")] | |||||
mod bench; | mod bench; | ||||
mod index; | mod index; | ||||
mod merge; | mod merge; | ||||
@@ -5,9 +9,54 @@ mod new; | |||||
mod search; | mod search; | ||||
mod serve; | mod serve; | ||||
#[cfg(feature = "bench")] | |||||
pub use self::bench::run_bench_cli; | pub use self::bench::run_bench_cli; | ||||
pub use self::index::run_index_cli; | pub use self::index::run_index_cli; | ||||
pub use self::merge::run_merge_cli; | pub use self::merge::run_merge_cli; | ||||
pub use self::new::run_new_cli; | pub use self::new::run_new_cli; | ||||
pub use self::search::run_search_cli; | pub use self::search::run_search_cli; | ||||
pub use self::serve::run_serve_cli; | pub use self::serve::run_serve_cli; | ||||
// stolen from [extreme](https://github.com/spacejam/extreme) | |||||
/// Parking spot for the thread that drives a future in [`run`].
///
/// The `bool` is a "wake-up requested" flag guarded by the mutex; the condvar
/// is used to block the driving thread until that flag is set.
#[derive(Default)]
struct Park(Mutex<bool>, Condvar);

/// Marks `park` as runnable and wakes one thread waiting on its condvar.
fn unpark(park: &Park) {
    {
        let mut runnable = park.0.lock().unwrap();
        *runnable = true;
    } // The guard is released here, before the notification is sent.
    park.1.notify_one();
}

/// `RawWaker` clone callback: hands out a new waker sharing the same `Park`.
unsafe fn park_waker_clone(data: *const ()) -> RawWaker {
    // SAFETY: `data` always originates from `Arc::into_raw` on an `Arc<Park>`
    // (see `run` and the last line of this function). `ManuallyDrop` keeps
    // the reference owned by the existing waker alive.
    let borrowed = std::mem::ManuallyDrop::new(Arc::from_raw(data as *const Park));
    let cloned: Arc<Park> = Arc::clone(&borrowed);
    RawWaker::new(Arc::into_raw(cloned) as *const (), &VTABLE)
}

/// `RawWaker` wake callback: unparks, consuming the waker's reference.
unsafe fn park_waker_wake(data: *const ()) {
    // SAFETY: `data` originates from `Arc::into_raw` on an `Arc<Park>`; taking
    // ownership back releases that reference when `park` goes out of scope.
    let park = Arc::from_raw(data as *const Park);
    unpark(&park);
}

/// `RawWaker` wake-by-ref callback: unparks without touching the refcount.
unsafe fn park_waker_wake_by_ref(data: *const ()) {
    // SAFETY: `data` points at the `Park` inside an `Arc` that the waker
    // still holds a reference to, so it is valid for this call.
    unpark(&*(data as *const Park));
}

/// `RawWaker` drop callback: releases the waker's reference to the `Park`.
unsafe fn park_waker_drop(data: *const ()) {
    // SAFETY: `data` originates from `Arc::into_raw` on an `Arc<Park>`.
    drop(Arc::from_raw(data as *const Park));
}

/// Virtual table tying a `RawWaker` to an `Arc<Park>` data pointer.
static VTABLE: RawWakerVTable = RawWakerVTable::new(
    park_waker_clone,
    park_waker_wake,
    park_waker_wake_by_ref,
    park_waker_drop,
);

/// Run a `Future` to completion on the current thread.
///
/// The future is polled in a loop; whenever it returns `Poll::Pending` the
/// calling thread blocks until the waker handed to the future is invoked,
/// then polls again. Returns the future's output.
///
/// # Panics
///
/// Panics if the internal mutex is poisoned.
pub fn run<F: std::future::Future>(mut f: F) -> F::Output {
    // SAFETY: the pinned reference shadows `f`, so the future can no longer be
    // named, and therefore cannot be moved, before it is dropped on return.
    let mut f = unsafe { std::pin::Pin::new_unchecked(&mut f) };

    let park = Arc::new(Park::default());
    let waker_data = Arc::into_raw(Arc::clone(&park));
    // SAFETY: `waker_data` comes from `Arc::into_raw` on an `Arc<Park>`, which
    // is exactly what every function in `VTABLE` expects as its data pointer.
    let waker = unsafe { Waker::from_raw(RawWaker::new(waker_data as *const _, &VTABLE)) };
    let mut cx = Context::from_waker(&waker);

    loop {
        if let Poll::Ready(val) = f.as_mut().poll(&mut cx) {
            return val;
        }
        // Pending: sleep until a wake-up is requested. The `while` loop
        // re-checks the flag after every return from `wait`.
        let mut runnable = park.0.lock().unwrap();
        while !*runnable {
            runnable = park.1.wait(runnable).unwrap();
        }
        // Consume the wake-up so the next `Pending` blocks again.
        *runnable = false;
    }
}
@@ -87,7 +87,7 @@ fn prompt_field_type(msg: &str, codes: Vec<&str>) -> tantivy::schema::Type { | |||||
"TEXT" => Type::Str, | "TEXT" => Type::Str, | ||||
"U64" => Type::U64, | "U64" => Type::U64, | ||||
"I64" => Type::I64, | "I64" => Type::I64, | ||||
// "F64" => Type::F64, | |||||
"F64" => Type::F64, | |||||
"DATE" => Type::Date, | "DATE" => Type::Date, | ||||
"FACET" => Type::HierarchicalFacet, | "FACET" => Type::HierarchicalFacet, | ||||
"BYTES" => Type::Bytes, | "BYTES" => Type::Bytes, | ||||
@@ -149,9 +149,9 @@ fn ask_add_num_field_with_options( | |||||
Type::U64 => { | Type::U64 => { | ||||
schema_builder.add_u64_field(field_name, int_options); | schema_builder.add_u64_field(field_name, int_options); | ||||
} | } | ||||
// Type::F64 => { | |||||
// schema_builder.add_f64_field(field_name, int_options); | |||||
// } | |||||
Type::F64 => { | |||||
schema_builder.add_f64_field(field_name, int_options); | |||||
} | |||||
Type::I64 => { | Type::I64 => { | ||||
schema_builder.add_i64_field(field_name, int_options); | schema_builder.add_i64_field(field_name, int_options); | ||||
} | } | ||||
@@ -177,8 +177,8 @@ fn ask_add_field(schema_builder: &mut SchemaBuilder) { | |||||
Type::Str => { | Type::Str => { | ||||
ask_add_field_text(&field_name, schema_builder); | ask_add_field_text(&field_name, schema_builder); | ||||
} | } | ||||
Type::U64 | Type::Date | Type::I64 => { | |||||
// Type::U64 | Type::F64 | Type::Date | Type::I64 => { | |||||
//Type::U64 | Type::Date | Type::I64 => { | |||||
Type::U64 | Type::F64 | Type::Date | Type::I64 => { | |||||
ask_add_num_field_with_options(&field_name, field_type, schema_builder); | ask_add_num_field_with_options(&field_name, field_type, schema_builder); | ||||
} | } | ||||
Type::HierarchicalFacet => { | Type::HierarchicalFacet => { | ||||
@@ -1,5 +1,6 @@ | |||||
use clap::ArgMatches; | use clap::ArgMatches; | ||||
use serde_json; | use serde_json; | ||||
use std::str::FromStr; | |||||
use std::convert::From; | use std::convert::From; | ||||
use std::path::Path; | use std::path::Path; | ||||
use std::path::PathBuf; | use std::path::PathBuf; | ||||
@@ -8,11 +9,24 @@ use tantivy::query::QueryParser; | |||||
use tantivy::schema::Field; | use tantivy::schema::Field; | ||||
use tantivy::schema::FieldType; | use tantivy::schema::FieldType; | ||||
use tantivy::Index; | use tantivy::Index; | ||||
use tantivy::collector::{Count, TopDocs}; | |||||
use tantivy::Document; | |||||
pub fn run_search_cli(matches: &ArgMatches) -> Result<(), String> { | pub fn run_search_cli(matches: &ArgMatches) -> Result<(), String> { | ||||
let index_directory = PathBuf::from(matches.value_of("index").unwrap()); | let index_directory = PathBuf::from(matches.value_of("index").unwrap()); | ||||
let query = matches.value_of("query").unwrap(); | let query = matches.value_of("query").unwrap(); | ||||
run_search(&index_directory, &query).map_err(|e| format!("{:?}", e)) | |||||
match matches.value_of("num_hits") { | |||||
Some(num_hits_str) => { | |||||
let num_hits: usize = FromStr::from_str(num_hits_str) | |||||
.map_err(|e| { format!("Failed to parse --num_hits (got '{}', expected integer): {}", num_hits_str, e) })?; | |||||
run_top_search(&index_directory, &query, num_hits) | |||||
.map_err(|e| format!("{:?}", e)) | |||||
} | |||||
None => { | |||||
run_search(&index_directory, &query).map_err(|e| format!("{:?}", e)) | |||||
} | |||||
} | |||||
} | } | ||||
fn run_search(directory: &Path, query: &str) -> tantivy::Result<()> { | fn run_search(directory: &Path, query: &str) -> tantivy::Result<()> { | ||||
@@ -20,15 +34,13 @@ fn run_search(directory: &Path, query: &str) -> tantivy::Result<()> { | |||||
let schema = index.schema(); | let schema = index.schema(); | ||||
let default_fields: Vec<Field> = schema | let default_fields: Vec<Field> = schema | ||||
.fields() | .fields() | ||||
.iter() | |||||
.enumerate() | |||||
.filter(|&(_, ref field_entry)| match *field_entry.field_type() { | .filter(|&(_, ref field_entry)| match *field_entry.field_type() { | ||||
FieldType::Str(ref text_field_options) => { | FieldType::Str(ref text_field_options) => { | ||||
text_field_options.get_indexing_options().is_some() | text_field_options.get_indexing_options().is_some() | ||||
} | } | ||||
_ => false, | _ => false, | ||||
}) | }) | ||||
.map(|(i, _)| Field(i as u32)) | |||||
.map(|(field, _)| field) | |||||
.collect(); | .collect(); | ||||
let query_parser = QueryParser::new(schema.clone(), default_fields, index.tokenizers().clone()); | let query_parser = QueryParser::new(schema.clone(), default_fields, index.tokenizers().clone()); | ||||
let query = query_parser.parse_query(query)?; | let query = query_parser.parse_query(query)?; | ||||
@@ -36,7 +48,7 @@ fn run_search(directory: &Path, query: &str) -> tantivy::Result<()> { | |||||
let weight = query.weight(&searcher, false)?; | let weight = query.weight(&searcher, false)?; | ||||
let schema = index.schema(); | let schema = index.schema(); | ||||
for segment_reader in searcher.segment_readers() { | for segment_reader in searcher.segment_readers() { | ||||
let mut scorer = weight.scorer(segment_reader)?; | |||||
let mut scorer = weight.scorer(segment_reader, 1.0)?; | |||||
let store_reader = segment_reader.get_store_reader(); | let store_reader = segment_reader.get_store_reader(); | ||||
while scorer.advance() { | while scorer.advance() { | ||||
let doc_id = scorer.doc(); | let doc_id = scorer.doc(); | ||||
@@ -47,3 +59,40 @@ fn run_search(directory: &Path, query: &str) -> tantivy::Result<()> { | |||||
} | } | ||||
Ok(()) | Ok(()) | ||||
} | } | ||||
fn run_top_search(directory: &Path, query: &str, num_hits: usize) -> tantivy::Result<()> { | |||||
let index = Index::open_in_dir(directory)?; | |||||
let schema = index.schema(); | |||||
let default_fields: Vec<Field> = schema | |||||
.fields() | |||||
.filter(|(_, field_entry)| { | |||||
match field_entry.field_type() { | |||||
FieldType::Str(ref text_field_options) => { | |||||
text_field_options.get_indexing_options().is_some() | |||||
} | |||||
_ => false, | |||||
} | |||||
}) | |||||
.map(|(field, _)| field) | |||||
.collect(); | |||||
let query_parser = QueryParser::new(schema.clone(), default_fields, index.tokenizers().clone()); | |||||
let query = query_parser.parse_query(query)?; | |||||
let searcher = index.reader()?.searcher(); | |||||
let (top_docs, num_hits) = searcher.search(&query, &(TopDocs::with_limit(num_hits), Count))?; | |||||
let mut out = String::with_capacity(1024); | |||||
top_docs | |||||
.iter() | |||||
.take(num_hits) | |||||
.for_each(|(_score, doc_address)| { | |||||
let doc: Document = searcher.doc(*doc_address).unwrap(); | |||||
let named_doc = schema.to_named_doc(&doc); | |||||
let json_doc: String = serde_json::to_string(&named_doc).unwrap(); | |||||
out.push_str(&format!("{}\n", json_doc)); | |||||
}); | |||||
print!("{}", out); | |||||
Ok(()) | |||||
} | |||||
@@ -75,7 +75,7 @@ impl IndexServer { | |||||
let index = Index::open_in_dir(path).unwrap(); | let index = Index::open_in_dir(path).unwrap(); | ||||
index.tokenizers().register( | index.tokenizers().register( | ||||
"commoncrawl", | "commoncrawl", | ||||
SimpleTokenizer | |||||
TextAnalyzer::from(SimpleTokenizer) | |||||
.filter(RemoveLongFilter::limit(40)) | .filter(RemoveLongFilter::limit(40)) | ||||
.filter(LowerCaser) | .filter(LowerCaser) | ||||
.filter(AlphaNumOnlyFilter) | .filter(AlphaNumOnlyFilter) | ||||
@@ -84,15 +84,13 @@ impl IndexServer { | |||||
let schema = index.schema(); | let schema = index.schema(); | ||||
let default_fields: Vec<Field> = schema | let default_fields: Vec<Field> = schema | ||||
.fields() | .fields() | ||||
.iter() | |||||
.enumerate() | |||||
.filter(|&(_, ref field_entry)| match *field_entry.field_type() { | .filter(|&(_, ref field_entry)| match *field_entry.field_type() { | ||||
FieldType::Str(ref text_field_options) => { | FieldType::Str(ref text_field_options) => { | ||||
text_field_options.get_indexing_options().is_some() | text_field_options.get_indexing_options().is_some() | ||||
} | } | ||||
_ => false, | _ => false, | ||||
}) | }) | ||||
.map(|(i, _)| Field(i as u32)) | |||||
.map(|(field, _)| field) | |||||
.collect(); | .collect(); | ||||
let query_parser = | let query_parser = | ||||
QueryParser::new(schema.clone(), default_fields, index.tokenizers().clone()); | QueryParser::new(schema.clone(), default_fields, index.tokenizers().clone()); | ||||
@@ -37,7 +37,8 @@ fn main() { | |||||
.help("Tantivy index directory filepath") | .help("Tantivy index directory filepath") | ||||
.required(true); | .required(true); | ||||
let cli_options = App::new("Tantivy") | |||||
#[allow(unused_mut)] | |||||
let mut cli_menu = App::new("Tantivy") | |||||
.setting(AppSettings::SubcommandRequiredElseHelp) | .setting(AppSettings::SubcommandRequiredElseHelp) | ||||
.version(env!("CARGO_PKG_VERSION")) | .version(env!("CARGO_PKG_VERSION")) | ||||
.author("Paul Masurel <paul.masurel@gmail.com>") | .author("Paul Masurel <paul.masurel@gmail.com>") | ||||
@@ -99,43 +100,73 @@ fn main() { | |||||
.value_name("query") | .value_name("query") | ||||
.help("Query") | .help("Query") | ||||
.required(true)) | .required(true)) | ||||
) | |||||
.subcommand( | |||||
SubCommand::with_name("bench") | |||||
.about("Run a benchmark on your index") | |||||
.arg(index_arg.clone()) | |||||
.arg(Arg::with_name("queries") | |||||
.short("q") | |||||
.long("queries") | |||||
.value_name("queries") | |||||
.help("File containing queries (one per line) to run in the benchmark.") | |||||
.required(true)) | |||||
.arg(Arg::with_name("num_repeat") | |||||
.arg(Arg::with_name("num_hits") | |||||
.short("n") | .short("n") | ||||
.long("num_repeat") | |||||
.value_name("num_repeat") | |||||
.help("Number of times to repeat the benchmark.") | |||||
.default_value("1")) | |||||
.long("num_hits") | |||||
.value_name("num_hits") | |||||
.help("Limit number of search results to top <num_hits> hits") | |||||
.takes_value(true) | |||||
.required(false)) | |||||
) | ) | ||||
.subcommand( | .subcommand( | ||||
SubCommand::with_name("merge") | SubCommand::with_name("merge") | ||||
.about("Merge all the segments of an index") | .about("Merge all the segments of an index") | ||||
.arg(index_arg.clone()) | .arg(index_arg.clone()) | ||||
) | |||||
.get_matches(); | |||||
); | |||||
#[cfg(feature = "bench")] | |||||
{ | |||||
cli_menu = cli_menu | |||||
.subcommand( | |||||
SubCommand::with_name("bench") | |||||
.about("Run a benchmark on your index") | |||||
.arg(index_arg.clone()) | |||||
.arg(Arg::with_name("queries") | |||||
.short("q") | |||||
.long("queries") | |||||
.value_name("queries") | |||||
.help("File containing queries (one per line) to run in the benchmark.") | |||||
.required(true)) | |||||
.arg(Arg::with_name("num_repeat") | |||||
.short("n") | |||||
.long("num_repeat") | |||||
.value_name("num_repeat") | |||||
.help("Number of times to repeat the benchmark.") | |||||
.default_value("1")) | |||||
) | |||||
} | |||||
let cli_options = cli_menu.get_matches(); | |||||
let (subcommand, some_options) = cli_options.subcommand(); | let (subcommand, some_options) = cli_options.subcommand(); | ||||
let options = some_options.unwrap(); | let options = some_options.unwrap(); | ||||
let run_cli = match subcommand { | let run_cli = match subcommand { | ||||
"new" => run_new_cli, | "new" => run_new_cli, | ||||
"index" => run_index_cli, | "index" => run_index_cli, | ||||
"serve" => run_serve_cli, | "serve" => run_serve_cli, | ||||
"search" => run_search_cli, | "search" => run_search_cli, | ||||
"merge" => run_merge_cli, | "merge" => run_merge_cli, | ||||
"bench" => run_bench_cli, | |||||
_ => panic!("Subcommand {} is unknown", subcommand), | |||||
//"bench" => run_bench_cli, | |||||
#[allow(unused)] | |||||
other => { | |||||
#[cfg(feature = "bench")] | |||||
{ | |||||
if other == "bench" { | |||||
run_bench_cli | |||||
} else { | |||||
panic!("Subcommand {} is unknown", subcommand) | |||||
} | |||||
} | |||||
#[cfg(not(feature = "bench"))] | |||||
{ | |||||
panic!("Subcommand {} is unknown", subcommand) | |||||
} | |||||
} | |||||
}; | }; | ||||
if let Err(ref e) = run_cli(options) { | if let Err(ref e) = run_cli(options) { | ||||
let stderr = &mut std::io::stderr(); | let stderr = &mut std::io::stderr(); | ||||
let errmsg = "Error writing ot stderr"; | let errmsg = "Error writing ot stderr"; | ||||