@@ -1,6 +1,6 @@ | |||
[package] | |||
name = "tantivy-cli" | |||
version = "0.7.0" | |||
version = "0.9.0" | |||
authors = ["Paul Masurel <paul.masurel@gmail.com>"] | |||
description = """Command line interface for Tantivy, a search engine library.""" | |||
@@ -30,7 +30,7 @@ byteorder = "0.5" | |||
log = "0.3" | |||
futures = "0.1" | |||
env_logger = "0.3" | |||
tantivy = "0.7" | |||
tantivy = "0.9" | |||
[[bin]] | |||
name = "tantivy" | |||
@@ -1,36 +1,32 @@ | |||
use tantivy::Index; | |||
use tantivy::schema::{Field, Schema}; | |||
use tantivy::query::QueryParser; | |||
use std::path::Path; | |||
use std::io::BufReader; | |||
use std::io::BufRead; | |||
use std::io; | |||
use std::fs::File; | |||
use tantivy::collector::chain; | |||
use tantivy::collector::TopCollector; | |||
use tantivy::collector::CountCollector; | |||
use clap::ArgMatches; | |||
use std::fs::File; | |||
use std::io; | |||
use std::io::BufRead; | |||
use std::io::BufReader; | |||
use std::path::Path; | |||
use std::path::PathBuf; | |||
use tantivy::collector::{Count, TopDocs}; | |||
use tantivy::query::QueryParser; | |||
use tantivy::schema::{Field, Schema}; | |||
use tantivy::Index; | |||
use timer::TimerTree; | |||
pub fn run_bench_cli(matches: &ArgMatches) -> Result<(), String> { | |||
let index_path = PathBuf::from(matches.value_of("index").unwrap()); | |||
let queries_path = PathBuf::from(matches.value_of("queries").unwrap()); // the unwrap is safe as long as it is comming from the main cli. | |||
let num_repeat = value_t!(matches, "num_repeat", usize).map_err(|e| format!("Failed to read num_repeat argument as an integer. {:?}", e))?; | |||
let num_repeat = value_t!(matches, "num_repeat", usize) | |||
.map_err(|e| format!("Failed to read num_repeat argument as an integer. {:?}", e))?; | |||
run_bench(&index_path, &queries_path, num_repeat).map_err(From::from) | |||
} | |||
fn extract_search_fields(schema: &Schema) -> Vec<Field> { | |||
schema.fields() | |||
.iter() | |||
.enumerate() | |||
.filter(|&(_, field_entry)| { | |||
field_entry.is_indexed() | |||
}) | |||
.map(|(field_id, _)| Field(field_id as u32)) | |||
.collect() | |||
schema | |||
.fields() | |||
.iter() | |||
.enumerate() | |||
.filter(|&(_, field_entry)| field_entry.is_indexed()) | |||
.map(|(field_id, _)| Field(field_id as u32)) | |||
.collect() | |||
} | |||
fn read_query_file(query_path: &Path) -> io::Result<Vec<String>> { | |||
@@ -43,60 +39,70 @@ fn read_query_file(query_path: &Path) -> io::Result<Vec<String>> { | |||
Ok(queries) | |||
} | |||
fn run_bench(index_path: &Path, | |||
query_filepath: &Path, | |||
num_repeat: usize) -> Result<(), String> { | |||
fn run_bench(index_path: &Path, query_filepath: &Path, num_repeat: usize) -> Result<(), String> { | |||
println!("index_path : {:?}", index_path); | |||
println!("Query : {:?}", index_path); | |||
println!("-------------------------------\n\n\n"); | |||
let index = Index::open_in_dir(index_path).map_err(|e| format!("Failed to open index.\n{:?}", e))?; | |||
let searcher = index.searcher(); | |||
let index = | |||
Index::open_in_dir(index_path).map_err(|e| format!("Failed to open index.\n{:?}", e))?; | |||
let searcher = index | |||
.reader() | |||
.map_err(|err| format!("{:?}", err))? | |||
.searcher(); | |||
let default_search_fields: Vec<Field> = extract_search_fields(&index.schema()); | |||
let queries = read_query_file(query_filepath).map_err(|e| format!("Failed reading the query file: {}", e))?; | |||
let query_parser = QueryParser::new(index.schema(), default_search_fields, index.tokenizers().clone()); | |||
let queries = read_query_file(query_filepath) | |||
.map_err(|e| format!("Failed reading the query file: {}", e))?; | |||
let query_parser = QueryParser::new( | |||
index.schema(), | |||
default_search_fields, | |||
index.tokenizers().clone(), | |||
); | |||
println!("SEARCH\n"); | |||
println!("{}\t{}\t{}\t{}", "query", "num_terms", "num hits", "time in microsecs"); | |||
println!( | |||
"{}\t{}\t{}\t{}", | |||
"query", "num_terms", "num hits", "time in microsecs" | |||
); | |||
for _ in 0..num_repeat { | |||
for query_txt in &queries { | |||
let query = query_parser.parse_query(&query_txt).unwrap(); | |||
// let num_terms = query.num_terms(); | |||
let mut top_collector = TopCollector::with_limit(10); | |||
let mut count_collector = CountCollector::default(); | |||
let mut timing = TimerTree::default(); | |||
{ | |||
let (_top_docs, count) = { | |||
let _search = timing.open("search"); | |||
let mut collector = chain().push(&mut top_collector).push(&mut count_collector); | |||
query.search(&searcher, &mut collector) | |||
.map_err(|e| format!("Failed while searching query {:?}.\n\n{:?}", query_txt, e))?; | |||
} | |||
println!("{}\t{}\t{}", query_txt, count_collector.count(), timing.total_time()); | |||
searcher | |||
.search(&query, &(TopDocs::with_limit(10), Count)) | |||
.map_err(|e| { | |||
format!("Failed while searching query {:?}.\n\n{:?}", query_txt, e) | |||
})? | |||
}; | |||
println!("{}\t{}\t{}", query_txt, count, timing.total_time()); | |||
} | |||
} | |||
println!("\n\nFETCH STORE\n"); | |||
println!("{}\t{}", "query", "time in microsecs"); | |||
for _ in 0..num_repeat { | |||
for query_txt in &queries { | |||
let query = query_parser.parse_query(&query_txt).unwrap(); | |||
let mut top_collector = TopCollector::with_limit(10); | |||
query.search(&searcher, &mut top_collector) | |||
.map_err(|e| format!("Failed while retrieving document for query {:?}.\n{:?}", query, e))?; | |||
let top_docs = searcher | |||
.search(&*query, &TopDocs::with_limit(10)) | |||
.map_err(|e| { | |||
format!( | |||
"Failed while retrieving document for query {:?}.\n{:?}", | |||
query, e | |||
) | |||
})?; | |||
let mut timer = TimerTree::default(); | |||
{ | |||
let _scoped_timer_ = timer.open("total"); | |||
for doc_address in top_collector.docs() { | |||
for (_score, doc_address) in top_docs { | |||
searcher.doc(doc_address).unwrap(); | |||
} | |||
} | |||
println!("{}\t{}", query_txt, timer.total_time()); | |||
} | |||
} | |||
Ok(()) | |||
} | |||
@@ -1,42 +1,53 @@ | |||
use chan; | |||
use clap::ArgMatches; | |||
use std::cmp; | |||
use std::convert::From; | |||
use std::fs::File; | |||
use std::io; | |||
use std::cmp; | |||
use std::io::BufRead; | |||
use std::io::BufReader; | |||
use std::io::Read; | |||
use std::path::PathBuf; | |||
use std::thread; | |||
use tantivy; | |||
use tantivy::merge_policy::NoMergePolicy; | |||
use tantivy::Document; | |||
use tantivy::Index; | |||
use tantivy::IndexWriter; | |||
use tantivy::Document; | |||
use time::PreciseTime; | |||
use clap::ArgMatches; | |||
use chan; | |||
use tantivy::merge_policy::NoMergePolicy; | |||
use std::thread; | |||
pub fn run_index_cli(argmatch: &ArgMatches) -> Result<(), String> { | |||
let index_directory = PathBuf::from(argmatch.value_of("index").unwrap()); | |||
let document_source = argmatch.value_of("file") | |||
let document_source = argmatch | |||
.value_of("file") | |||
.map(|path| DocumentSource::FromFile(PathBuf::from(path))) | |||
.unwrap_or(DocumentSource::FromPipe); | |||
let no_merge = argmatch.is_present("nomerge"); | |||
let mut num_threads = value_t!(argmatch, "num_threads", usize).map_err(|_| format!("Failed to read num_threads argument as an integer."))?; | |||
let mut num_threads = value_t!(argmatch, "num_threads", usize) | |||
.map_err(|_| format!("Failed to read num_threads argument as an integer."))?; | |||
if num_threads == 0 { | |||
num_threads = 1; | |||
} | |||
let buffer_size = value_t!(argmatch, "memory_size", usize).map_err(|_| format!("Failed to read the buffer size argument as an integer."))?; | |||
let buffer_size = value_t!(argmatch, "memory_size", usize) | |||
.map_err(|_| format!("Failed to read the buffer size argument as an integer."))?; | |||
let buffer_size_per_thread = buffer_size / num_threads; | |||
run_index(index_directory, document_source, buffer_size_per_thread, num_threads, no_merge).map_err(|e| format!("Indexing failed : {:?}", e)) | |||
run_index( | |||
index_directory, | |||
document_source, | |||
buffer_size_per_thread, | |||
num_threads, | |||
no_merge, | |||
) | |||
.map_err(|e| format!("Indexing failed : {:?}", e)) | |||
} | |||
fn run_index(directory: PathBuf, | |||
document_source: DocumentSource, | |||
buffer_size_per_thread: usize, | |||
num_threads: usize, | |||
no_merge: bool) -> tantivy::Result<()> { | |||
fn run_index( | |||
directory: PathBuf, | |||
document_source: DocumentSource, | |||
buffer_size_per_thread: usize, | |||
num_threads: usize, | |||
no_merge: bool, | |||
) -> tantivy::Result<()> { | |||
let index = Index::open_in_dir(&directory)?; | |||
let schema = index.schema(); | |||
let (line_sender, line_receiver) = chan::sync(10_000); | |||
@@ -76,7 +87,7 @@ fn run_index(directory: PathBuf, | |||
} else { | |||
index.writer(buffer_size_per_thread) | |||
}?; | |||
if no_merge { | |||
index_writer.set_merge_policy(Box::new(NoMergePolicy)); | |||
} | |||
@@ -88,8 +99,6 @@ fn run_index(directory: PathBuf, | |||
info!("Indexing the documents took {} s", duration.num_seconds()); | |||
} | |||
match index_result { | |||
Ok(docstamp) => { | |||
println!("Commit succeed, docstamp at {}", docstamp); | |||
@@ -98,7 +107,10 @@ fn run_index(directory: PathBuf, | |||
println!("Terminated successfully!"); | |||
{ | |||
let duration = start_overall.to(PreciseTime::now()); | |||
info!("Indexing the documents took {} s overall (indexing + merge)", duration.num_seconds()); | |||
info!( | |||
"Indexing the documents took {} s overall (indexing + merge)", | |||
duration.num_seconds() | |||
); | |||
} | |||
Ok(()) | |||
} | |||
@@ -111,7 +123,10 @@ fn run_index(directory: PathBuf, | |||
} | |||
} | |||
fn index_documents(index_writer: &mut IndexWriter, doc_receiver: chan::Receiver<Document>) -> tantivy::Result<u64> { | |||
fn index_documents( | |||
index_writer: &mut IndexWriter, | |||
doc_receiver: chan::Receiver<Document>, | |||
) -> tantivy::Result<u64> { | |||
let group_count = 100_000; | |||
let mut num_docs = 0; | |||
let mut cur = PreciseTime::now(); | |||
@@ -121,7 +136,11 @@ fn index_documents(index_writer: &mut IndexWriter, doc_receiver: chan::Receiver< | |||
println!("{} Docs", num_docs); | |||
let new = PreciseTime::now(); | |||
let elapsed = cur.to(new); | |||
println!("{:?} docs / hour", group_count * 3600 * 1_000_000 as u64 / (elapsed.num_microseconds().unwrap() as u64)); | |||
println!( | |||
"{:?} docs / hour", | |||
group_count * 3600 * 1_000_000 as u64 | |||
/ (elapsed.num_microseconds().unwrap() as u64) | |||
); | |||
cur = new; | |||
} | |||
num_docs += 1; | |||
@@ -129,18 +148,15 @@ fn index_documents(index_writer: &mut IndexWriter, doc_receiver: chan::Receiver< | |||
index_writer.commit() | |||
} | |||
enum DocumentSource { | |||
FromPipe, | |||
FromFile(PathBuf), | |||
} | |||
impl DocumentSource { | |||
fn read(&self,) -> io::Result<BufReader<Box<Read>>> { | |||
fn read(&self) -> io::Result<BufReader<Box<Read>>> { | |||
Ok(match self { | |||
&DocumentSource::FromPipe => { | |||
BufReader::new(Box::new(io::stdin())) | |||
} | |||
&DocumentSource::FromPipe => BufReader::new(Box::new(io::stdin())), | |||
&DocumentSource::FromFile(ref filepath) => { | |||
let read_file = File::open(&filepath)?; | |||
BufReader::new(Box::new(read_file)) | |||
@@ -1,13 +1,12 @@ | |||
extern crate tantivy; | |||
use tantivy::{Index, SegmentMeta}; | |||
use std::path::PathBuf; | |||
use clap::ArgMatches; | |||
use futures::Future; | |||
use std::path::PathBuf; | |||
use tantivy::{Index, SegmentMeta}; | |||
const HEAP_SIZE: usize = 300_000_000; | |||
fn error_msg(err: tantivy::Error) -> String { | |||
format!("Merge failed : {:?}", err) | |||
} | |||
@@ -15,12 +14,10 @@ fn error_msg(err: tantivy::Error) -> String { | |||
pub fn run_merge_cli(argmatch: &ArgMatches) -> Result<(), String> { | |||
let index_directory = PathBuf::from(argmatch.value_of("index").unwrap()); | |||
run_merge(index_directory).map_err(error_msg) | |||
// we rollback to force a gc. | |||
// we rollback to force a gc. | |||
} | |||
fn run_merge(path: PathBuf) -> tantivy::Result<()> { | |||
let index = Index::open_in_dir(&path)?; | |||
let segments = index.searchable_segment_ids()?; | |||
@@ -29,7 +26,7 @@ fn run_merge(path: PathBuf) -> tantivy::Result<()> { | |||
.merge(&segments)? | |||
.wait() | |||
.expect("Merge failed"); | |||
//.map_err(|_| tantivy::Error::ErrorInThread(String::from("Merge got cancelled"))); | |||
//.map_err(|_| tantivy::Error::ErrorInThread(String::from("Merge got cancelled"))); | |||
println!("Merge finished with segment meta {:?}", segment_meta); | |||
println!("Garbage collect irrelevant segments."); | |||
Index::open_in_dir(&path)? | |||
@@ -1,13 +1,13 @@ | |||
mod index; | |||
mod serve; | |||
mod new; | |||
mod bench; | |||
mod index; | |||
mod merge; | |||
mod new; | |||
mod search; | |||
mod serve; | |||
pub use self::new::run_new_cli; | |||
pub use self::index::run_index_cli; | |||
pub use self::serve::run_serve_cli; | |||
pub use self::search::run_search_cli; | |||
pub use self::bench::run_bench_cli; | |||
pub use self::index::run_index_cli; | |||
pub use self::merge::run_merge_cli; | |||
pub use self::new::run_new_cli; | |||
pub use self::search::run_search_cli; | |||
pub use self::serve::run_serve_cli; |
@@ -1,30 +1,35 @@ | |||
use ansi_term::Colour::{Blue, Green, Red}; | |||
use ansi_term::Style; | |||
use clap::ArgMatches; | |||
use serde_json; | |||
use std::convert::From; | |||
use std::io; | |||
use std::io::Write; | |||
use std::path::PathBuf; | |||
use tantivy; | |||
use tantivy::schema::Cardinality; | |||
use tantivy::schema::*; | |||
use tantivy::Index; | |||
use std::io; | |||
use ansi_term::Style; | |||
use ansi_term::Colour::{Red, Blue, Green}; | |||
use std::io::Write; | |||
use serde_json; | |||
pub fn run_new_cli(matches: &ArgMatches) -> Result<(), String> { | |||
let index_directory = PathBuf::from(matches.value_of("index").unwrap()); | |||
run_new(index_directory).map_err(|e| format!("{:?}" , e)) | |||
run_new(index_directory).map_err(|e| format!("{:?}", e)) | |||
} | |||
fn prompt_input<P: Fn(&str) -> Result<(), String>>(prompt_text: &str, predicate: P) -> String { | |||
loop { | |||
print!("{prompt_text:<width$} ? ", prompt_text=Style::new().bold().fg(Blue).paint(prompt_text), width=40); | |||
print!( | |||
"{prompt_text:<width$} ? ", | |||
prompt_text = Style::new().bold().fg(Blue).paint(prompt_text), | |||
width = 40 | |||
); | |||
io::stdout().flush().unwrap(); | |||
let mut buffer = String::new(); | |||
io::stdin().read_line(&mut buffer).ok().expect("Failed to read line"); | |||
let answer = buffer.trim_right().to_string(); | |||
io::stdin() | |||
.read_line(&mut buffer) | |||
.ok() | |||
.expect("Failed to read line"); | |||
let answer = buffer.trim_end().to_string(); | |||
match predicate(&answer) { | |||
Ok(()) => { | |||
return answer; | |||
@@ -36,30 +41,28 @@ fn prompt_input<P: Fn(&str) -> Result<(), String>>(prompt_text: &str, predicate: | |||
} | |||
} | |||
fn field_name_validate(field_name: &str) -> Result<(), String> { | |||
if is_valid_field_name(field_name) { | |||
Ok(()) | |||
} | |||
else { | |||
Err(String::from("Field name must match the pattern [_a-zA-Z0-9]+")) | |||
} else { | |||
Err(String::from( | |||
"Field name must match the pattern [_a-zA-Z0-9]+", | |||
)) | |||
} | |||
} | |||
fn prompt_options(msg: &str, codes: Vec<char>) -> char { | |||
let options_string: Vec<String> = codes.iter().map(|c| format!("{}", c)).collect(); | |||
let options = options_string.join("/"); | |||
let predicate = |entry: &str| { | |||
if entry.len() != 1 { | |||
return Err(format!("Invalid input. Options are ({})", options)) | |||
return Err(format!("Invalid input. Options are ({})", options)); | |||
} | |||
let c = entry.chars().next().unwrap().to_ascii_uppercase(); | |||
if codes.contains(&c) { | |||
return Ok(()) | |||
} | |||
else { | |||
return Err(format!("Invalid input. Options are ({})", options)) | |||
return Ok(()); | |||
} else { | |||
return Err(format!("Invalid input. Options are ({})", options)); | |||
} | |||
}; | |||
let message = format!("{} ({})", msg, options); | |||
@@ -68,30 +71,28 @@ fn prompt_options(msg: &str, codes: Vec<char>) -> char { | |||
} | |||
fn prompt_yn(msg: &str) -> bool { | |||
prompt_options(msg, vec!('Y', 'N')) == 'Y' | |||
prompt_options(msg, vec!['Y', 'N']) == 'Y' | |||
} | |||
fn ask_add_field_text(field_name: &str, schema_builder: &mut SchemaBuilder) { | |||
let mut text_options = TextOptions::default(); | |||
if prompt_yn("Should the field be stored") { | |||
text_options = text_options.set_stored(); | |||
} | |||
if prompt_yn("Should the field be indexed") { | |||
let mut text_indexing_options = TextFieldIndexing | |||
::default() | |||
let mut text_indexing_options = TextFieldIndexing::default() | |||
.set_index_option(IndexRecordOption::Basic) | |||
.set_tokenizer("en_stem"); | |||
if prompt_yn("Should the term be tokenized?") { | |||
if prompt_yn("Should the term frequencies (per doc) be in the index") { | |||
if prompt_yn("Should the term positions (per doc) be in the index") { | |||
text_indexing_options = text_indexing_options.set_index_option(IndexRecordOption::WithFreqsAndPositions); | |||
text_indexing_options = text_indexing_options | |||
.set_index_option(IndexRecordOption::WithFreqsAndPositions); | |||
} else { | |||
text_indexing_options = text_indexing_options.set_index_option(IndexRecordOption::WithFreqs); | |||
text_indexing_options = | |||
text_indexing_options.set_index_option(IndexRecordOption::WithFreqs); | |||
} | |||
} | |||
} else { | |||
@@ -101,11 +102,9 @@ fn ask_add_field_text(field_name: &str, schema_builder: &mut SchemaBuilder) { | |||
text_options = text_options.set_indexing_options(text_indexing_options); | |||
} | |||
schema_builder.add_text_field(field_name, text_options); | |||
} | |||
fn ask_add_field_u64(field_name: &str, schema_builder: &mut SchemaBuilder) { | |||
let mut u64_options = IntOptions::default(); | |||
if prompt_yn("Should the field be stored") { | |||
@@ -123,20 +122,28 @@ fn ask_add_field_u64(field_name: &str, schema_builder: &mut SchemaBuilder) { | |||
fn ask_add_field(schema_builder: &mut SchemaBuilder) { | |||
println!("\n\n"); | |||
let field_name = prompt_input("New field name ", field_name_validate); | |||
let text_or_integer = prompt_options("Text or unsigned 32-bit integer", vec!('T', 'I')); | |||
if text_or_integer =='T' { | |||
let text_or_integer = prompt_options("Text or unsigned 32-bit integer", vec!['T', 'I']); | |||
if text_or_integer == 'T' { | |||
ask_add_field_text(&field_name, schema_builder); | |||
} | |||
else { | |||
ask_add_field_u64(&field_name, schema_builder); | |||
} else { | |||
ask_add_field_u64(&field_name, schema_builder); | |||
} | |||
} | |||
fn run_new(directory: PathBuf) -> tantivy::Result<()> { | |||
println!("\n{} ", Style::new().bold().fg(Green).paint("Creating new index")); | |||
println!("{} ", Style::new().bold().fg(Green).paint("Let's define it's schema!")); | |||
println!( | |||
"\n{} ", | |||
Style::new().bold().fg(Green).paint("Creating new index") | |||
); | |||
println!( | |||
"{} ", | |||
Style::new() | |||
.bold() | |||
.fg(Green) | |||
.paint("Let's define it's schema!") | |||
); | |||
let mut schema_builder = SchemaBuilder::default(); | |||
loop { | |||
loop { | |||
ask_add_field(&mut schema_builder); | |||
if !prompt_yn("Add another field") { | |||
break; | |||
@@ -148,4 +155,3 @@ fn run_new(directory: PathBuf) -> tantivy::Result<()> { | |||
Index::create_in_dir(&directory, schema)?; | |||
Ok(()) | |||
} | |||
@@ -1,49 +1,46 @@ | |||
use clap::ArgMatches; | |||
use serde_json; | |||
use std::convert::From; | |||
use std::path::Path; | |||
use std::path::PathBuf; | |||
use tantivy; | |||
use tantivy::Index; | |||
use tantivy::query::QueryParser; | |||
use tantivy::schema::Field; | |||
use serde_json; | |||
use tantivy::schema::FieldType; | |||
use tantivy::Index; | |||
pub fn run_search_cli(matches: &ArgMatches) -> Result<(), String> { | |||
let index_directory = PathBuf::from(matches.value_of("index").unwrap()); | |||
let query = matches.value_of("query").unwrap(); | |||
run_search(&index_directory, &query) | |||
.map_err(|e| format!("{:?}", e)) | |||
run_search(&index_directory, &query).map_err(|e| format!("{:?}", e)) | |||
} | |||
fn run_search(directory: &Path, query: &str) -> tantivy::Result<()> { | |||
fn run_search(directory: &Path, query: &str) -> tantivy::Result<()> { | |||
let index = Index::open_in_dir(directory)?; | |||
let schema = index.schema(); | |||
let default_fields: Vec<Field> = schema | |||
.fields() | |||
.iter() | |||
.enumerate() | |||
.filter( | |||
|&(_, ref field_entry)| | |||
match *field_entry.field_type() { | |||
FieldType::Str(ref text_field_options) => { | |||
text_field_options.get_indexing_options().is_some() | |||
}, | |||
_ => false | |||
} | |||
) | |||
.filter(|&(_, ref field_entry)| match *field_entry.field_type() { | |||
FieldType::Str(ref text_field_options) => { | |||
text_field_options.get_indexing_options().is_some() | |||
} | |||
_ => false, | |||
}) | |||
.map(|(i, _)| Field(i as u32)) | |||
.collect(); | |||
let query_parser = QueryParser::new(schema.clone(), default_fields, index.tokenizers().clone()); | |||
let query = query_parser.parse_query(query)?; | |||
let searcher = index.searcher(); | |||
let searcher = index.reader()?.searcher(); | |||
let weight = query.weight(&searcher, false)?; | |||
let schema = index.schema(); | |||
for segment_reader in searcher.segment_readers() { | |||
let mut scorer = weight.scorer(segment_reader)?; | |||
let store_reader = segment_reader.get_store_reader(); | |||
while scorer.advance() { | |||
let doc_id = scorer.doc(); | |||
let doc = segment_reader.doc(doc_id)?; | |||
let doc = store_reader.get(doc_id)?; | |||
let named_doc = schema.to_named_doc(&doc); | |||
println!("{}", serde_json::to_string(&named_doc).unwrap()); | |||
} | |||
@@ -1,8 +1,8 @@ | |||
/// This tantivy command starts a http server (by default on port 3000) | |||
/// | |||
/// | |||
/// Currently the only entrypoint is /api/ | |||
/// and it takes the following query string argument | |||
/// | |||
/// | |||
/// - `q=` : your query | |||
/// - `nhits`: the number of hits that should be returned. (default to 10) | |||
/// | |||
@@ -12,16 +12,14 @@ | |||
/// | |||
/// http://localhost:3000/api/?q=fulmicoton&&nhits=20 | |||
/// | |||
use clap::ArgMatches; | |||
use iron::mime::Mime; | |||
use iron::prelude::*; | |||
use iron::status; | |||
use serde_json; | |||
use iron::typemap::Key; | |||
use mount::Mount; | |||
use persistent::Read; | |||
use serde_json; | |||
use std::convert::From; | |||
use std::error::Error; | |||
use std::fmt::{self, Debug}; | |||
@@ -29,11 +27,7 @@ use std::path::Path; | |||
use std::path::PathBuf; | |||
use std::str::FromStr; | |||
use tantivy; | |||
use tantivy::collector; | |||
use tantivy::collector::CountCollector; | |||
use tantivy::collector::TopCollector; | |||
use tantivy::Document; | |||
use tantivy::Index; | |||
use tantivy::collector::{Count, TopDocs}; | |||
use tantivy::query::QueryParser; | |||
use tantivy::schema::Field; | |||
use tantivy::schema::FieldType; | |||
@@ -41,6 +35,9 @@ use tantivy::schema::NamedFieldDocument; | |||
use tantivy::schema::Schema; | |||
use tantivy::tokenizer::*; | |||
use tantivy::DocAddress; | |||
use tantivy::Document; | |||
use tantivy::Index; | |||
use tantivy::IndexReader; | |||
use timer::TimerTree; | |||
use urlencoded::UrlEncodedQuery; | |||
@@ -52,7 +49,6 @@ pub fn run_serve_cli(matches: &ArgMatches) -> Result<(), String> { | |||
run_serve(index_directory, &host).map_err(|e| format!("{:?}", e)) | |||
} | |||
#[derive(Serialize)] | |||
struct Serp { | |||
q: String, | |||
@@ -68,42 +64,40 @@ struct Hit { | |||
} | |||
struct IndexServer { | |||
index: Index, | |||
reader: IndexReader, | |||
query_parser: QueryParser, | |||
schema: Schema, | |||
} | |||
impl IndexServer { | |||
fn load(path: &Path) -> IndexServer { | |||
let index = Index::open_in_dir(path).unwrap(); | |||
index.tokenizers() | |||
.register("commoncrawl", SimpleTokenizer | |||
.filter(RemoveLongFilter::limit(40)) | |||
.filter(LowerCaser) | |||
.filter(AlphaNumOnlyFilter) | |||
.filter(Stemmer::new()) | |||
index.tokenizers().register( | |||
"commoncrawl", | |||
SimpleTokenizer | |||
.filter(RemoveLongFilter::limit(40)) | |||
.filter(LowerCaser) | |||
.filter(AlphaNumOnlyFilter) | |||
.filter(Stemmer::new(Language::English)), | |||
); | |||
let schema = index.schema(); | |||
let default_fields: Vec<Field> = schema | |||
.fields() | |||
.iter() | |||
.enumerate() | |||
.filter( | |||
|&(_, ref field_entry)| { | |||
match *field_entry.field_type() { | |||
FieldType::Str(ref text_field_options) => { | |||
text_field_options.get_indexing_options().is_some() | |||
}, | |||
_ => false | |||
} | |||
.filter(|&(_, ref field_entry)| match *field_entry.field_type() { | |||
FieldType::Str(ref text_field_options) => { | |||
text_field_options.get_indexing_options().is_some() | |||
} | |||
) | |||
_ => false, | |||
}) | |||
.map(|(i, _)| Field(i as u32)) | |||
.collect(); | |||
let query_parser = QueryParser::new(schema.clone(), default_fields, index.tokenizers().clone()); | |||
let query_parser = | |||
QueryParser::new(schema.clone(), default_fields, index.tokenizers().clone()); | |||
let reader = index.reader().unwrap(); | |||
IndexServer { | |||
index, | |||
reader, | |||
query_parser, | |||
schema, | |||
} | |||
@@ -115,25 +109,23 @@ impl IndexServer { | |||
id: doc_address.doc(), | |||
} | |||
} | |||
fn search(&self, q: String, num_hits: usize) -> tantivy::Result<Serp> { | |||
let query = self.query_parser.parse_query(&q).expect("Parsing the query failed"); | |||
let searcher = self.index.searcher(); | |||
let mut count_collector = CountCollector::default(); | |||
let mut top_collector = TopCollector::with_limit(num_hits); | |||
let query = self | |||
.query_parser | |||
.parse_query(&q) | |||
.expect("Parsing the query failed"); | |||
let searcher = self.reader.searcher(); | |||
let mut timer_tree = TimerTree::default(); | |||
{ | |||
let (top_docs, num_hits) = { | |||
let _search_timer = timer_tree.open("search"); | |||
let mut chained_collector = collector::chain() | |||
.push(&mut top_collector) | |||
.push(&mut count_collector); | |||
query.search(&searcher, &mut chained_collector)?; | |||
} | |||
searcher.search(&query, &(TopDocs::with_limit(num_hits), Count))? | |||
}; | |||
let hits: Vec<Hit> = { | |||
let _fetching_timer = timer_tree.open("fetching docs"); | |||
top_collector.docs() | |||
top_docs | |||
.iter() | |||
.map(|doc_address| { | |||
.map(|(_score, doc_address)| { | |||
let doc: Document = searcher.doc(*doc_address).unwrap(); | |||
self.create_hit(&doc, doc_address) | |||
}) | |||
@@ -141,7 +133,7 @@ impl IndexServer { | |||
}; | |||
Ok(Serp { | |||
q, | |||
num_hits: count_collector.count(), | |||
num_hits, | |||
hits, | |||
timings: timer_tree, | |||
}) | |||
@@ -162,42 +154,53 @@ impl fmt::Display for StringError { | |||
} | |||
impl Error for StringError { | |||
fn description(&self) -> &str { &*self.0 } | |||
fn description(&self) -> &str { | |||
&*self.0 | |||
} | |||
} | |||
fn search(req: &mut Request) -> IronResult<Response> { | |||
let index_server = req.get::<Read<IndexServer>>().unwrap(); | |||
req.get_ref::<UrlEncodedQuery>() | |||
.map_err(|_| IronError::new(StringError(String::from("Failed to decode error")), status::BadRequest)) | |||
.map_err(|_| { | |||
IronError::new( | |||
StringError(String::from("Failed to decode error")), | |||
status::BadRequest, | |||
) | |||
}) | |||
.and_then(|ref qs_map| { | |||
let num_hits: usize = qs_map | |||
.get("nhits") | |||
.and_then(|nhits_str| usize::from_str(&nhits_str[0]).ok()) | |||
.unwrap_or(10); | |||
let query = qs_map | |||
.get("q") | |||
.ok_or_else(|| IronError::new(StringError(String::from("Parameter q is missing from the query")), status::BadRequest))?[0].clone(); | |||
let query = qs_map.get("q").ok_or_else(|| { | |||
IronError::new( | |||
StringError(String::from("Parameter q is missing from the query")), | |||
status::BadRequest, | |||
) | |||
})?[0] | |||
.clone(); | |||
let serp = index_server.search(query, num_hits).unwrap(); | |||
let resp_json = serde_json::to_string_pretty(&serp).unwrap(); | |||
let content_type = "application/json".parse::<Mime>().unwrap(); | |||
Ok(Response::with((content_type, status::Ok, format!("{}", resp_json)))) | |||
Ok(Response::with(( | |||
content_type, | |||
status::Ok, | |||
format!("{}", resp_json), | |||
))) | |||
}) | |||
} | |||
fn run_serve(directory: PathBuf, host: &str) -> tantivy::Result<()> { | |||
let mut mount = Mount::new(); | |||
let server = IndexServer::load(&directory); | |||
mount.mount("/api", search); | |||
let mut middleware = Chain::new(mount); | |||
middleware.link(Read::<IndexServer>::both(server)); | |||
println!("listening on http://{}", host); | |||
Iron::new(middleware).http(host).unwrap(); | |||
Ok(()) | |||
} | |||
@@ -3,41 +3,39 @@ extern crate clap; | |||
extern crate serde_json; | |||
#[macro_use] | |||
extern crate log; | |||
extern crate ansi_term; | |||
extern crate bincode; | |||
extern crate byteorder; | |||
extern crate chan; | |||
extern crate env_logger; | |||
extern crate tantivy; | |||
extern crate time; | |||
extern crate persistent; | |||
extern crate futures; | |||
extern crate urlencoded; | |||
extern crate iron; | |||
extern crate chan; | |||
extern crate staticfile; | |||
extern crate ansi_term; | |||
extern crate mount; | |||
extern crate bincode; | |||
extern crate byteorder; | |||
extern crate persistent; | |||
extern crate staticfile; | |||
extern crate tantivy; | |||
extern crate time; | |||
extern crate urlencoded; | |||
#[macro_use] | |||
extern crate serde_derive; | |||
use std::io::Write; | |||
use clap::{AppSettings, Arg, App, SubCommand}; | |||
use clap::{App, AppSettings, Arg, SubCommand}; | |||
mod commands; | |||
pub mod timer; | |||
use self::commands::*; | |||
fn main() { | |||
env_logger::init().unwrap(); | |||
let index_arg = Arg::with_name("index") | |||
.short("i") | |||
.long("index") | |||
.value_name("directory") | |||
.help("Tantivy index directory filepath") | |||
.required(true); | |||
.short("i") | |||
.long("index") | |||
.value_name("directory") | |||
.help("Tantivy index directory filepath") | |||
.required(true); | |||
let cli_options = App::new("Tantivy") | |||
.setting(AppSettings::SubcommandRequiredElseHelp) | |||
@@ -135,7 +133,7 @@ fn main() { | |||
"search" => run_search_cli, | |||
"merge" => run_merge_cli, | |||
"bench" => run_bench_cli, | |||
_ => panic!("Subcommand {} is unknown", subcommand) | |||
_ => panic!("Subcommand {} is unknown", subcommand), | |||
}; | |||
if let Err(ref e) = run_cli(options) { | |||
@@ -26,7 +26,8 @@ impl<'a> Drop for OpenTimer<'a> { | |||
fn drop(&mut self) { | |||
self.timer_tree.timings.push(Timing { | |||
name: self.name, | |||
duration: self.start | |||
duration: self | |||
.start | |||
.to(PreciseTime::now()) | |||
.num_microseconds() | |||
.unwrap(), | |||