Browse Source

Working with tantivy's master.

develop
Paul Masurel 7 years ago
parent
commit
296e6a37bf
7 changed files with 90 additions and 51 deletions
  1. +4
    -4
      Cargo.toml
  2. +1
    -1
      src/commands/bench.rs
  3. +65
    -28
      src/commands/index.rs
  4. +1
    -1
      src/commands/merge.rs
  5. +10
    -9
      src/commands/new.rs
  6. +1
    -1
      src/commands/serve.rs
  7. +8
    -7
      src/main.rs

+ 4
- 4
Cargo.toml View File

@@ -13,8 +13,8 @@ keywords = ["search", "information", "retrieval"]
license = "MIT"

[dependencies]
#tantivy = { path = "../tantivy" }
tantivy = "0.1.1"
tantivy = { path = "../tantivy" }
#tantivy = "0.1.1"
time = "0.1.34"
iron = "0.4"
staticfile = "0.3.0"
@@ -24,7 +24,7 @@ clap = "2"
ansi_term = "0.8.0"
urlencoded = "0.4"
mount = "0.2.1"
chan = "*"


# [dependencies.clap]
@@ -34,4 +34,4 @@ mount = "0.2.1"

[[bin]]
name = "tantivy"
path = "src/main.rs"
path = "src/main.rs"

+ 1
- 1
src/commands/bench.rs View File

@@ -57,7 +57,7 @@ fn run_bench(index_path: &Path,
println!("-------------------------------\n\n\n");
let index = try!(Index::open(index_path).map_err(|e| format!("Failed to open index.\n{:?}", e)));
let searcher = try!(index.searcher().map_err(|e| format!("Failed to acquire searcher.\n{:?}", e)));
let searcher = index.searcher();
let default_search_fields: Vec<Field> = extract_search_fields(&index.schema());
let queries = try!(read_query_file(query_filepath).map_err(|e| format!("Failed reading the query file: {}", e)));
let query_parser = QueryParser::new(index.schema(), default_search_fields);


+ 65
- 28
src/commands/index.rs View File

@@ -5,11 +5,14 @@ use std::io::BufRead;
use std::io::BufReader;
use std::io::Read;
use std::path::PathBuf;
use tantivy;
use tantivy;
use tantivy::Index;
use tantivy::IndexWriter;
use tantivy::Document;
use time::PreciseTime;
use clap::ArgMatches;

use chan;
use std::thread;

pub fn run_index_cli(argmatch: &ArgMatches) -> Result<(), String> {
let index_directory = PathBuf::from(argmatch.value_of("index").unwrap());
@@ -22,7 +25,7 @@ pub fn run_index_cli(argmatch: &ArgMatches) -> Result<(), String> {
}
};
let num_threads = try!(value_t!(argmatch, "num_threads", usize).map_err(|_|format!("Failed to read num_threads argument as an integer.")));
run_index(index_directory, document_source, num_threads).map_err(|e| format!("Indexing failed : {:?}", e))
run_index(index_directory, document_source, num_threads).map_err(|e| format!("Indexing failed : {:?}", e))
}

enum DocumentSource {
@@ -31,12 +34,44 @@ enum DocumentSource {
}

fn run_index(directory: PathBuf, document_source: DocumentSource, num_threads: usize) -> tantivy::Result<()> {
let index = try!(Index::open(&directory));
let schema = index.schema();
let mut index_writer = try!(
let (line_sender, line_receiver) = chan::sync(10_000);
let (doc_sender, doc_receiver) = chan::sync(10_000);

thread::spawn(move || {
let articles = document_source.read().unwrap();
for article_line_res in articles.lines() {
let article_line = article_line_res.unwrap();
line_sender.send(article_line);
}
});

// using 3 threads to parse the json documents
for _ in 0..3 {

let schema_clone = schema.clone();
let doc_sender_clone = doc_sender.clone();
let line_receiver_clone = line_receiver.clone();

thread::spawn(move || {
for article_line in line_receiver_clone {
match schema_clone.parse_document(&article_line) {
Ok(doc) => {
doc_sender_clone.send(doc);
}
Err(err) => {
println!("Failed to add document doc {:?}", err);
}
}
}
});
}
drop(doc_sender);


let mut index_writer = try!(
if num_threads > 0 {
index.writer_with_num_threads(num_threads)
}
@@ -44,23 +79,29 @@ fn run_index(directory: PathBuf, document_source: DocumentSource, num_threads: u
index.writer()
}
);
let articles = try!(document_source.read());


let index_result = index_documents(&mut index_writer, doc_receiver);
match index_result {
Ok(docstamp) => {
println!("Commit succeed, docstamp at {}", docstamp);
Ok(())
}
Err(e) => {
println!("Error during indexing, rollbacking.");
index_writer.rollback().unwrap();
println!("Rollback succeeded");
Err(e)
}
}
}

fn index_documents(index_writer: &mut IndexWriter, doc_receiver: chan::Receiver<Document>) -> tantivy::Result<u64> {
let group_count = 100_000;
let mut num_docs = 0;
let mut cur = PreciseTime::now();
let group_count = 100000;
for article_line_res in articles.lines() {
let article_line = article_line_res.unwrap(); // TODO
match schema.parse_document(&article_line) {
Ok(doc) => {
index_writer.add_document(doc).unwrap();
}
Err(err) => {
println!("Failed to add document doc {:?}", err);
}
}
for doc in doc_receiver {
try!(index_writer.add_document(doc));
if num_docs > 0 && (num_docs % group_count == 0) {
println!("{} Docs", num_docs);
let new = PreciseTime::now();
@@ -68,12 +109,9 @@ fn run_index(directory: PathBuf, document_source: DocumentSource, num_threads: u
println!("{:?} docs / hour", group_count * 3600 * 1_000_000 as u64 / (elapsed.num_microseconds().unwrap() as u64));
cur = new;
}

num_docs += 1;

}
index_writer.wait().unwrap(); // TODO
Ok(())
index_writer.commit()
}


@@ -90,7 +128,7 @@ impl DocumentSource {
Ok(match self {
&DocumentSource::FromPipe => {
BufReader::new(Box::new(io::stdin()))
}
}
&DocumentSource::FromFile(ref filepath) => {
let read_file = try!(File::open(&filepath));
BufReader::new(Box::new(read_file))
@@ -98,4 +136,3 @@ impl DocumentSource {
})
}
}


+ 1
- 1
src/commands/merge.rs View File

@@ -12,7 +12,7 @@ pub fn run_merge_cli(argmatch: &ArgMatches) -> Result<(), String> {

fn run_merge(path: PathBuf) -> tantivy::Result<()> {
let index = try!(Index::open(&path));
let segments = index.segments();
let segments = try!(index.segments());
let mut index_writer = try!(index.writer());
index_writer.merge(&segments)
}

+ 10
- 9
src/commands/new.rs View File

@@ -72,7 +72,7 @@ fn prompt_yn(msg: &str) -> bool {
}


fn ask_add_field_text(field_name: &str, schema: &mut Schema) {
fn ask_add_field_text(field_name: &str, schema_builder: &mut SchemaBuilder) {
let mut text_options = TextOptions::new();
if prompt_yn("Should the field be stored") {
text_options = text_options.set_stored();
@@ -100,11 +100,11 @@ fn ask_add_field_text(field_name: &str, schema: &mut Schema) {
TextIndexingOptions::Unindexed
};
text_options = text_options.set_indexing_options(indexing_options);
schema.add_text_field(field_name, text_options);
schema_builder.add_text_field(field_name, text_options);
}


fn ask_add_field_u32(field_name: &str, schema: &mut Schema) {
fn ask_add_field_u32(field_name: &str, schema_builder: &mut SchemaBuilder) {
let mut u32_options = U32Options::new();
if prompt_yn("Should the field be stored") {
u32_options = u32_options.set_stored();
@@ -115,31 +115,32 @@ fn ask_add_field_u32(field_name: &str, schema: &mut Schema) {
if prompt_yn("Should the field be indexed") {
u32_options = u32_options.set_indexed();
}
schema.add_u32_field(field_name, u32_options);
schema_builder.add_u32_field(field_name, u32_options);
}

fn ask_add_field(schema: &mut Schema) {
fn ask_add_field(schema_builder: &mut SchemaBuilder) {
println!("\n\n");
let field_name = prompt_input("New field name ", field_name_validate);
let text_or_integer = prompt_options("Text or unsigned 32-bit Integer", vec!('T', 'I'));
if text_or_integer =='T' {
ask_add_field_text(&field_name, schema);
ask_add_field_text(&field_name, schema_builder);
}
else {
ask_add_field_u32(&field_name, schema);
ask_add_field_u32(&field_name, schema_builder);
}
}

fn run_new(directory: PathBuf) -> tantivy::Result<()> {
println!("\n{} ", Style::new().bold().fg(Green).paint("Creating new index"));
println!("{} ", Style::new().bold().fg(Green).paint("Let's define it's schema!"));
let mut schema = Schema::new();
let mut schema_builder = SchemaBuilder::new();
loop {
ask_add_field(&mut schema);
ask_add_field(&mut schema_builder);
if !prompt_yn("Add another field") {
break;
}
}
let schema = schema_builder.build();
let schema_json = format!("{}", json::as_pretty_json(&schema));
println!("\n{}\n", Style::new().fg(Green).paint(schema_json));
let mut index = try!(Index::create(&directory, schema));


+ 1
- 1
src/commands/serve.rs View File

@@ -111,7 +111,7 @@ impl IndexServer {
fn search(&self, q: String, num_hits: usize, explain: bool) -> Result<Serp> {
let query = self.query_parser.parse_query(&q).unwrap();
let searcher = self.index.searcher().unwrap();
let searcher = self.index.searcher();
let mut count_collector = CountCollector::new();
let mut top_collector = TopCollector::with_limit(num_hits);
let mut timer_tree = TimerTree::new();


+ 8
- 7
src/main.rs View File

@@ -7,6 +7,7 @@ extern crate time;
extern crate persistent;
extern crate urlencoded;
extern crate iron;
extern crate chan;
extern crate staticfile;
extern crate ansi_term;
extern crate mount;
@@ -23,7 +24,7 @@ fn main() {
.value_name("directory")
.help("Tantivy index directory filepath")
.required(true);
let cli_options = App::new("Tantivy")
.setting(AppSettings::SubcommandRequiredElseHelp)
.version("0.1")
@@ -33,7 +34,7 @@ fn main() {
SubCommand::with_name("new")
.about("Create a new index. The schema will be populated with a simple example schema")
.arg(index_arg.clone())
)
)
.subcommand(
SubCommand::with_name("serve")
.about("Start a server")
@@ -50,7 +51,7 @@ fn main() {
.help("Port")
.default_value("localhost")
)
)
)
.subcommand(
SubCommand::with_name("index")
.about("Index files")
@@ -90,11 +91,11 @@ fn main() {
.arg(index_arg.clone())
)
.get_matches();
let (subcommand, some_options) = cli_options.subcommand();
let options = some_options.unwrap();
match subcommand {
"new" => run_new_cli(options).unwrap(),
"index" => run_index_cli(options).unwrap(),
@@ -109,4 +110,4 @@ fn main() {
},
_ => {}
}
}
}

Loading…
Cancel
Save