diff --git a/README.md b/README.md
new file mode 100644
index 0000000..fb9f409
--- /dev/null
+++ b/README.md
@@ -0,0 +1,169 @@
Tantivy-cli is the command line interface for the [tantivy search engine](https://github.com/fulmicoton/tantivy).


# Tutorial: Indexing Wikipedia with Tantivy CLI

## Introduction

In this tutorial, we will create a brand new index containing the articles of the English wikipedia.

## Install

There are two ways to get `tantivy`.
If you are a Rust programmer, you can run `cargo install tantivy-cli`.
Alternatively, if you are on 64-bit Linux, you can download a
static binary: [binaries/linux_x86_64/](http://fulmicoton.com/tantivy-files/binaries/linux_x86_64/tantivy)

## Creating the index

Create a directory in which your index will be stored.

```bash
 # create the directory
 mkdir wikipedia-index
```

We will now initialize the index and create its schema.

Our documents will contain:
* a title
* a body
* a url

Running `tantivy new` starts a wizard that walks you through
defining the schema of our new index.

```bash
 tantivy new -i wikipedia-index
```

Answer the wizard's questions as follows:

```none

  Creating new index
  Let's define it's schema!



  New field name ? title
  Text or unsigned 32-bit Integer (T/I) ? T
  Should the field be stored (Y/N) ? Y
  Should the field be indexed (Y/N) ? Y
  Should the field be tokenized (Y/N) ? Y
  Should the term frequencies (per doc) be in the index (Y/N) ? Y
  Should the term positions (per doc) be in the index (Y/N) ? Y
  Add another field (Y/N) ? Y



  New field name ? body
  Text or unsigned 32-bit Integer (T/I) ? T
  Should the field be stored (Y/N) ? Y
  Should the field be indexed (Y/N) ? Y
  Should the field be tokenized (Y/N) ? Y
  Should the term frequencies (per doc) be in the index (Y/N) ? Y
  Should the term positions (per doc) be in the index (Y/N) ? Y
  Add another field (Y/N) ? Y



  New field name ? url
  Text or unsigned 32-bit Integer (T/I) ? T
  Should the field be stored (Y/N) ? Y
  Should the field be indexed (Y/N) ? N
  Add another field (Y/N) ? N

  [
    {
      "name": "title",
      "type": "text",
      "options": {
        "indexing": "position",
        "stored": true
      }
    },
    {
      "name": "body",
      "type": "text",
      "options": {
        "indexing": "position",
        "stored": true
      }
    },
    {
      "name": "url",
      "type": "text",
      "options": {
        "indexing": "unindexed",
        "stored": true
      }
    }
  ]

```

If you want to know more about the meaning of these options, you can check out the [schema doc page](http://fulmicoton.com/tantivy/tantivy/schema/index.html).

The JSON displayed at the end has been written to `wikipedia-index/meta.json`.


## Get the documents to index

Tantivy's `index` command offers a way to index a JSON file.
More precisely, the file must contain one document per line, in JSON format.
The structure of each JSON object must match that of our schema definition.

```json
 {"body": "some text", "title": "some title", "url": "http://somedomain.com"}
```

You can download a corpus of more than 5 million wikipedia articles,
already in the right format, here: [wiki-articles.json (2.34 GB)](https://www.dropbox.com/s/wwnfnu441w1ec9p/wiki-articles.json.bz2?dl=0).
If you are in a rush, you can [download 100 articles in the right format here](http://fulmicoton.com/tantivy-files/wiki-articles-1000.json).
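
Each line of the dump is one JSON document, so you can quickly sanity-check the format before indexing. A minimal check, assuming the compressed dump is named `wiki-articles.json.bz2` as above:

```bash
 # print the first document without decompressing the whole archive
 bzcat wiki-articles.json.bz2 | head -1
```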

Make sure to decompress the file:

```bash
 bunzip2 wiki-articles.json.bz2
```

## Index the documents

The `index` command will index your documents.
By default, it will use as many threads as there are cores on your machine.

On my computer (8-core Xeon(R) CPU X3450 @ 2.67GHz), it only takes 7 minutes.

```bash
 cat wiki-articles.json | tantivy index -i wikipedia-index
```

While it is indexing, you can peek at the index directory
to check what is happening.

```bash
 ls wikipedia-index
```

If you indexed the 5 million articles, you should see a lot of files appear in the directory.
The main file is `meta.json`.

Our index is in fact divided into segments. Each segment acts as an individual smaller index
and is named by a UUID.
Each of the different files stores a different data structure of the index.


## Serve the search index

The `serve` command starts a small server with a JSON API to search the index.

```bash
 tantivy serve -i wikipedia-index
```

By default, the server listens on port `3000`.


diff --git a/src/commands/serve.rs b/src/commands/serve.rs
index 10bdd85..921a3e0 100644
--- a/src/commands/serve.rs
+++ b/src/commands/serve.rs
@@ -1,3 +1,20 @@
+/// This tantivy command starts an HTTP server (by default on port 3000).
+///
+/// Currently the only entry point is /api/
+/// and it takes the following query string arguments:
+///
+/// - `q=`: your query
+/// - `nhits=`: the number of hits that should be returned (defaults to 10)
+/// - `explain=`: if true, returns some information about the score
+///
+///
+/// For instance, the following call should return the 20 most relevant
+/// hits for the query `fulmicoton`:
+///
+/// http://localhost:3000/api/?q=fulmicoton&explain=false&nhits=20
+///
+
 use clap::ArgMatches;
 use iron::mime::Mime;
 use iron::prelude::*;
@@ -7,8 +24,11 @@ use mount::Mount;
 use persistent::Read;
 use rustc_serialize::json::as_pretty_json;
 use std::convert::From;
+use std::error::Error;
+use std::fmt::{self, Debug};
 use std::path::Path;
 use std::path::PathBuf;
+use std::str::FromStr;
 use tantivy;
 use tantivy::collector;
 use tantivy::collector::CountCollector;
@@ -19,12 +39,12 @@ use tantivy::query::Explanation;
 use tantivy::query::Query;
 use tantivy::query::QueryParser;
 use tantivy::Result;
-use tantivy::schema::Schema;
+use tantivy::schema::Field;
+use tantivy::schema::FieldType;
 use tantivy::schema::NamedFieldDocument;
+use tantivy::schema::Schema;
+use tantivy::TimerTree;
 use urlencoded::UrlEncodedQuery;
-use std::str::FromStr;
-use std::fmt::{self, Debug};
-use std::error::Error;

 pub fn run_serve_cli(matches: &ArgMatches) -> tantivy::Result<()> {
     let index_directory = PathBuf::from(matches.value_of("index").unwrap());
@@ -40,7 +60,7 @@ struct Serp {
     q: String,
     num_hits: usize,
     hits: Vec<Hit>,
-    timings: Vec<Timing>,
+    timings: TimerTree,
 }

 #[derive(RustcEncodable)]
@@ -49,12 +69,6 @@ struct Hit {
     explain: Option<Explanation>,
 }

-#[derive(RustcEncodable)]
-struct Timing {
-    name: String,
-    duration: i64,
-}
-
 struct IndexServer {
     index: Index,
     query_parser: QueryParser,
@@ -66,9 +80,21 @@ impl IndexServer {
     fn load(path: &Path) -> IndexServer {
         let index = Index::open(path).unwrap();
         let schema = index.schema();
-        let body_field = schema.get_field("body").unwrap();
-        let title_field = schema.get_field("title").unwrap();
-        let query_parser = QueryParser::new(schema.clone(), vec!(body_field, title_field));
+        let default_fields: Vec<Field> = schema
+            .fields()
+            .iter()
+            .enumerate()
+            .filter(
+                |&(_, ref field_entry)| {
+                    match *field_entry.field_type() {
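+                        // Text fields are kept as default search fields; integer (u32) fields are skipped.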
+                        FieldType::Str(_) => true,
+                        FieldType::U32(_) => false
+                    }
+                }
+            )
+            .map(|(i, _)| Field(i as u8))
+            .collect();
+        let query_parser = QueryParser::new(schema.clone(), default_fields);
         IndexServer {
             index: index,
             query_parser: query_parser,
@@ -88,14 +114,17 @@ impl IndexServer {
         let searcher = self.index.searcher().unwrap();
         let mut count_collector = CountCollector::new();
         let mut top_collector = TopCollector::with_limit(num_hits);
-
+        let mut timer_tree = TimerTree::new();
         {
+            let _search_timer = timer_tree.open("search");
             let mut chained_collector = collector::chain()
-                .add(&mut top_collector)
-                .add(&mut count_collector);
+                    .add(&mut top_collector)
+                    .add(&mut count_collector);
             try!(query.search(&searcher, &mut chained_collector));
         }
-        let hits: Vec<Hit> = top_collector.docs()
+        let hits: Vec<Hit> = {
+            let _fetching_timer = timer_tree.open("fetching docs");
+            top_collector.docs()
             .iter()
             .map(|doc_address| {
                 let doc: Document = searcher.doc(doc_address).unwrap();
@@ -108,12 +137,13 @@ impl IndexServer {
             }
             self.create_hit(&doc, explanation)
         })
-        .collect();
+        .collect()
+        };
         Ok(Serp {
             q: q,
-            hits: hits,
             num_hits: count_collector.count(),
-            timings: Vec::new(),
+            hits: hits,
+            timings: timer_tree,
         })
     }
 }
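
Once the server from the README above is running, the `/api/` entry point described in the doc comment can be exercised with a plain HTTP GET. A minimal sketch, reusing the example query from the doc comment and assuming the default port 3000:

```bash
 curl "http://localhost:3000/api/?q=fulmicoton&explain=false&nhits=20"
```

Since the handler serializes the `Serp` struct with `as_pretty_json`, the response should be pretty-printed JSON containing `q`, `num_hits`, `hits`, and `timings`.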