From 27cb9d41e17366ad14451265709fcfe39fc9ccda Mon Sep 17 00:00:00 2001
From: Paul Masurel <paul.masurel@gmail.com>
Date: Mon, 15 Aug 2016 00:16:58 +0900
Subject: [PATCH] Added readme

---
 README.md             | 169 ++++++++++++++++++++++++++++++++++++++++++
 src/commands/serve.rs |  72 ++++++++++++------
 2 files changed, 220 insertions(+), 21 deletions(-)
 create mode 100644 README.md

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..fb9f409
--- /dev/null
+++ b/README.md
@@ -0,0 +1,169 @@
+Tantivy-cli is a command line interface for the [tantivy search engine](https://github.com/fulmicoton/tantivy).
+
+
+
+# Tutorial: Indexing Wikipedia with Tantivy CLI
+
+## Introduction
+
+In this tutorial, we will create a brand new index containing the articles of the English Wikipedia.
+
+## Install
+
+There are two ways to get `tantivy`.
+If you are a Rust programmer, you can run `cargo install tantivy-cli`.
+Alternatively, if you are on 64-bit Linux, you can download a
+static binary: [binaries/linux_x86_64/](http://fulmicoton.com/tantivy-files/binaries/linux_x86_64/tantivy)
+
+## Creating the index
+
+Create a directory in which your index will be stored.
+
+```bash
+    # create the directory
+    mkdir wikipedia-index
+```
+
+
+We will now initialize the index and create its schema.
+
+Our documents will contain
+* a title
+* a body 
+* a url
+
+Running `tantivy new` will start a wizard that will help you go through
+the definition of the schema of our new index.
+
+```bash
+    tantivy new -i wikipedia-index
+```
+
+When asked, answer the questions as follows:
+
+```none
+
+    Creating new index 
+    Let's define it's schema! 
+
+
+
+    New field name  ? title
+    Text or unsigned 32-bit Integer (T/I) ? T
+    Should the field be stored (Y/N) ? Y
+    Should the field be indexed (Y/N) ? Y
+    Should the field be tokenized (Y/N) ? Y
+    Should the term frequencies (per doc) be in the index (Y/N) ? Y
+    Should the term positions (per doc) be in the index (Y/N) ? Y
+    Add another field (Y/N) ? Y
+
+
+
+    New field name  ? body
+    Text or unsigned 32-bit Integer (T/I) ? T
+    Should the field be stored (Y/N) ? Y
+    Should the field be indexed (Y/N) ? Y
+    Should the field be tokenized (Y/N) ? Y
+    Should the term frequencies (per doc) be in the index (Y/N) ? Y
+    Should the term positions (per doc) be in the index (Y/N) ? Y
+    Add another field (Y/N) ? Y
+
+
+
+    New field name  ? url
+    Text or unsigned 32-bit Integer (T/I) ? T
+    Should the field be stored (Y/N) ? Y
+    Should the field be indexed (Y/N) ? N
+    Add another field (Y/N) ? N
+
+    [
+    {
+        "name": "title",
+        "type": "text",
+        "options": {
+        "indexing": "position",
+        "stored": true
+        }
+    },
+    {
+        "name": "body",
+        "type": "text",
+        "options": {
+        "indexing": "position",
+        "stored": true
+        }
+    },
+    {
+        "name": "url",
+        "type": "text",
+        "options": {
+        "indexing": "unindexed",
+        "stored": true
+        }
+    }
+    ]
+
+
+```
+
+If you want to know more about the meaning of these options, you can check out the [schema doc page](http://fulmicoton.com/tantivy/tantivy/schema/index.html).  
+
+The json displayed at the end has been written in `wikipedia-index/meta.json`.
+
+
+# Get the documents to index
+
+Tantivy's index command offers a way to index a json file.
+More accurately, the file must contain one document per line, in a json format.
+The structure of this JSON object must match that of our schema definition.
+
+```json
+    {"body": "some text", "title": "some title", "url": "http://somedomain.com"}
+```
+
+You can download a corpus of more than 5 million articles from Wikipedia,
+already in the right format, here: [wiki-articles.json (2.34 GB)](https://www.dropbox.com/s/wwnfnu441w1ec9p/wiki-articles.json.bz2?dl=0).
+If you are in a rush you can [download 100 articles in the right format here](http://fulmicoton.com/tantivy-files/wiki-articles-1000.json).
+
+Make sure to uncompress the file:
+
+```bash
+    bunzip2 wiki-articles.json.bz2
+``` 
+
+# Index the documents.
+
+The `index` command will index your documents.
+By default it will use as many threads as there are cores on your machine.
+
+On my computer (8 core Xeon(R) CPU X3450  @ 2.67GHz), it only takes 7 minutes.
+
+```
+    cat wiki-articles.json | tantivy index -i wikipedia-index
+```
+
+While it is indexing, you can peek at the index directory
+to check what is happening.
+
+```bash
+    ls wikipedia-index
+```
+
+If you indexed the 5 million articles, you should see a lot of files.
+The main file is `meta.json`.
+
+Our index is in fact divided into segments. Each segment acts as an individual smaller index.
+Each segment is named by a UUID.
+Each of a segment's files stores a different data structure of the index.
+
+
+# Serve the search index
+
+```
+    tantivy serve -i wikipedia-index
+```
+
+You can start a small server with a JSON API to search Wikipedia.
+By default, the server listens on port `3000`.
+
+
diff --git a/src/commands/serve.rs b/src/commands/serve.rs
index 10bdd85..921a3e0 100644
--- a/src/commands/serve.rs
+++ b/src/commands/serve.rs
@@ -1,3 +1,20 @@
+/// This tantivy command starts an HTTP server (by default on port 3000)
+/// 
+/// Currently the only entrypoint is /api/
+/// and it takes the following query string argument
+/// 
+/// - `q=` :    your query
+/// - `nhits=` : the number of hits that should be returned. (default to 10)  
+/// - `explain=` : if true returns some information about the score.  
+///
+///
+/// For instance, the following call should return the 20 most relevant
+/// hits for fulmicoton.
+///
+///     http://localhost:3000/api/?q=fulmicoton&explain=false&nhits=20
+///
+
+
 use clap::ArgMatches;
 use iron::mime::Mime;
 use iron::prelude::*;
@@ -7,8 +24,11 @@ use mount::Mount;
 use persistent::Read;
 use rustc_serialize::json::as_pretty_json;
 use std::convert::From;
+use std::error::Error;
+use std::fmt::{self, Debug};
 use std::path::Path;
 use std::path::PathBuf;
+use std::str::FromStr;
 use tantivy;
 use tantivy::collector;
 use tantivy::collector::CountCollector;
@@ -19,12 +39,12 @@ use tantivy::query::Explanation;
 use tantivy::query::Query;
 use tantivy::query::QueryParser;
 use tantivy::Result;
-use tantivy::schema::Schema;
+use tantivy::schema::Field;
+use tantivy::schema::FieldType;
 use tantivy::schema::NamedFieldDocument;
+use tantivy::schema::Schema;
+use tantivy::TimerTree;
 use urlencoded::UrlEncodedQuery;
-use std::str::FromStr;
-use std::fmt::{self, Debug};
-use std::error::Error;
 
 pub fn run_serve_cli(matches: &ArgMatches) -> tantivy::Result<()> {
     let index_directory = PathBuf::from(matches.value_of("index").unwrap());
@@ -40,7 +60,7 @@ struct Serp {
     q: String,
     num_hits: usize,
     hits: Vec<Hit>,
-    timings: Vec<Timing>,
+    timings: TimerTree,
 }
 
 #[derive(RustcEncodable)]
@@ -49,12 +69,6 @@ struct Hit {
     explain: Option<Explanation>,
 }
 
-#[derive(RustcEncodable)]
-struct Timing {
-    name: String,
-    duration: i64,
-}
-
 struct IndexServer {
     index: Index,
     query_parser: QueryParser,
@@ -66,9 +80,21 @@ impl IndexServer {
     fn load(path: &Path) -> IndexServer {
         let index = Index::open(path).unwrap();
         let schema = index.schema();
-        let body_field = schema.get_field("body").unwrap();
-        let title_field = schema.get_field("title").unwrap();
-        let query_parser = QueryParser::new(schema.clone(), vec!(body_field, title_field));
+        let default_fields: Vec<Field> = schema
+            .fields()
+            .iter()
+            .enumerate()
+            .filter(
+                |&(_, ref field_entry)| {
+                    match *field_entry.field_type() {
+                        FieldType::Str(_) => true,
+                        FieldType::U32(_) => false
+                    }
+                }
+            )
+            .map(|(i, _)| Field(i as u8))
+            .collect();
+        let query_parser = QueryParser::new(schema.clone(), default_fields);
         IndexServer {
             index: index,
             query_parser: query_parser,
@@ -88,14 +114,17 @@ impl IndexServer {
         let searcher = self.index.searcher().unwrap();
         let mut count_collector = CountCollector::new();
         let mut top_collector = TopCollector::with_limit(num_hits);
-
+        let mut timer_tree = TimerTree::new();
         {
+            let _search_timer = timer_tree.open("search");
             let mut chained_collector = collector::chain()
-                    .add(&mut top_collector)
-                    .add(&mut count_collector);
+                .add(&mut top_collector)
+                .add(&mut count_collector);
             try!(query.search(&searcher, &mut chained_collector));
         }
-        let hits: Vec<Hit> = top_collector.docs()
+        let hits: Vec<Hit> = {
+            let _fetching_timer = timer_tree.open("fetching docs");
+            top_collector.docs()
                 .iter()
                 .map(|doc_address| {
                     let doc: Document = searcher.doc(doc_address).unwrap();
@@ -108,12 +137,13 @@ impl IndexServer {
                     }
                     self.create_hit(&doc, explanation)
                 })
-                .collect();
+                .collect()
+        };
         Ok(Serp {
             q: q,
-            hits: hits,
             num_hits: count_collector.count(),
-            timings: Vec::new(),
+            hits: hits,
+            timings: timer_tree,
         })
     }
 }