Browse Source

Working with master branch tantivy. Includes commoncrawl analyzers

develop
Paul Masurel 6 years ago
parent
commit
fd35d407b0
7 changed files with 141 additions and 98 deletions
  1. +64
    -39
      Cargo.lock
  2. +2
    -2
      Cargo.toml
  3. +10
    -10
      src/commands/bench.rs
  4. +9
    -12
      src/commands/index.rs
  5. +19
    -18
      src/commands/new.rs
  6. +13
    -4
      src/commands/search.rs
  7. +24
    -13
      src/commands/serve.rs

+ 64
- 39
Cargo.lock View File

@@ -1,28 +1,3 @@
[root]
name = "tantivy-cli"
version = "0.4.2"
dependencies = [
"ansi_term 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)",
"bincode 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)",
"byteorder 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)",
"chan 0.1.19 (registry+https://github.com/rust-lang/crates.io-index)",
"clap 2.25.0 (registry+https://github.com/rust-lang/crates.io-index)",
"env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
"futures 0.1.14 (registry+https://github.com/rust-lang/crates.io-index)",
"iron 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
"mount 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
"persistent 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.10 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_derive 1.0.10 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_json 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
"staticfile 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
"tantivy 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)",
"time 0.1.38 (registry+https://github.com/rust-lang/crates.io-index)",
"urlencoded 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)",
"version 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]
name = "aho-corasick"
version = "0.5.3"
@@ -174,6 +149,11 @@ name = "byteorder"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"

[[package]]
name = "cc"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"

[[package]]
name = "cfg-if"
version = "0.1.2"
@@ -230,7 +210,7 @@ dependencies = [

[[package]]
name = "crossbeam"
version = "0.2.10"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"

[[package]]
@@ -295,13 +275,11 @@ dependencies = [

[[package]]
name = "fst"
version = "0.1.38"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"byteorder 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)",
"byteorder 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
"memmap 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"regex-syntax 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)",
"utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]
@@ -460,6 +438,11 @@ dependencies = [
"libc 0.2.26 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]
name = "maplit"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"

[[package]]
name = "matches"
version = "0.1.6"
@@ -700,6 +683,11 @@ name = "regex-syntax"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"

[[package]]
name = "rust-stemmers"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"

[[package]]
name = "rustc-demangle"
version = "0.1.4"
@@ -847,30 +835,29 @@ dependencies = [

[[package]]
name = "tantivy"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
version = "0.5.0-dev"
dependencies = [
"atomicwrites 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
"bincode 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)",
"bit-set 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"byteorder 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
"cc 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)",
"chan 0.1.19 (registry+https://github.com/rust-lang/crates.io-index)",
"combine 2.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"crossbeam 0.2.10 (registry+https://github.com/rust-lang/crates.io-index)",
"crossbeam 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"error-chain 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)",
"fst 0.1.38 (registry+https://github.com/rust-lang/crates.io-index)",
"fst 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
"futures 0.1.14 (registry+https://github.com/rust-lang/crates.io-index)",
"futures-cpupool 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
"gcc 0.3.51 (registry+https://github.com/rust-lang/crates.io-index)",
"itertools 0.5.10 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.26 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
"lz4 1.21.1 (registry+https://github.com/rust-lang/crates.io-index)",
"memmap 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"num_cpus 1.6.2 (registry+https://github.com/rust-lang/crates.io-index)",
"owning_ref 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
"rust-stemmers 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.10 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_derive 1.0.10 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_json 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -878,11 +865,37 @@ dependencies = [
"tempdir 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
"tempfile 2.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
"time 0.1.38 (registry+https://github.com/rust-lang/crates.io-index)",
"tinysegmenter 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
"uuid 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
"version 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]
name = "tantivy-cli"
version = "0.5.0-alpha"
dependencies = [
"ansi_term 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)",
"bincode 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)",
"byteorder 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)",
"chan 0.1.19 (registry+https://github.com/rust-lang/crates.io-index)",
"clap 2.25.0 (registry+https://github.com/rust-lang/crates.io-index)",
"env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
"futures 0.1.14 (registry+https://github.com/rust-lang/crates.io-index)",
"iron 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
"mount 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
"persistent 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.10 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_derive 1.0.10 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_json 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
"staticfile 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
"tantivy 0.5.0-dev",
"time 0.1.38 (registry+https://github.com/rust-lang/crates.io-index)",
"urlencoded 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)",
"version 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]
name = "tempdir"
version = "0.3.5"
@@ -958,6 +971,15 @@ dependencies = [
"winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]
name = "tinysegmenter"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"lazy_static 0.1.16 (registry+https://github.com/rust-lang/crates.io-index)",
"maplit 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]
name = "traitobject"
version = "0.0.1"
@@ -1124,13 +1146,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum byteorder 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "96c8b41881888cc08af32d47ac4edd52bc7fa27fef774be47a92443756451304"
"checksum byteorder 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "0fc10e8cc6b2580fda3f36eb6dc5316657f812a3df879a44a66fc9f0fdbc4855"
"checksum byteorder 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ff81738b726f5d099632ceaffe7fb65b90212e8dce59d518729e7e8634032d3d"
"checksum cc 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a9b13a57efd6b30ecd6598ebdb302cca617930b5470647570468a65d12ef9719"
"checksum cfg-if 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "d4c819a1287eb618df47cc647173c5c4c66ba19d888a6e50d605672aed3140de"
"checksum chan 0.1.19 (registry+https://github.com/rust-lang/crates.io-index)" = "f93bfe971116428a9066c1c3c69a09ae3ef69432f8418be28ab50f96783e6a50"
"checksum clap 2.25.0 (registry+https://github.com/rust-lang/crates.io-index)" = "867a885995b4184be051b70a592d4d70e32d7a188db6e8dff626af286a962771"
"checksum combine 2.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "9d48b4a5b824c441d0a5194e2aa63491d1c9e5aa2acc5f6e4924f990715afa2a"
"checksum conduit-mime-types 0.7.3 (registry+https://github.com/rust-lang/crates.io-index)" = "95ca30253581af809925ef68c2641cc140d6183f43e12e0af4992d53768bd7b8"
"checksum cookie 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "0e3d6405328b6edb412158b3b7710e2634e23f3614b9bb1c412df7952489a626"
"checksum crossbeam 0.2.10 (registry+https://github.com/rust-lang/crates.io-index)" = "0c5ea215664ca264da8a9d9c3be80d2eaf30923c259d03e870388eb927508f97"
"checksum crossbeam 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8837ab96533202c5b610ed44bc7f4183e7957c1c8f56e8cc78bb098593c8ba0a"
"checksum dbghelp-sys 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "97590ba53bcb8ac28279161ca943a924d1fd4a8fb3fa63302591647c4fc5b850"
"checksum dtoa 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "0dd841b58510c9618291ffa448da2e4e0f699d984d436122372f446dae62263d"
"checksum dtoa 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "80c8b71fd71146990a9742fc06dcbbde19161a267e0ad4e572c35162f4578c90"
@@ -1139,7 +1162,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum error 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "a6e606f14042bb87cc02ef6a14db6c90ab92ed6f62d87e69377bc759fd7987cc"
"checksum error-chain 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "6930e04918388a9a2e41d518c25cf679ccafe26733fb4127dbf21993f2575d46"
"checksum fs2 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "bcd414e5a1a979b931bb92f41b7a54106d3f6d2e6c253e9ce943b7cd468251ef"
"checksum fst 0.1.38 (registry+https://github.com/rust-lang/crates.io-index)" = "4667468a5e6f0eea9cc30ebf1cce752cb831974e319d7fff312aad85652c1596"
"checksum fst 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "11e21bdd626be09f2bd66b44dbb724538176aa3549f3109208db35538dd2699f"
"checksum futures 0.1.14 (registry+https://github.com/rust-lang/crates.io-index)" = "4b63a4792d4f8f686defe3b39b92127fea6344de5d38202b2ee5a11bbbf29d6a"
"checksum futures-cpupool 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "a283c84501e92cade5ea673a2a7ca44f71f209ccdd302a3e0896f50083d2c5ff"
"checksum gcc 0.3.51 (registry+https://github.com/rust-lang/crates.io-index)" = "120d07f202dcc3f72859422563522b66fe6463a4c513df062874daad05f85f0a"
@@ -1160,6 +1183,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum log 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "880f77541efa6e5cc74e76910c9884d9859683118839d6a1dc3b11e63512565b"
"checksum lz4 1.21.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2f7f1c5ea96c6b31551477a743d75c483a6a039319dd33e4e31c27df24361713"
"checksum lz4-sys 1.7.5 (registry+https://github.com/rust-lang/crates.io-index)" = "39e8a451abc4169b50dddbc9b34f93ecabd447330d99d27d518a3f8d4e209e9b"
"checksum maplit 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "22593015b8df7747861c69c28acd32589fb96c1686369f3b661d12e409d4cf65"
"checksum matches 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "100aabe6b8ff4e4a7e32c1c13523379802df0772b82466207ac25b013f193376"
"checksum memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)" = "d8b629fb514376c675b98c1421e80b151d3817ac42d7c667717d282761418d20"
"checksum memchr 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "1dbccc0e46f1ea47b9f17e6d67c5a96bd27030519c519c9c91327e31275a47b4"
@@ -1189,6 +1213,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum regex 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "1731164734096285ec2a5ec7fea5248ae2f5485b3feeb0115af4fda2183b2d1b"
"checksum regex-syntax 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "f9ec002c35e86791825ed294b50008eea9ddfc8def4420124fbc6b08db834957"
"checksum regex-syntax 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "ad890a5eef7953f55427c50575c680c42841653abd2b028b68cd223d157f62db"
"checksum rust-stemmers 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8398e39ef1740238f87fcc4171fccc2231ba7ef1ecd64075d77feb0041927fc7"
"checksum rustc-demangle 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "3058a43ada2c2d0b92b3ae38007a2d0fa5e9db971be260e0171408a4ff471c95"
"checksum rustc-serialize 0.3.24 (registry+https://github.com/rust-lang/crates.io-index)" = "dcf128d1287d2ea9d80910b5f1120d0b8eede3fbf1abe91c40d39ea7d51e6fda"
"checksum rustc_version 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "c5f5376ea5e30ce23c03eb77cbe4962b988deead10910c372b226388b594c084"
@@ -1208,7 +1233,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum strsim 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b4d15c810519a91cf877e7e36e63fe068815c678181439f2f29e2562147c3694"
"checksum syn 0.11.11 (registry+https://github.com/rust-lang/crates.io-index)" = "d3b891b9015c88c576343b9b3e41c2c11a51c219ef067b264bd9c8aa9b441dad"
"checksum synom 0.11.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a393066ed9010ebaed60b9eafa373d4b1baac186dd7e008555b0f702b51945b6"
"checksum tantivy 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "b345d470c3523df9dc2e85444e80165016894ef441a45e2aca86bfcaa1335cb0"
"checksum tempdir 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "87974a6f5c1dfb344d733055601650059a3363de2a6104819293baff662132d6"
"checksum tempfile 2.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "5b92290d7f1ce2d221405d5c78b9c568c9f1debb314aa92a513cd99db709f931"
"checksum term_size 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e2b6b55df3198cc93372e85dd2ed817f0e38ce8cc0f22eb32391bfad9c4bf209"
@@ -1217,6 +1241,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "8576dbbfcaef9641452d5cf0df9b0e7eeab7694956dd33bb61515fb8f18cfdd5"
"checksum thread_local 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "1697c4b57aeeb7a536b647165a2825faddffb1d3bad386d507709bd51a90bb14"
"checksum time 0.1.38 (registry+https://github.com/rust-lang/crates.io-index)" = "d5d788d3aa77bc0ef3e9621256885555368b47bd495c13dd2e7413c89f845520"
"checksum tinysegmenter 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7e767ff68150da3d23c88482da07abd6532e2e928093b80e79dc4818119bbc36"
"checksum traitobject 0.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "07eaeb7689bb7fca7ce15628319635758eda769fed481ecfe6686ddef2600616"
"checksum traitobject 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "efd1f82c56340fdf16f2a953d7bda4f8fdffba13d93b00844c25572110b26079"
"checksum typeable 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "1410f6f91f21d1612654e7cc69193b0334f909dcf2c790c4826254fbb86f8887"


+ 2
- 2
Cargo.toml View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy-cli"
version = "0.4.3"
version = "0.5.0-alpha"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]

description = """Command line interface for Tantivy, a search engine library."""
@@ -31,7 +31,7 @@ log = "0.3"
futures = "0.1"
env_logger = "0.3"
version = "2"
tantivy = "0.4.3"
tantivy = {path="../tantivy"}

[[bin]]
name = "tantivy"


+ 10
- 10
src/commands/bench.rs View File

@@ -17,7 +17,7 @@ use std::path::PathBuf;
pub fn run_bench_cli(matches: &ArgMatches) -> Result<(), String> {
let index_path = PathBuf::from(matches.value_of("index").unwrap());
let queries_path = PathBuf::from(matches.value_of("queries").unwrap()); // the unwrap is safe as long as it is comming from the main cli.
let num_repeat = try!(value_t!(matches, "num_repeat", usize).map_err(|e|format!("Failed to read num_repeat argument as an integer. {:?}", e)));
let num_repeat = value_t!(matches, "num_repeat", usize).map_err(|e| format!("Failed to read num_repeat argument as an integer. {:?}", e))?;
run_bench(&index_path, &queries_path, num_repeat).map_err(From::from)
}

@@ -34,13 +34,11 @@ fn extract_search_fields(schema: &Schema) -> Vec<Field> {
}

fn read_query_file(query_path: &Path) -> io::Result<Vec<String>> {
let query_file: File = try!(File::open(&query_path));
let query_file: File = File::open(&query_path)?;
let file = BufReader::new(&query_file);
let mut queries = Vec::new();
for line_res in file.lines() {
let line = try!(line_res);
let query = String::from(line.trim());
queries.push(query);
queries.push(line_res?);
}
Ok(queries)
}
@@ -54,11 +52,11 @@ fn run_bench(index_path: &Path,
println!("Query : {:?}", index_path);
println!("-------------------------------\n\n\n");
let index = try!(Index::open(index_path).map_err(|e| format!("Failed to open index.\n{:?}", e)));
let index = Index::open(index_path).map_err(|e| format!("Failed to open index.\n{:?}", e))?;
let searcher = index.searcher();
let default_search_fields: Vec<Field> = extract_search_fields(&index.schema());
let queries = try!(read_query_file(query_filepath).map_err(|e| format!("Failed reading the query file: {}", e)));
let query_parser = QueryParser::new(index.schema(), default_search_fields);
let queries = read_query_file(query_filepath).map_err(|e| format!("Failed reading the query file: {}", e))?;
let query_parser = QueryParser::new(index.schema(), default_search_fields, index.tokenizers().clone());
println!("SEARCH\n");
println!("{}\t{}\t{}\t{}", "query", "num_terms", "num hits", "time in microsecs");
@@ -71,7 +69,8 @@ fn run_bench(index_path: &Path,
let timing;
{
let mut collector = chain().push(&mut top_collector).push(&mut count_collector);
timing = try!(query.search(&searcher, &mut collector).map_err(|e| format!("Failed while searching query {:?}.\n\n{:?}", query_txt, e)));
timing = query.search(&searcher, &mut collector)
.map_err(|e| format!("Failed while searching query {:?}.\n\n{:?}", query_txt, e))?;
}
println!("{}\t{}\t{}", query_txt, count_collector.count(), timing.total_time());
}
@@ -84,7 +83,8 @@ fn run_bench(index_path: &Path,
for query_txt in &queries {
let query = query_parser.parse_query(&query_txt).unwrap();
let mut top_collector = TopCollector::with_limit(10);
try!(query.search(&searcher, &mut top_collector).map_err(|e| format!("Failed while retrieving document for query {:?}.\n{:?}", query, e)));
query.search(&searcher, &mut top_collector)
.map_err(|e| format!("Failed while retrieving document for query {:?}.\n{:?}", query, e))?;
let mut timer = TimerTree::default();
{
let _scoped_timer_ = timer.open("total");


+ 9
- 12
src/commands/index.rs View File

@@ -22,11 +22,11 @@ pub fn run_index_cli(argmatch: &ArgMatches) -> Result<(), String> {
.map(|path| DocumentSource::FromFile(PathBuf::from(path)))
.unwrap_or(DocumentSource::FromPipe);
let no_merge = argmatch.is_present("nomerge");
let mut num_threads = try!(value_t!(argmatch, "num_threads", usize).map_err(|_|format!("Failed to read num_threads argument as an integer.")));
let mut num_threads = value_t!(argmatch, "num_threads", usize).map_err(|_| format!("Failed to read num_threads argument as an integer."))?;
if num_threads == 0 {
num_threads = 1;
}
let buffer_size = try!(value_t!(argmatch, "memory_size", usize).map_err(|_|format!("Failed to read the buffer size argument as an integer.")));
let buffer_size = value_t!(argmatch, "memory_size", usize).map_err(|_| format!("Failed to read the buffer size argument as an integer."))?;
let buffer_size_per_thread = buffer_size / num_threads;
run_index(index_directory, document_source, buffer_size_per_thread, num_threads, no_merge).map_err(|e| format!("Indexing failed : {:?}", e))
}
@@ -37,7 +37,7 @@ fn run_index(directory: PathBuf,
num_threads: usize,
no_merge: bool) -> tantivy::Result<()> {
let index = try!(Index::open(&directory));
let index = Index::open(&directory)?;
let schema = index.schema();
let (line_sender, line_receiver) = chan::sync(10_000);
let (doc_sender, doc_receiver) = chan::sync(10_000);
@@ -71,14 +71,11 @@ fn run_index(directory: PathBuf,
}
drop(doc_sender);

let mut index_writer = try!(
if num_threads > 0 {
index.writer_with_num_threads(num_threads, buffer_size_per_thread)
}
else {
index.writer(buffer_size_per_thread)
}
);
let mut index_writer = if num_threads > 0 {
index.writer_with_num_threads(num_threads, buffer_size_per_thread)
} else {
index.writer(buffer_size_per_thread)
}?;
if no_merge {
index_writer.set_merge_policy(Box::new(NoMergePolicy));
@@ -145,7 +142,7 @@ impl DocumentSource {
BufReader::new(Box::new(io::stdin()))
}
&DocumentSource::FromFile(ref filepath) => {
let read_file = try!(File::open(&filepath));
let read_file = File::open(&filepath)?;
BufReader::new(Box::new(read_file))
}
})


+ 19
- 18
src/commands/new.rs View File

@@ -8,7 +8,6 @@ use std::io;
use ansi_term::Style;
use ansi_term::Colour::{Red, Blue, Green};
use std::io::Write;
use std::ascii::AsciiExt;
use serde_json;


@@ -77,29 +76,31 @@ fn ask_add_field_text(field_name: &str, schema_builder: &mut SchemaBuilder) {
if prompt_yn("Should the field be stored") {
text_options = text_options.set_stored();
}
let is_indexed = prompt_yn("Should the field be indexed");
let indexing_options = if is_indexed {
if prompt_yn("Should the field be tokenized") {



if prompt_yn("Should the field be indexed") {
let mut text_indexing_options = TextFieldIndexing
::default()
.set_index_option(IndexRecordOption::Basic)
.set_tokenizer("en_stem");

if prompt_yn("Should the term be tokenized?") {
if prompt_yn("Should the term frequencies (per doc) be in the index") {
if prompt_yn("Should the term positions (per doc) be in the index") {
TextIndexingOptions::TokenizedWithFreqAndPosition
}
else {
TextIndexingOptions::TokenizedWithFreq
text_indexing_options = text_indexing_options.set_index_option(IndexRecordOption::WithFreqsAndPositions);
} else {
text_indexing_options = text_indexing_options.set_index_option(IndexRecordOption::WithFreqs);
}
}
else {
TextIndexingOptions::TokenizedNoFreq
}
}
else {
TextIndexingOptions::Untokenized
} else {
text_indexing_options = text_indexing_options.set_tokenizer("raw");
}

text_options = text_options.set_indexing_options(text_indexing_options);
}
else {
TextIndexingOptions::Unindexed
};
text_options = text_options.set_indexing_options(indexing_options);


schema_builder.add_text_field(field_name, text_options);
}



+ 13
- 4
src/commands/search.rs View File

@@ -8,6 +8,7 @@ use tantivy::query::QueryParser;
use tantivy::schema::Field;
use serde_json;
use tantivy::schema::FieldType;
use tantivy::tokenizer::*;

pub fn run_search_cli(matches: &ArgMatches) -> Result<(), String> {
let index_directory = PathBuf::from(matches.value_of("index").unwrap());
@@ -18,16 +19,24 @@ pub fn run_search_cli(matches: &ArgMatches) -> Result<(), String> {

fn run_search(directory: &Path, query: &str) -> tantivy::Result<()> {
let index = Index::open(directory)?;
index
.tokenizers()
.register("commoncrawl", SimpleTokenizer
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(AlphaNumOnlyFilter)
.filter(Stemmer::new())
);
let schema = index.schema();
let default_fields: Vec<Field> = schema
.fields()
.iter()
.enumerate()
.filter(
|&(_, ref field_entry)| {
|&(_, ref field_entry) | {
match *field_entry.field_type() {
FieldType::Str(ref text_field_options) => {
text_field_options.get_indexing_options().is_indexed()
text_field_options.get_indexing_options().is_some()
},
_ => false
}
@@ -35,13 +44,13 @@ fn run_search(directory: &Path, query: &str) -> tantivy::Result<()> {
)
.map(|(i, _)| Field(i as u32))
.collect();
let query_parser = QueryParser::new(schema.clone(), default_fields);
let query_parser = QueryParser::new(schema.clone(), default_fields, index.tokenizers().clone());
let query = query_parser.parse_query(query)?;
let searcher = index.searcher();
let weight = query.weight(&searcher)?;
let schema = index.schema();
for segment_reader in searcher.segment_readers() {
let mut scorer = try!(weight.scorer(segment_reader));
let mut scorer = weight.scorer(segment_reader)?;
while scorer.advance() {
let doc_id = scorer.doc();
let doc = segment_reader.doc(doc_id)?;


+ 24
- 13
src/commands/serve.rs View File

@@ -40,6 +40,8 @@ use tantivy::schema::FieldType;
use tantivy::schema::NamedFieldDocument;
use tantivy::schema::Schema;
use tantivy::TimerTree;
use tantivy::tokenizer::*;
use tantivy::DocAddress;
use urlencoded::UrlEncodedQuery;

pub fn run_serve_cli(matches: &ArgMatches) -> Result<(), String> {
@@ -62,6 +64,7 @@ struct Serp {
#[derive(Serialize)]
struct Hit {
doc: NamedFieldDocument,
id: u32,
}

struct IndexServer {
@@ -74,6 +77,13 @@ impl IndexServer {
fn load(path: &Path) -> IndexServer {
let index = Index::open(path).unwrap();
index.tokenizers()
.register("commoncrawl", SimpleTokenizer
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(AlphaNumOnlyFilter)
.filter(Stemmer::new())
);
let schema = index.schema();
let default_fields: Vec<Field> = schema
.fields()
@@ -83,7 +93,7 @@ impl IndexServer {
|&(_, ref field_entry)| {
match *field_entry.field_type() {
FieldType::Str(ref text_field_options) => {
text_field_options.get_indexing_options().is_indexed()
text_field_options.get_indexing_options().is_some()
},
_ => false
}
@@ -91,17 +101,18 @@ impl IndexServer {
)
.map(|(i, _)| Field(i as u32))
.collect();
let query_parser = QueryParser::new(schema.clone(), default_fields);
let query_parser = QueryParser::new(schema.clone(), default_fields, index.tokenizers().clone());
IndexServer {
index: index,
query_parser: query_parser,
schema: schema,
index,
query_parser,
schema,
}
}

fn create_hit(&self, doc: &Document) -> Hit {
fn create_hit(&self, doc: &Document, doc_address: &DocAddress) -> Hit {
Hit {
doc: self.schema.to_named_doc(&doc)
doc: self.schema.to_named_doc(&doc),
id: doc_address.doc(),
}
}
@@ -116,7 +127,7 @@ impl IndexServer {
let mut chained_collector = collector::chain()
.push(&mut top_collector)
.push(&mut count_collector);
try!(query.search(&searcher, &mut chained_collector));
query.search(&searcher, &mut chained_collector)?;
}
let hits: Vec<Hit> = {
let _fetching_timer = timer_tree.open("fetching docs");
@@ -124,14 +135,14 @@ impl IndexServer {
.iter()
.map(|doc_address| {
let doc: Document = searcher.doc(doc_address).unwrap();
self.create_hit(&doc)
self.create_hit(&doc, doc_address)
})
.collect()
};
Ok(Serp {
q: q,
q,
num_hits: count_collector.count(),
hits: hits,
hits,
timings: timer_tree,
})
}
@@ -163,9 +174,9 @@ fn search(req: &mut Request) -> IronResult<Response> {
.get("nhits")
.and_then(|nhits_str| usize::from_str(&nhits_str[0]).ok())
.unwrap_or(10);
let query = try!(qs_map
let query = qs_map
.get("q")
.ok_or_else(|| IronError::new(StringError(String::from("Parameter q is missing from the query")), status::BadRequest)))[0].clone();
.ok_or_else(|| IronError::new(StringError(String::from("Parameter q is missing from the query")), status::BadRequest))?[0].clone();
let serp = index_server.search(query, num_hits).unwrap();
let resp_json = serde_json::to_string_pretty(&serp).unwrap();
let content_type = "application/json".parse::<Mime>().unwrap();


Loading…
Cancel
Save