From fd35d407b03405f999e0b9531c5b0cea462b16eb Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 25 Jan 2018 12:58:17 +0900 Subject: [PATCH] Working with master branch tantivy. Includes commoncrawl analyzers --- Cargo.lock | 103 +++++++++++++++++++++++++---------------- Cargo.toml | 4 +- src/commands/bench.rs | 20 ++++---- src/commands/index.rs | 21 ++++----- src/commands/new.rs | 37 ++++++++------- src/commands/search.rs | 17 +++++-- src/commands/serve.rs | 37 +++++++++------ 7 files changed, 141 insertions(+), 98 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6fd3103..fce1d86 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,28 +1,3 @@ -[root] -name = "tantivy-cli" -version = "0.4.2" -dependencies = [ - "ansi_term 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)", - "bincode 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)", - "byteorder 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)", - "chan 0.1.19 (registry+https://github.com/rust-lang/crates.io-index)", - "clap 2.25.0 (registry+https://github.com/rust-lang/crates.io-index)", - "env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", - "futures 0.1.14 (registry+https://github.com/rust-lang/crates.io-index)", - "iron 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", - "log 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)", - "mount 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", - "persistent 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", - "serde 1.0.10 (registry+https://github.com/rust-lang/crates.io-index)", - "serde_derive 1.0.10 (registry+https://github.com/rust-lang/crates.io-index)", - "serde_json 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", - "staticfile 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", - "tantivy 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)", - "time 0.1.38 
(registry+https://github.com/rust-lang/crates.io-index)", - "urlencoded 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)", - "version 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)", -] - [[package]] name = "aho-corasick" version = "0.5.3" @@ -174,6 +149,11 @@ name = "byteorder" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "cc" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "cfg-if" version = "0.1.2" @@ -230,7 +210,7 @@ dependencies = [ [[package]] name = "crossbeam" -version = "0.2.10" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] @@ -295,13 +275,11 @@ dependencies = [ [[package]] name = "fst" -version = "0.1.38" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "byteorder 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)", + "byteorder 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "memmap 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", - "regex-syntax 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)", - "utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -460,6 +438,11 @@ dependencies = [ "libc 0.2.26 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "maplit" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "matches" version = "0.1.6" @@ -700,6 +683,11 @@ name = "regex-syntax" version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "rust-stemmers" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "rustc-demangle" version = "0.1.4" @@ -847,30 +835,29 @@ dependencies = [ [[package]] name = "tantivy" -version = "0.4.2" -source = 
"registry+https://github.com/rust-lang/crates.io-index" +version = "0.5.0-dev" dependencies = [ "atomicwrites 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", "bincode 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)", "bit-set 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", "byteorder 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "cc 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)", "chan 0.1.19 (registry+https://github.com/rust-lang/crates.io-index)", "combine 2.4.0 (registry+https://github.com/rust-lang/crates.io-index)", - "crossbeam 0.2.10 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "error-chain 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)", - "fst 0.1.38 (registry+https://github.com/rust-lang/crates.io-index)", + "fst 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)", "futures 0.1.14 (registry+https://github.com/rust-lang/crates.io-index)", "futures-cpupool 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", - "gcc 0.3.51 (registry+https://github.com/rust-lang/crates.io-index)", "itertools 0.5.10 (registry+https://github.com/rust-lang/crates.io-index)", "lazy_static 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)", "libc 0.2.26 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)", "lz4 1.21.1 (registry+https://github.com/rust-lang/crates.io-index)", - "memmap 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", "num_cpus 1.6.2 (registry+https://github.com/rust-lang/crates.io-index)", "owning_ref 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", "regex 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", + "rust-stemmers 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.10 
(registry+https://github.com/rust-lang/crates.io-index)", "serde_derive 1.0.10 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", @@ -878,11 +865,37 @@ dependencies = [ "tempdir 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", "tempfile 2.1.6 (registry+https://github.com/rust-lang/crates.io-index)", "time 0.1.38 (registry+https://github.com/rust-lang/crates.io-index)", + "tinysegmenter 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "uuid 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)", "version 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)", "winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "tantivy-cli" +version = "0.5.0-alpha" +dependencies = [ + "ansi_term 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)", + "bincode 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)", + "byteorder 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)", + "chan 0.1.19 (registry+https://github.com/rust-lang/crates.io-index)", + "clap 2.25.0 (registry+https://github.com/rust-lang/crates.io-index)", + "env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", + "futures 0.1.14 (registry+https://github.com/rust-lang/crates.io-index)", + "iron 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)", + "mount 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", + "persistent 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.10 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_derive 1.0.10 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_json 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", + "staticfile 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", + "tantivy 
0.5.0-dev", + "time 0.1.38 (registry+https://github.com/rust-lang/crates.io-index)", + "urlencoded 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)", + "version 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "tempdir" version = "0.3.5" @@ -958,6 +971,15 @@ dependencies = [ "winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "tinysegmenter" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "lazy_static 0.1.16 (registry+https://github.com/rust-lang/crates.io-index)", + "maplit 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "traitobject" version = "0.0.1" @@ -1124,13 +1146,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum byteorder 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "96c8b41881888cc08af32d47ac4edd52bc7fa27fef774be47a92443756451304" "checksum byteorder 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "0fc10e8cc6b2580fda3f36eb6dc5316657f812a3df879a44a66fc9f0fdbc4855" "checksum byteorder 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ff81738b726f5d099632ceaffe7fb65b90212e8dce59d518729e7e8634032d3d" +"checksum cc 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a9b13a57efd6b30ecd6598ebdb302cca617930b5470647570468a65d12ef9719" "checksum cfg-if 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "d4c819a1287eb618df47cc647173c5c4c66ba19d888a6e50d605672aed3140de" "checksum chan 0.1.19 (registry+https://github.com/rust-lang/crates.io-index)" = "f93bfe971116428a9066c1c3c69a09ae3ef69432f8418be28ab50f96783e6a50" "checksum clap 2.25.0 (registry+https://github.com/rust-lang/crates.io-index)" = "867a885995b4184be051b70a592d4d70e32d7a188db6e8dff626af286a962771" "checksum combine 2.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = 
"9d48b4a5b824c441d0a5194e2aa63491d1c9e5aa2acc5f6e4924f990715afa2a" "checksum conduit-mime-types 0.7.3 (registry+https://github.com/rust-lang/crates.io-index)" = "95ca30253581af809925ef68c2641cc140d6183f43e12e0af4992d53768bd7b8" "checksum cookie 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "0e3d6405328b6edb412158b3b7710e2634e23f3614b9bb1c412df7952489a626" -"checksum crossbeam 0.2.10 (registry+https://github.com/rust-lang/crates.io-index)" = "0c5ea215664ca264da8a9d9c3be80d2eaf30923c259d03e870388eb927508f97" +"checksum crossbeam 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8837ab96533202c5b610ed44bc7f4183e7957c1c8f56e8cc78bb098593c8ba0a" "checksum dbghelp-sys 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "97590ba53bcb8ac28279161ca943a924d1fd4a8fb3fa63302591647c4fc5b850" "checksum dtoa 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "0dd841b58510c9618291ffa448da2e4e0f699d984d436122372f446dae62263d" "checksum dtoa 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "80c8b71fd71146990a9742fc06dcbbde19161a267e0ad4e572c35162f4578c90" @@ -1139,7 +1162,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum error 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "a6e606f14042bb87cc02ef6a14db6c90ab92ed6f62d87e69377bc759fd7987cc" "checksum error-chain 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "6930e04918388a9a2e41d518c25cf679ccafe26733fb4127dbf21993f2575d46" "checksum fs2 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "bcd414e5a1a979b931bb92f41b7a54106d3f6d2e6c253e9ce943b7cd468251ef" -"checksum fst 0.1.38 (registry+https://github.com/rust-lang/crates.io-index)" = "4667468a5e6f0eea9cc30ebf1cce752cb831974e319d7fff312aad85652c1596" +"checksum fst 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "11e21bdd626be09f2bd66b44dbb724538176aa3549f3109208db35538dd2699f" "checksum futures 0.1.14 
(registry+https://github.com/rust-lang/crates.io-index)" = "4b63a4792d4f8f686defe3b39b92127fea6344de5d38202b2ee5a11bbbf29d6a" "checksum futures-cpupool 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "a283c84501e92cade5ea673a2a7ca44f71f209ccdd302a3e0896f50083d2c5ff" "checksum gcc 0.3.51 (registry+https://github.com/rust-lang/crates.io-index)" = "120d07f202dcc3f72859422563522b66fe6463a4c513df062874daad05f85f0a" @@ -1160,6 +1183,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum log 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "880f77541efa6e5cc74e76910c9884d9859683118839d6a1dc3b11e63512565b" "checksum lz4 1.21.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2f7f1c5ea96c6b31551477a743d75c483a6a039319dd33e4e31c27df24361713" "checksum lz4-sys 1.7.5 (registry+https://github.com/rust-lang/crates.io-index)" = "39e8a451abc4169b50dddbc9b34f93ecabd447330d99d27d518a3f8d4e209e9b" +"checksum maplit 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "22593015b8df7747861c69c28acd32589fb96c1686369f3b661d12e409d4cf65" "checksum matches 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "100aabe6b8ff4e4a7e32c1c13523379802df0772b82466207ac25b013f193376" "checksum memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)" = "d8b629fb514376c675b98c1421e80b151d3817ac42d7c667717d282761418d20" "checksum memchr 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "1dbccc0e46f1ea47b9f17e6d67c5a96bd27030519c519c9c91327e31275a47b4" @@ -1189,6 +1213,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum regex 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "1731164734096285ec2a5ec7fea5248ae2f5485b3feeb0115af4fda2183b2d1b" "checksum regex-syntax 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "f9ec002c35e86791825ed294b50008eea9ddfc8def4420124fbc6b08db834957" "checksum regex-syntax 0.4.1 
(registry+https://github.com/rust-lang/crates.io-index)" = "ad890a5eef7953f55427c50575c680c42841653abd2b028b68cd223d157f62db" +"checksum rust-stemmers 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8398e39ef1740238f87fcc4171fccc2231ba7ef1ecd64075d77feb0041927fc7" "checksum rustc-demangle 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "3058a43ada2c2d0b92b3ae38007a2d0fa5e9db971be260e0171408a4ff471c95" "checksum rustc-serialize 0.3.24 (registry+https://github.com/rust-lang/crates.io-index)" = "dcf128d1287d2ea9d80910b5f1120d0b8eede3fbf1abe91c40d39ea7d51e6fda" "checksum rustc_version 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "c5f5376ea5e30ce23c03eb77cbe4962b988deead10910c372b226388b594c084" @@ -1208,7 +1233,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum strsim 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b4d15c810519a91cf877e7e36e63fe068815c678181439f2f29e2562147c3694" "checksum syn 0.11.11 (registry+https://github.com/rust-lang/crates.io-index)" = "d3b891b9015c88c576343b9b3e41c2c11a51c219ef067b264bd9c8aa9b441dad" "checksum synom 0.11.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a393066ed9010ebaed60b9eafa373d4b1baac186dd7e008555b0f702b51945b6" -"checksum tantivy 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "b345d470c3523df9dc2e85444e80165016894ef441a45e2aca86bfcaa1335cb0" "checksum tempdir 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "87974a6f5c1dfb344d733055601650059a3363de2a6104819293baff662132d6" "checksum tempfile 2.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "5b92290d7f1ce2d221405d5c78b9c568c9f1debb314aa92a513cd99db709f931" "checksum term_size 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e2b6b55df3198cc93372e85dd2ed817f0e38ce8cc0f22eb32391bfad9c4bf209" @@ -1217,6 +1241,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum 
thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "8576dbbfcaef9641452d5cf0df9b0e7eeab7694956dd33bb61515fb8f18cfdd5" "checksum thread_local 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "1697c4b57aeeb7a536b647165a2825faddffb1d3bad386d507709bd51a90bb14" "checksum time 0.1.38 (registry+https://github.com/rust-lang/crates.io-index)" = "d5d788d3aa77bc0ef3e9621256885555368b47bd495c13dd2e7413c89f845520" +"checksum tinysegmenter 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7e767ff68150da3d23c88482da07abd6532e2e928093b80e79dc4818119bbc36" "checksum traitobject 0.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "07eaeb7689bb7fca7ce15628319635758eda769fed481ecfe6686ddef2600616" "checksum traitobject 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "efd1f82c56340fdf16f2a953d7bda4f8fdffba13d93b00844c25572110b26079" "checksum typeable 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "1410f6f91f21d1612654e7cc69193b0334f909dcf2c790c4826254fbb86f8887" diff --git a/Cargo.toml b/Cargo.toml index a5e98a3..6903859 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tantivy-cli" -version = "0.4.3" +version = "0.5.0-alpha" authors = ["Paul Masurel "] description = """Command line interface for Tantivy, a search engine library.""" @@ -31,7 +31,7 @@ log = "0.3" futures = "0.1" env_logger = "0.3" version = "2" -tantivy = "0.4.3" +tantivy = {path="../tantivy"} [[bin]] name = "tantivy" diff --git a/src/commands/bench.rs b/src/commands/bench.rs index 4f348d4..1ce56d2 100644 --- a/src/commands/bench.rs +++ b/src/commands/bench.rs @@ -17,7 +17,7 @@ use std::path::PathBuf; pub fn run_bench_cli(matches: &ArgMatches) -> Result<(), String> { let index_path = PathBuf::from(matches.value_of("index").unwrap()); let queries_path = PathBuf::from(matches.value_of("queries").unwrap()); // the unwrap is safe as long as it is comming from the main cli. 
- let num_repeat = try!(value_t!(matches, "num_repeat", usize).map_err(|e|format!("Failed to read num_repeat argument as an integer. {:?}", e))); + let num_repeat = value_t!(matches, "num_repeat", usize).map_err(|e| format!("Failed to read num_repeat argument as an integer. {:?}", e))?; run_bench(&index_path, &queries_path, num_repeat).map_err(From::from) } @@ -34,13 +34,11 @@ fn extract_search_fields(schema: &Schema) -> Vec { } fn read_query_file(query_path: &Path) -> io::Result> { - let query_file: File = try!(File::open(&query_path)); + let query_file: File = File::open(&query_path)?; let file = BufReader::new(&query_file); let mut queries = Vec::new(); for line_res in file.lines() { - let line = try!(line_res); - let query = String::from(line.trim()); - queries.push(query); + queries.push(line_res?); } Ok(queries) } @@ -54,11 +52,11 @@ fn run_bench(index_path: &Path, println!("Query : {:?}", index_path); println!("-------------------------------\n\n\n"); - let index = try!(Index::open(index_path).map_err(|e| format!("Failed to open index.\n{:?}", e))); + let index = Index::open(index_path).map_err(|e| format!("Failed to open index.\n{:?}", e))?; let searcher = index.searcher(); let default_search_fields: Vec = extract_search_fields(&index.schema()); - let queries = try!(read_query_file(query_filepath).map_err(|e| format!("Failed reading the query file: {}", e))); - let query_parser = QueryParser::new(index.schema(), default_search_fields); + let queries = read_query_file(query_filepath).map_err(|e| format!("Failed reading the query file: {}", e))?; + let query_parser = QueryParser::new(index.schema(), default_search_fields, index.tokenizers().clone()); println!("SEARCH\n"); println!("{}\t{}\t{}\t{}", "query", "num_terms", "num hits", "time in microsecs"); @@ -71,7 +69,8 @@ fn run_bench(index_path: &Path, let timing; { let mut collector = chain().push(&mut top_collector).push(&mut count_collector); - timing = try!(query.search(&searcher, &mut 
collector).map_err(|e| format!("Failed while searching query {:?}.\n\n{:?}", query_txt, e))); + timing = query.search(&searcher, &mut collector) + .map_err(|e| format!("Failed while searching query {:?}.\n\n{:?}", query_txt, e))?; } println!("{}\t{}\t{}", query_txt, count_collector.count(), timing.total_time()); } @@ -84,7 +83,8 @@ fn run_bench(index_path: &Path, for query_txt in &queries { let query = query_parser.parse_query(&query_txt).unwrap(); let mut top_collector = TopCollector::with_limit(10); - try!(query.search(&searcher, &mut top_collector).map_err(|e| format!("Failed while retrieving document for query {:?}.\n{:?}", query, e))); + query.search(&searcher, &mut top_collector) + .map_err(|e| format!("Failed while retrieving document for query {:?}.\n{:?}", query, e))?; let mut timer = TimerTree::default(); { let _scoped_timer_ = timer.open("total"); diff --git a/src/commands/index.rs b/src/commands/index.rs index 9e81393..161cd17 100644 --- a/src/commands/index.rs +++ b/src/commands/index.rs @@ -22,11 +22,11 @@ pub fn run_index_cli(argmatch: &ArgMatches) -> Result<(), String> { .map(|path| DocumentSource::FromFile(PathBuf::from(path))) .unwrap_or(DocumentSource::FromPipe); let no_merge = argmatch.is_present("nomerge"); - let mut num_threads = try!(value_t!(argmatch, "num_threads", usize).map_err(|_|format!("Failed to read num_threads argument as an integer."))); + let mut num_threads = value_t!(argmatch, "num_threads", usize).map_err(|_| format!("Failed to read num_threads argument as an integer."))?; if num_threads == 0 { num_threads = 1; } - let buffer_size = try!(value_t!(argmatch, "memory_size", usize).map_err(|_|format!("Failed to read the buffer size argument as an integer."))); + let buffer_size = value_t!(argmatch, "memory_size", usize).map_err(|_| format!("Failed to read the buffer size argument as an integer."))?; let buffer_size_per_thread = buffer_size / num_threads; run_index(index_directory, document_source, buffer_size_per_thread, 
num_threads, no_merge).map_err(|e| format!("Indexing failed : {:?}", e)) } @@ -37,7 +37,7 @@ fn run_index(directory: PathBuf, num_threads: usize, no_merge: bool) -> tantivy::Result<()> { - let index = try!(Index::open(&directory)); + let index = Index::open(&directory)?; let schema = index.schema(); let (line_sender, line_receiver) = chan::sync(10_000); let (doc_sender, doc_receiver) = chan::sync(10_000); @@ -71,14 +71,11 @@ fn run_index(directory: PathBuf, } drop(doc_sender); - let mut index_writer = try!( - if num_threads > 0 { - index.writer_with_num_threads(num_threads, buffer_size_per_thread) - } - else { - index.writer(buffer_size_per_thread) - } - ); + let mut index_writer = if num_threads > 0 { + index.writer_with_num_threads(num_threads, buffer_size_per_thread) + } else { + index.writer(buffer_size_per_thread) + }?; if no_merge { index_writer.set_merge_policy(Box::new(NoMergePolicy)); @@ -145,7 +142,7 @@ impl DocumentSource { BufReader::new(Box::new(io::stdin())) } &DocumentSource::FromFile(ref filepath) => { - let read_file = try!(File::open(&filepath)); + let read_file = File::open(&filepath)?; BufReader::new(Box::new(read_file)) } }) diff --git a/src/commands/new.rs b/src/commands/new.rs index 7f84183..e6ca82a 100644 --- a/src/commands/new.rs +++ b/src/commands/new.rs @@ -8,7 +8,6 @@ use std::io; use ansi_term::Style; use ansi_term::Colour::{Red, Blue, Green}; use std::io::Write; -use std::ascii::AsciiExt; use serde_json; @@ -77,29 +76,31 @@ fn ask_add_field_text(field_name: &str, schema_builder: &mut SchemaBuilder) { if prompt_yn("Should the field be stored") { text_options = text_options.set_stored(); } - let is_indexed = prompt_yn("Should the field be indexed"); - let indexing_options = if is_indexed { - if prompt_yn("Should the field be tokenized") { + + + + if prompt_yn("Should the field be indexed") { + let mut text_indexing_options = TextFieldIndexing + ::default() + .set_index_option(IndexRecordOption::Basic) + .set_tokenizer("en_stem"); + + if 
prompt_yn("Should the term be tokenized?") { if prompt_yn("Should the term frequencies (per doc) be in the index") { if prompt_yn("Should the term positions (per doc) be in the index") { - TextIndexingOptions::TokenizedWithFreqAndPosition - } - else { - TextIndexingOptions::TokenizedWithFreq + text_indexing_options = text_indexing_options.set_index_option(IndexRecordOption::WithFreqsAndPositions); + } else { + text_indexing_options = text_indexing_options.set_index_option(IndexRecordOption::WithFreqs); } } - else { - TextIndexingOptions::TokenizedNoFreq - } - } - else { - TextIndexingOptions::Untokenized + } else { + text_indexing_options = text_indexing_options.set_tokenizer("raw"); } + + text_options = text_options.set_indexing_options(text_indexing_options); } - else { - TextIndexingOptions::Unindexed - }; - text_options = text_options.set_indexing_options(indexing_options); + + schema_builder.add_text_field(field_name, text_options); } diff --git a/src/commands/search.rs b/src/commands/search.rs index 882c15d..0397e5d 100644 --- a/src/commands/search.rs +++ b/src/commands/search.rs @@ -8,6 +8,7 @@ use tantivy::query::QueryParser; use tantivy::schema::Field; use serde_json; use tantivy::schema::FieldType; +use tantivy::tokenizer::*; pub fn run_search_cli(matches: &ArgMatches) -> Result<(), String> { let index_directory = PathBuf::from(matches.value_of("index").unwrap()); @@ -18,16 +19,24 @@ pub fn run_search_cli(matches: &ArgMatches) -> Result<(), String> { fn run_search(directory: &Path, query: &str) -> tantivy::Result<()> { let index = Index::open(directory)?; + index + .tokenizers() + .register("commoncrawl", SimpleTokenizer + .filter(RemoveLongFilter::limit(40)) + .filter(LowerCaser) + .filter(AlphaNumOnlyFilter) + .filter(Stemmer::new()) + ); let schema = index.schema(); let default_fields: Vec = schema .fields() .iter() .enumerate() .filter( - |&(_, ref field_entry)| { + |&(_, ref field_entry) | { match *field_entry.field_type() { FieldType::Str(ref 
text_field_options) => { - text_field_options.get_indexing_options().is_indexed() + text_field_options.get_indexing_options().is_some() }, _ => false } @@ -35,13 +44,13 @@ fn run_search(directory: &Path, query: &str) -> tantivy::Result<()> { ) .map(|(i, _)| Field(i as u32)) .collect(); - let query_parser = QueryParser::new(schema.clone(), default_fields); + let query_parser = QueryParser::new(schema.clone(), default_fields, index.tokenizers().clone()); let query = query_parser.parse_query(query)?; let searcher = index.searcher(); let weight = query.weight(&searcher)?; let schema = index.schema(); for segment_reader in searcher.segment_readers() { - let mut scorer = try!(weight.scorer(segment_reader)); + let mut scorer = weight.scorer(segment_reader)?; while scorer.advance() { let doc_id = scorer.doc(); let doc = segment_reader.doc(doc_id)?; diff --git a/src/commands/serve.rs b/src/commands/serve.rs index d9162ff..a7484b3 100644 --- a/src/commands/serve.rs +++ b/src/commands/serve.rs @@ -40,6 +40,8 @@ use tantivy::schema::FieldType; use tantivy::schema::NamedFieldDocument; use tantivy::schema::Schema; use tantivy::TimerTree; +use tantivy::tokenizer::*; +use tantivy::DocAddress; use urlencoded::UrlEncodedQuery; pub fn run_serve_cli(matches: &ArgMatches) -> Result<(), String> { @@ -62,6 +64,7 @@ struct Serp { #[derive(Serialize)] struct Hit { doc: NamedFieldDocument, + id: u32, } struct IndexServer { @@ -74,6 +77,13 @@ impl IndexServer { fn load(path: &Path) -> IndexServer { let index = Index::open(path).unwrap(); + index.tokenizers() + .register("commoncrawl", SimpleTokenizer + .filter(RemoveLongFilter::limit(40)) + .filter(LowerCaser) + .filter(AlphaNumOnlyFilter) + .filter(Stemmer::new()) + ); let schema = index.schema(); let default_fields: Vec = schema .fields() @@ -83,7 +93,7 @@ impl IndexServer { |&(_, ref field_entry)| { match *field_entry.field_type() { FieldType::Str(ref text_field_options) => { - text_field_options.get_indexing_options().is_indexed() + 
text_field_options.get_indexing_options().is_some() }, _ => false } @@ -91,17 +101,18 @@ impl IndexServer { ) .map(|(i, _)| Field(i as u32)) .collect(); - let query_parser = QueryParser::new(schema.clone(), default_fields); + let query_parser = QueryParser::new(schema.clone(), default_fields, index.tokenizers().clone()); IndexServer { - index: index, - query_parser: query_parser, - schema: schema, + index, + query_parser, + schema, } } - fn create_hit(&self, doc: &Document) -> Hit { + fn create_hit(&self, doc: &Document, doc_address: &DocAddress) -> Hit { Hit { - doc: self.schema.to_named_doc(&doc) + doc: self.schema.to_named_doc(&doc), + id: doc_address.doc(), } } @@ -116,7 +127,7 @@ impl IndexServer { let mut chained_collector = collector::chain() .push(&mut top_collector) .push(&mut count_collector); - try!(query.search(&searcher, &mut chained_collector)); + query.search(&searcher, &mut chained_collector)?; } let hits: Vec = { let _fetching_timer = timer_tree.open("fetching docs"); @@ -124,14 +135,14 @@ impl IndexServer { .iter() .map(|doc_address| { let doc: Document = searcher.doc(doc_address).unwrap(); - self.create_hit(&doc) + self.create_hit(&doc, doc_address) }) .collect() }; Ok(Serp { - q: q, + q, num_hits: count_collector.count(), - hits: hits, + hits, timings: timer_tree, }) } @@ -163,9 +174,9 @@ fn search(req: &mut Request) -> IronResult { .get("nhits") .and_then(|nhits_str| usize::from_str(&nhits_str[0]).ok()) .unwrap_or(10); - let query = try!(qs_map + let query = qs_map .get("q") - .ok_or_else(|| IronError::new(StringError(String::from("Parameter q is missing from the query")), status::BadRequest)))[0].clone(); + .ok_or_else(|| IronError::new(StringError(String::from("Parameter q is missing from the query")), status::BadRequest))?[0].clone(); let serp = index_server.search(query, num_hits).unwrap(); let resp_json = serde_json::to_string_pretty(&serp).unwrap(); let content_type = "application/json".parse::().unwrap();