diff --git a/Cargo.lock b/Cargo.lock index e25e55d..0847c9d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -82,16 +82,6 @@ dependencies = [ "serde 0.6.15 (registry+https://github.com/rust-lang/crates.io-index)", ] -[[package]] -name = "bincode" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "byteorder 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)", - "num-traits 0.1.41 (registry+https://github.com/rust-lang/crates.io-index)", - "serde 1.0.27 (registry+https://github.com/rust-lang/crates.io-index)", -] - [[package]] name = "bit-set" version = "0.4.0" @@ -207,7 +197,12 @@ dependencies = [ [[package]] name = "crossbeam" -version = "0.2.12" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "downcast" +version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] @@ -263,13 +258,11 @@ dependencies = [ [[package]] name = "fst" -version = "0.1.38" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "byteorder 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)", + "byteorder 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)", "memmap 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", - "regex-syntax 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)", - "utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -457,6 +450,11 @@ dependencies = [ "libc 0.2.35 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "maplit" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "matches" version = "0.1.6" @@ -704,6 +702,11 @@ name = "regex-syntax" version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "rust-stemmers" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "rustc-demangle" version = "0.1.5" @@ -838,30 +841,31 @@ dependencies = [ [[package]] name = "tantivy" -version = "0.4.4" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "atomicwrites 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", - "bincode 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)", "bit-set 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", "byteorder 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)", + "cc 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", "chan 0.1.19 (registry+https://github.com/rust-lang/crates.io-index)", "combine 2.5.2 (registry+https://github.com/rust-lang/crates.io-index)", - "crossbeam 0.2.12 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)", + "downcast 0.9.2 (registry+https://github.com/rust-lang/crates.io-index)", "error-chain 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)", - "fst 0.1.38 (registry+https://github.com/rust-lang/crates.io-index)", + "fst 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)", "futures 0.1.17 (registry+https://github.com/rust-lang/crates.io-index)", "futures-cpupool 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)", - "gcc 0.3.54 (registry+https://github.com/rust-lang/crates.io-index)", "itertools 0.5.10 (registry+https://github.com/rust-lang/crates.io-index)", "lazy_static 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", "libc 0.2.35 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)", "lz4 1.22.0 (registry+https://github.com/rust-lang/crates.io-index)", - "memmap 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", + "matches 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", "num_cpus 1.8.0 (registry+https://github.com/rust-lang/crates.io-index)", "owning_ref 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", "regex 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)", + "rust-stemmers 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.27 (registry+https://github.com/rust-lang/crates.io-index)", "serde_derive 1.0.27 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.9 (registry+https://github.com/rust-lang/crates.io-index)", @@ -869,14 +873,14 @@ dependencies = [ "tempdir 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", "tempfile 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)", "time 0.1.39 (registry+https://github.com/rust-lang/crates.io-index)", + "tinysegmenter 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "uuid 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)", - "version 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)", "winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "tantivy-cli" -version = "0.4.4" +version = "0.5.0" dependencies = [ "ansi_term 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)", "bincode 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)", @@ -893,7 +897,7 @@ dependencies = [ "serde_derive 1.0.27 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.9 (registry+https://github.com/rust-lang/crates.io-index)", "staticfile 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", - "tantivy 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)", + "tantivy 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", "time 0.1.39 (registry+https://github.com/rust-lang/crates.io-index)", "urlencoded 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)", "version 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)", @@ -973,6 +977,15 @@ dependencies = [ "winapi 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "tinysegmenter" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "lazy_static 0.1.16 (registry+https://github.com/rust-lang/crates.io-index)", + "maplit 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "traitobject" version = "0.0.1" @@ -1143,7 +1156,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum backtrace 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "ebbbf59b1c43eefa8c3ede390fcc36820b4999f7914104015be25025e0d62af2" "checksum backtrace-sys 0.1.16 (registry+https://github.com/rust-lang/crates.io-index)" = "44585761d6161b0f57afc49482ab6bd067e4edef48c12a152c237eb0203f7661" "checksum bincode 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "60f89d68caf4f2e8a94efd192a2b8393869e72336dea4e0fe077cc6eb5f2057e" -"checksum bincode 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e103c8b299b28a9c6990458b7013dc4a8356a9b854c51b9883241f5866fac36e" "checksum bit-set 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d9bf6104718e80d7b26a68fdbacff3481cfc05df670821affc7e9cbc1884400c" "checksum bit-vec 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" = "02b4ff8b16e6076c3e14220b39fbc1fabb6737522281a388998046859400895f" "checksum bitflags 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "4f67931368edf3a9a51d29886d245f1c3db2f1ef0dcc9e35ff70341b78c10d23" @@ -1160,7 +1172,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum combine 2.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "1645a65a99c7c8d345761f4b75a6ffe5be3b3b27a93ee731fccc5050ba6be97c" "checksum conduit-mime-types 0.7.3 (registry+https://github.com/rust-lang/crates.io-index)" = "95ca30253581af809925ef68c2641cc140d6183f43e12e0af4992d53768bd7b8" "checksum cookie 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "0e3d6405328b6edb412158b3b7710e2634e23f3614b9bb1c412df7952489a626" -"checksum crossbeam 0.2.12 (registry+https://github.com/rust-lang/crates.io-index)" = "bd66663db5a988098a89599d4857919b3acf7f61402e61365acfd3919857b9be" +"checksum crossbeam 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "24ce9782d4d5c53674646a6a4c1863a21a8fc0cb649b3c94dfc16e45071dea19" +"checksum downcast 0.9.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6c6fe31318b6ef21166c8e839e680238eb16f875849d597544eead7ec882eed3" "checksum dtoa 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "0dd841b58510c9618291ffa448da2e4e0f699d984d436122372f446dae62263d" "checksum dtoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "09c3753c3db574d215cba4ea76018483895d7bff25a31b49ba45db21c48e50ab" "checksum either 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "740178ddf48b1a9e878e6d6509a1442a2d42fd2928aae8e7a6f8a36fb01981b3" @@ -1168,7 +1181,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum error 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "a6e606f14042bb87cc02ef6a14db6c90ab92ed6f62d87e69377bc759fd7987cc" "checksum error-chain 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "6930e04918388a9a2e41d518c25cf679ccafe26733fb4127dbf21993f2575d46" "checksum fs2 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "bcd414e5a1a979b931bb92f41b7a54106d3f6d2e6c253e9ce943b7cd468251ef" -"checksum fst 0.1.38 (registry+https://github.com/rust-lang/crates.io-index)" = "4667468a5e6f0eea9cc30ebf1cce752cb831974e319d7fff312aad85652c1596" +"checksum fst 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "11e21bdd626be09f2bd66b44dbb724538176aa3549f3109208db35538dd2699f" "checksum fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82" "checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" "checksum futures 0.1.17 (registry+https://github.com/rust-lang/crates.io-index)" = "118b49cac82e04121117cbd3121ede3147e885627d82c4546b87c702debb90c1" @@ -1193,6 +1206,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum log 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "89f010e843f2b1a31dbd316b3b8d443758bc634bed37aabade59c686d644e0a2" "checksum lz4 1.22.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fe55d2ebbc2e4fc987e6fbfc13f416d97b06d06e50bc1124d613aa790842f80c" "checksum lz4-sys 1.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a59044c3ba3994f3d2aa2270ddd6c5947922219501e67efde5604d36aad462b5" +"checksum maplit 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "22593015b8df7747861c69c28acd32589fb96c1686369f3b661d12e409d4cf65" "checksum matches 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "100aabe6b8ff4e4a7e32c1c13523379802df0772b82466207ac25b013f193376" "checksum memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)" = "d8b629fb514376c675b98c1421e80b151d3817ac42d7c667717d282761418d20" "checksum memchr 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "796fba70e76612589ed2ce7f45282f5af869e0fdd7cc6199fa1aa1f1d591ba9d" @@ -1223,6 +1237,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum regex 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "744554e01ccbd98fff8c457c3b092cd67af62a555a43bfe97ae8a0451f7799fa" "checksum regex-syntax 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "f9ec002c35e86791825ed294b50008eea9ddfc8def4420124fbc6b08db834957" "checksum regex-syntax 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "8e931c58b93d86f080c734bfd2bce7dd0079ae2331235818133c8be7f422e20e" +"checksum rust-stemmers 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8398e39ef1740238f87fcc4171fccc2231ba7ef1ecd64075d77feb0041927fc7" "checksum rustc-demangle 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "aee45432acc62f7b9a108cc054142dac51f979e69e71ddce7d6fc7adf29e817e" "checksum rustc-serialize 0.3.24 (registry+https://github.com/rust-lang/crates.io-index)" = "dcf128d1287d2ea9d80910b5f1120d0b8eede3fbf1abe91c40d39ea7d51e6fda" "checksum sequence_trie 0.0.13 (registry+https://github.com/rust-lang/crates.io-index)" = "d5b4eb0f7d1ff9b9666d8b8ff543f3705dd464025269a5b0e1988ffa60ca1be8" @@ -1240,7 +1255,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum strsim 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b4d15c810519a91cf877e7e36e63fe068815c678181439f2f29e2562147c3694" "checksum syn 0.11.11 (registry+https://github.com/rust-lang/crates.io-index)" = "d3b891b9015c88c576343b9b3e41c2c11a51c219ef067b264bd9c8aa9b441dad" "checksum synom 0.11.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a393066ed9010ebaed60b9eafa373d4b1baac186dd7e008555b0f702b51945b6" -"checksum tantivy 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" = "dc901afb44c7e1f163383c2ae8af25d1e212bbcb0bcc95f5485f0e62749d17c5" +"checksum tantivy 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "dcaf44ed828eea2da561b2bb5bb490e9dc68ad375609330e512818c143cc9d7c" "checksum tempdir 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "87974a6f5c1dfb344d733055601650059a3363de2a6104819293baff662132d6" "checksum tempfile 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "11ce2fe9db64b842314052e2421ac61a73ce41b898dc8e3750398b219c5fc1e0" "checksum termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "689a3bdfaab439fd92bc87df5c4c78417d3cbe537487274e9b0b2dce76e92096" @@ -1249,6 +1264,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "8576dbbfcaef9641452d5cf0df9b0e7eeab7694956dd33bb61515fb8f18cfdd5" "checksum thread_local 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "279ef31c19ededf577bfd12dfae728040a21f635b06a24cd670ff510edd38963" "checksum time 0.1.39 (registry+https://github.com/rust-lang/crates.io-index)" = "a15375f1df02096fb3317256ce2cee6a1f42fc84ea5ad5fc8c421cfe40c73098" +"checksum tinysegmenter 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7e767ff68150da3d23c88482da07abd6532e2e928093b80e79dc4818119bbc36" "checksum traitobject 0.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "07eaeb7689bb7fca7ce15628319635758eda769fed481ecfe6686ddef2600616" "checksum traitobject 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "efd1f82c56340fdf16f2a953d7bda4f8fdffba13d93b00844c25572110b26079" "checksum typeable 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "1410f6f91f21d1612654e7cc69193b0334f909dcf2c790c4826254fbb86f8887" diff --git a/Cargo.toml b/Cargo.toml index 64d38f0..04c93b4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tantivy-cli" -version = "0.4.5" +version = "0.5.0" authors = ["Paul Masurel "] description = """Command line interface for Tantivy, a search engine library.""" @@ -31,7 +31,7 @@ log = "0.3" futures = "0.1" env_logger = "0.3" version = "2" -tantivy = "0.4.4" +tantivy = "0.5.0" [[bin]] name = "tantivy" @@ -44,6 +44,7 @@ debug = false debug-assertions = false lto = true + [features] default = ["tantivy/simdcompression"] diff --git a/src/commands/bench.rs b/src/commands/bench.rs index 4f348d4..1ce56d2 100644 --- a/src/commands/bench.rs +++ b/src/commands/bench.rs @@ -17,7 +17,7 @@ use std::path::PathBuf; pub fn run_bench_cli(matches: &ArgMatches) -> Result<(), String> { let index_path = PathBuf::from(matches.value_of("index").unwrap()); let queries_path = PathBuf::from(matches.value_of("queries").unwrap()); // the unwrap is safe as long as it is comming from the main cli. - let num_repeat = try!(value_t!(matches, "num_repeat", usize).map_err(|e|format!("Failed to read num_repeat argument as an integer. {:?}", e))); + let num_repeat = value_t!(matches, "num_repeat", usize).map_err(|e| format!("Failed to read num_repeat argument as an integer. {:?}", e))?; run_bench(&index_path, &queries_path, num_repeat).map_err(From::from) } @@ -34,13 +34,11 @@ fn extract_search_fields(schema: &Schema) -> Vec { } fn read_query_file(query_path: &Path) -> io::Result> { - let query_file: File = try!(File::open(&query_path)); + let query_file: File = File::open(&query_path)?; let file = BufReader::new(&query_file); let mut queries = Vec::new(); for line_res in file.lines() { - let line = try!(line_res); - let query = String::from(line.trim()); - queries.push(query); + queries.push(line_res?); } Ok(queries) } @@ -54,11 +52,11 @@ fn run_bench(index_path: &Path, println!("Query : {:?}", index_path); println!("-------------------------------\n\n\n"); - let index = try!(Index::open(index_path).map_err(|e| format!("Failed to open index.\n{:?}", e))); + let index = Index::open(index_path).map_err(|e| format!("Failed to open index.\n{:?}", e))?; let searcher = index.searcher(); let default_search_fields: Vec = extract_search_fields(&index.schema()); - let queries = try!(read_query_file(query_filepath).map_err(|e| format!("Failed reading the query file: {}", e))); - let query_parser = QueryParser::new(index.schema(), default_search_fields); + let queries = read_query_file(query_filepath).map_err(|e| format!("Failed reading the query file: {}", e))?; + let query_parser = QueryParser::new(index.schema(), default_search_fields, index.tokenizers().clone()); println!("SEARCH\n"); println!("{}\t{}\t{}\t{}", "query", "num_terms", "num hits", "time in microsecs"); @@ -71,7 +69,8 @@ fn run_bench(index_path: &Path, let timing; { let mut collector = chain().push(&mut top_collector).push(&mut count_collector); - timing = try!(query.search(&searcher, &mut collector).map_err(|e| format!("Failed while searching query {:?}.\n\n{:?}", query_txt, e))); + timing = query.search(&searcher, &mut collector) + .map_err(|e| format!("Failed while searching query {:?}.\n\n{:?}", query_txt, e))?; } println!("{}\t{}\t{}", query_txt, count_collector.count(), timing.total_time()); } @@ -84,7 +83,8 @@ fn run_bench(index_path: &Path, for query_txt in &queries { let query = query_parser.parse_query(&query_txt).unwrap(); let mut top_collector = TopCollector::with_limit(10); - try!(query.search(&searcher, &mut top_collector).map_err(|e| format!("Failed while retrieving document for query {:?}.\n{:?}", query, e))); + query.search(&searcher, &mut top_collector) + .map_err(|e| format!("Failed while retrieving document for query {:?}.\n{:?}", query, e))?; let mut timer = TimerTree::default(); { let _scoped_timer_ = timer.open("total"); diff --git a/src/commands/index.rs b/src/commands/index.rs index 9e81393..161cd17 100644 --- a/src/commands/index.rs +++ b/src/commands/index.rs @@ -22,11 +22,11 @@ pub fn run_index_cli(argmatch: &ArgMatches) -> Result<(), String> { .map(|path| DocumentSource::FromFile(PathBuf::from(path))) .unwrap_or(DocumentSource::FromPipe); let no_merge = argmatch.is_present("nomerge"); - let mut num_threads = try!(value_t!(argmatch, "num_threads", usize).map_err(|_|format!("Failed to read num_threads argument as an integer."))); + let mut num_threads = value_t!(argmatch, "num_threads", usize).map_err(|_| format!("Failed to read num_threads argument as an integer."))?; if num_threads == 0 { num_threads = 1; } - let buffer_size = try!(value_t!(argmatch, "memory_size", usize).map_err(|_|format!("Failed to read the buffer size argument as an integer."))); + let buffer_size = value_t!(argmatch, "memory_size", usize).map_err(|_| format!("Failed to read the buffer size argument as an integer."))?; let buffer_size_per_thread = buffer_size / num_threads; run_index(index_directory, document_source, buffer_size_per_thread, num_threads, no_merge).map_err(|e| format!("Indexing failed : {:?}", e)) } @@ -37,7 +37,7 @@ fn run_index(directory: PathBuf, num_threads: usize, no_merge: bool) -> tantivy::Result<()> { - let index = try!(Index::open(&directory)); + let index = Index::open(&directory)?; let schema = index.schema(); let (line_sender, line_receiver) = chan::sync(10_000); let (doc_sender, doc_receiver) = chan::sync(10_000); @@ -71,14 +71,11 @@ fn run_index(directory: PathBuf, } drop(doc_sender); - let mut index_writer = try!( - if num_threads > 0 { - index.writer_with_num_threads(num_threads, buffer_size_per_thread) - } - else { - index.writer(buffer_size_per_thread) - } - ); + let mut index_writer = if num_threads > 0 { + index.writer_with_num_threads(num_threads, buffer_size_per_thread) + } else { + index.writer(buffer_size_per_thread) + }?; if no_merge { index_writer.set_merge_policy(Box::new(NoMergePolicy)); @@ -145,7 +142,7 @@ impl DocumentSource { BufReader::new(Box::new(io::stdin())) } &DocumentSource::FromFile(ref filepath) => { - let read_file = try!(File::open(&filepath)); + let read_file = File::open(&filepath)?; BufReader::new(Box::new(read_file)) } }) diff --git a/src/commands/new.rs b/src/commands/new.rs index 7f84183..14cb23c 100644 --- a/src/commands/new.rs +++ b/src/commands/new.rs @@ -2,13 +2,13 @@ use clap::ArgMatches; use std::convert::From; use std::path::PathBuf; use tantivy; +use tantivy::schema::Cardinality; use tantivy::schema::*; use tantivy::Index; use std::io; use ansi_term::Style; use ansi_term::Colour::{Red, Blue, Green}; use std::io::Write; -use std::ascii::AsciiExt; use serde_json; @@ -77,29 +77,31 @@ fn ask_add_field_text(field_name: &str, schema_builder: &mut SchemaBuilder) { if prompt_yn("Should the field be stored") { text_options = text_options.set_stored(); } - let is_indexed = prompt_yn("Should the field be indexed"); - let indexing_options = if is_indexed { - if prompt_yn("Should the field be tokenized") { + + + + if prompt_yn("Should the field be indexed") { + let mut text_indexing_options = TextFieldIndexing + ::default() + .set_index_option(IndexRecordOption::Basic) + .set_tokenizer("en_stem"); + + if prompt_yn("Should the term be tokenized?") { if prompt_yn("Should the term frequencies (per doc) be in the index") { if prompt_yn("Should the term positions (per doc) be in the index") { - TextIndexingOptions::TokenizedWithFreqAndPosition - } - else { - TextIndexingOptions::TokenizedWithFreq + text_indexing_options = text_indexing_options.set_index_option(IndexRecordOption::WithFreqsAndPositions); + } else { + text_indexing_options = text_indexing_options.set_index_option(IndexRecordOption::WithFreqs); } } - else { - TextIndexingOptions::TokenizedNoFreq - } - } - else { - TextIndexingOptions::Untokenized + } else { + text_indexing_options = text_indexing_options.set_tokenizer("raw"); } + + text_options = text_options.set_indexing_options(text_indexing_options); } - else { - TextIndexingOptions::Unindexed - }; - text_options = text_options.set_indexing_options(indexing_options); + + schema_builder.add_text_field(field_name, text_options); } @@ -110,7 +112,7 @@ fn ask_add_field_u64(field_name: &str, schema_builder: &mut SchemaBuilder) { u64_options = u64_options.set_stored(); } if prompt_yn("Should the field be fast") { - u64_options = u64_options.set_fast(); + u64_options = u64_options.set_fast(Cardinality::SingleValue); } if prompt_yn("Should the field be indexed") { u64_options = u64_options.set_indexed(); diff --git a/src/commands/search.rs b/src/commands/search.rs index 882c15d..6324b0b 100644 --- a/src/commands/search.rs +++ b/src/commands/search.rs @@ -24,10 +24,10 @@ fn run_search(directory: &Path, query: &str) -> tantivy::Result<()> { .iter() .enumerate() .filter( - |&(_, ref field_entry)| { + |&(_, ref field_entry) | { match *field_entry.field_type() { FieldType::Str(ref text_field_options) => { - text_field_options.get_indexing_options().is_indexed() + text_field_options.get_indexing_options().is_some() }, _ => false } @@ -35,13 +35,13 @@ fn run_search(directory: &Path, query: &str) -> tantivy::Result<()> { ) .map(|(i, _)| Field(i as u32)) .collect(); - let query_parser = QueryParser::new(schema.clone(), default_fields); + let query_parser = QueryParser::new(schema.clone(), default_fields, index.tokenizers().clone()); let query = query_parser.parse_query(query)?; let searcher = index.searcher(); - let weight = query.weight(&searcher)?; + let weight = query.weight(&searcher, false)?; let schema = index.schema(); for segment_reader in searcher.segment_readers() { - let mut scorer = try!(weight.scorer(segment_reader)); + let mut scorer = weight.scorer(segment_reader)?; while scorer.advance() { let doc_id = scorer.doc(); let doc = segment_reader.doc(doc_id)?; diff --git a/src/commands/serve.rs b/src/commands/serve.rs index d9162ff..a7484b3 100644 --- a/src/commands/serve.rs +++ b/src/commands/serve.rs @@ -40,6 +40,8 @@ use tantivy::schema::FieldType; use tantivy::schema::NamedFieldDocument; use tantivy::schema::Schema; use tantivy::TimerTree; +use tantivy::tokenizer::*; +use tantivy::DocAddress; use urlencoded::UrlEncodedQuery; pub fn run_serve_cli(matches: &ArgMatches) -> Result<(), String> { @@ -62,6 +64,7 @@ struct Serp { #[derive(Serialize)] struct Hit { doc: NamedFieldDocument, + id: u32, } struct IndexServer { @@ -74,6 +77,13 @@ impl IndexServer { fn load(path: &Path) -> IndexServer { let index = Index::open(path).unwrap(); + index.tokenizers() + .register("commoncrawl", SimpleTokenizer + .filter(RemoveLongFilter::limit(40)) + .filter(LowerCaser) + .filter(AlphaNumOnlyFilter) + .filter(Stemmer::new()) + ); let schema = index.schema(); let default_fields: Vec = schema .fields() @@ -83,7 +93,7 @@ impl IndexServer { |&(_, ref field_entry)| { match *field_entry.field_type() { FieldType::Str(ref text_field_options) => { - text_field_options.get_indexing_options().is_indexed() + text_field_options.get_indexing_options().is_some() }, _ => false } @@ -91,17 +101,18 @@ impl IndexServer { ) .map(|(i, _)| Field(i as u32)) .collect(); - let query_parser = QueryParser::new(schema.clone(), default_fields); + let query_parser = QueryParser::new(schema.clone(), default_fields, index.tokenizers().clone()); IndexServer { - index: index, - query_parser: query_parser, - schema: schema, + index, + query_parser, + schema, } } - fn create_hit(&self, doc: &Document) -> Hit { + fn create_hit(&self, doc: &Document, doc_address: &DocAddress) -> Hit { Hit { - doc: self.schema.to_named_doc(&doc) + doc: self.schema.to_named_doc(&doc), + id: doc_address.doc(), } } @@ -116,7 +127,7 @@ impl IndexServer { let mut chained_collector = collector::chain() .push(&mut top_collector) .push(&mut count_collector); - try!(query.search(&searcher, &mut chained_collector)); + query.search(&searcher, &mut chained_collector)?; } let hits: Vec = { let _fetching_timer = timer_tree.open("fetching docs"); @@ -124,14 +135,14 @@ impl IndexServer { .iter() .map(|doc_address| { let doc: Document = searcher.doc(doc_address).unwrap(); - self.create_hit(&doc) + self.create_hit(&doc, doc_address) }) .collect() }; Ok(Serp { - q: q, + q, num_hits: count_collector.count(), - hits: hits, + hits, timings: timer_tree, }) } @@ -163,9 +174,9 @@ fn search(req: &mut Request) -> IronResult { .get("nhits") .and_then(|nhits_str| usize::from_str(&nhits_str[0]).ok()) .unwrap_or(10); - let query = try!(qs_map + let query = qs_map .get("q") - .ok_or_else(|| IronError::new(StringError(String::from("Parameter q is missing from the query")), status::BadRequest)))[0].clone(); + .ok_or_else(|| IronError::new(StringError(String::from("Parameter q is missing from the query")), status::BadRequest))?[0].clone(); let serp = index_server.search(query, num_hits).unwrap(); let resp_json = serde_json::to_string_pretty(&serp).unwrap(); let content_type = "application/json".parse::().unwrap();