You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

209 lines
6.1KB

/// This tantivy command starts an HTTP server (by default on port 3000).
///
/// Currently the only entrypoint is `/api/`,
/// and it takes the following query string arguments:
///
/// - `q=`: your query
/// - `nhits=`: the number of hits that should be returned (defaults to 10)
///
///
/// For instance, the following call should return the 20 most relevant
/// hits for fulmicoton.
///
/// http://localhost:3000/api/?q=fulmicoton&nhits=20
///
  15. use clap::ArgMatches;
  16. use iron::mime::Mime;
  17. use iron::prelude::*;
  18. use iron::status;
  19. use iron::typemap::Key;
  20. use mount::Mount;
  21. use persistent::Read;
  22. use serde_json;
  23. use std::convert::From;
  24. use std::error::Error;
  25. use std::fmt::{self, Debug};
  26. use std::path::Path;
  27. use std::path::PathBuf;
  28. use std::str::FromStr;
  29. use tantivy;
  30. use tantivy::collector::{Count, TopDocs};
  31. use tantivy::query::QueryParser;
  32. use tantivy::schema::Field;
  33. use tantivy::schema::FieldType;
  34. use tantivy::schema::NamedFieldDocument;
  35. use tantivy::schema::Schema;
  36. use tantivy::tokenizer::*;
  37. use tantivy::{DocAddress, Score};
  38. use tantivy::Document;
  39. use tantivy::Index;
  40. use tantivy::IndexReader;
  41. use crate::timer::TimerTree;
  42. use urlencoded::UrlEncodedQuery;
  43. pub fn run_serve_cli(matches: &ArgMatches) -> Result<(), String> {
  44. let index_directory = PathBuf::from(matches.value_of("index").unwrap());
  45. let port = value_t!(matches, "port", u16).unwrap_or(3000u16);
  46. let host_str = matches.value_of("host").unwrap_or("localhost");
  47. let host = format!("{}:{}", host_str, port);
  48. run_serve(index_directory, &host).map_err(|e| format!("{:?}", e))
  49. }
/// Response payload of a search request; the HTTP handler serializes it to JSON.
#[derive(Serialize)]
struct Serp {
    /// The query string, as it was passed to `IndexServer::search`.
    q: String,
    /// Count reported by the `Count` collector for the query.
    num_hits: usize,
    /// The top documents returned by the `TopDocs` collector, in the order it produced them.
    hits: Vec<Hit>,
    /// Timings recorded while searching and while fetching the documents.
    timings: TimerTree,
}
/// A single search result, as serialized inside `Serp::hits`.
#[derive(Serialize)]
struct Hit {
    /// Score the collector attached to this document.
    score: Score,
    /// The stored document, converted to its named-field form through the schema.
    doc: NamedFieldDocument,
    /// Value of `DocAddress::doc()` for this document.
    id: u32,
}
/// Everything needed to answer search requests against one opened index.
struct IndexServer {
    /// Reader from which a searcher is obtained for every request.
    reader: IndexReader,
    /// Parser turning the user's query string into a tantivy query.
    query_parser: QueryParser,
    /// Schema of the index, used to convert documents to their named-field form.
    schema: Schema,
}
  68. impl IndexServer {
  69. fn load(path: &Path) -> IndexServer {
  70. let index = Index::open_in_dir(path).unwrap();
  71. index.tokenizers().register(
  72. "commoncrawl",
  73. SimpleTokenizer
  74. .filter(RemoveLongFilter::limit(40))
  75. .filter(LowerCaser)
  76. .filter(AlphaNumOnlyFilter)
  77. .filter(Stemmer::new(Language::English)),
  78. );
  79. let schema = index.schema();
  80. let default_fields: Vec<Field> = schema
  81. .fields()
  82. .iter()
  83. .enumerate()
  84. .filter(|&(_, ref field_entry)| match *field_entry.field_type() {
  85. FieldType::Str(ref text_field_options) => {
  86. text_field_options.get_indexing_options().is_some()
  87. }
  88. _ => false,
  89. })
  90. .map(|(i, _)| Field(i as u32))
  91. .collect();
  92. let query_parser =
  93. QueryParser::new(schema.clone(), default_fields, index.tokenizers().clone());
  94. let reader = index.reader().unwrap();
  95. IndexServer {
  96. reader,
  97. query_parser,
  98. schema,
  99. }
  100. }
  101. fn create_hit(&self, score: Score, doc: &Document, doc_address: DocAddress) -> Hit {
  102. Hit {
  103. score,
  104. doc: self.schema.to_named_doc(&doc),
  105. id: doc_address.doc(),
  106. }
  107. }
  108. fn search(&self, q: String, num_hits: usize) -> tantivy::Result<Serp> {
  109. let query = self
  110. .query_parser
  111. .parse_query(&q)
  112. .expect("Parsing the query failed");
  113. let searcher = self.reader.searcher();
  114. let mut timer_tree = TimerTree::default();
  115. let (top_docs, num_hits) = {
  116. let _search_timer = timer_tree.open("search");
  117. searcher.search(&query, &(TopDocs::with_limit(num_hits), Count))?
  118. };
  119. let hits: Vec<Hit> = {
  120. let _fetching_timer = timer_tree.open("fetching docs");
  121. top_docs
  122. .iter()
  123. .map(|(score, doc_address)| {
  124. let doc: Document = searcher.doc(*doc_address).unwrap();
  125. self.create_hit(*score, &doc, *doc_address)
  126. })
  127. .collect()
  128. };
  129. Ok(Serp {
  130. q,
  131. num_hits,
  132. hits,
  133. timings: timer_tree,
  134. })
  135. }
  136. }
// Makes `IndexServer` its own typemap key, so that the server can be attached
// to requests and retrieved by handlers through `Read<IndexServer>`.
impl Key for IndexServer {
    type Value = IndexServer;
}
  140. #[derive(Debug)]
  141. struct StringError(String);
  142. impl fmt::Display for StringError {
  143. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  144. Debug::fmt(self, f)
  145. }
  146. }
  147. impl Error for StringError {
  148. fn description(&self) -> &str {
  149. &*self.0
  150. }
  151. }
  152. fn search(req: &mut Request) -> IronResult<Response> {
  153. let index_server = req.get::<Read<IndexServer>>().unwrap();
  154. req.get_ref::<UrlEncodedQuery>()
  155. .map_err(|_| {
  156. IronError::new(
  157. StringError(String::from("Failed to decode error")),
  158. status::BadRequest,
  159. )
  160. })
  161. .and_then(|ref qs_map| {
  162. let num_hits: usize = qs_map
  163. .get("nhits")
  164. .and_then(|nhits_str| usize::from_str(&nhits_str[0]).ok())
  165. .unwrap_or(10);
  166. let query = qs_map.get("q").ok_or_else(|| {
  167. IronError::new(
  168. StringError(String::from("Parameter q is missing from the query")),
  169. status::BadRequest,
  170. )
  171. })?[0]
  172. .clone();
  173. let serp = index_server.search(query, num_hits).unwrap();
  174. let resp_json = serde_json::to_string_pretty(&serp).unwrap();
  175. let content_type = "application/json".parse::<Mime>().unwrap();
  176. Ok(Response::with((
  177. content_type,
  178. status::Ok,
  179. format!("{}", resp_json),
  180. )))
  181. })
  182. }
  183. fn run_serve(directory: PathBuf, host: &str) -> tantivy::Result<()> {
  184. let mut mount = Mount::new();
  185. let server = IndexServer::load(&directory);
  186. mount.mount("/api", search);
  187. let mut middleware = Chain::new(mount);
  188. middleware.link(Read::<IndexServer>::both(server));
  189. println!("listening on http://{}", host);
  190. Iron::new(middleware).http(host).unwrap();
  191. Ok(())
  192. }