You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

102 lines
2.8KB

  1. use std::convert::From;
  2. use std::fs::File;
  3. use std::io;
  4. use std::io::BufRead;
  5. use std::io::BufReader;
  6. use std::io::Read;
  7. use std::path::PathBuf;
  8. use tantivy;
  9. use tantivy::Index;
  10. use time::PreciseTime;
  11. use clap::ArgMatches;
  12. pub fn run_index_cli(argmatch: &ArgMatches) -> Result<(), String> {
  13. let index_directory = PathBuf::from(argmatch.value_of("index").unwrap());
  14. let document_source = {
  15. match argmatch.value_of("file") {
  16. Some(path) => {
  17. DocumentSource::FromFile(PathBuf::from(path))
  18. }
  19. None => DocumentSource::FromPipe,
  20. }
  21. };
  22. let num_threads = try!(value_t!(argmatch, "num_threads", usize).map_err(|_|format!("Failed to read num_threads argument as an integer.")));
  23. run_index(index_directory, document_source, num_threads).map_err(|e| format!("Indexing failed : {:?}", e))
  24. }
  25. enum DocumentSource {
  26. FromPipe,
  27. FromFile(PathBuf),
  28. }
  29. fn run_index(directory: PathBuf, document_source: DocumentSource, num_threads: usize) -> tantivy::Result<()> {
  30. let index = try!(Index::open(&directory));
  31. let schema = index.schema();
  32. let mut index_writer = try!(
  33. if num_threads > 0 {
  34. index.writer_with_num_threads(num_threads)
  35. }
  36. else {
  37. index.writer()
  38. }
  39. );
  40. let articles = try!(document_source.read());
  41. let mut num_docs = 0;
  42. let mut cur = PreciseTime::now();
  43. let group_count = 100000;
  44. for article_line_res in articles.lines() {
  45. let article_line = article_line_res.unwrap(); // TODO
  46. match schema.parse_document(&article_line) {
  47. Ok(doc) => {
  48. index_writer.add_document(doc).unwrap();
  49. }
  50. Err(err) => {
  51. println!("Failed to add document doc {:?}", err);
  52. }
  53. }
  54. if num_docs > 0 && (num_docs % group_count == 0) {
  55. println!("{} Docs", num_docs);
  56. let new = PreciseTime::now();
  57. let elapsed = cur.to(new);
  58. println!("{:?} docs / hour", group_count * 3600 * 1_000_000 as u64 / (elapsed.num_microseconds().unwrap() as u64));
  59. cur = new;
  60. }
  61. num_docs += 1;
  62. }
  63. index_writer.wait().unwrap(); // TODO
  64. Ok(())
  65. }
  66. #[derive(Clone,Debug,RustcDecodable,RustcEncodable)]
  67. pub struct WikiArticle {
  68. pub url: String,
  69. pub title: String,
  70. pub body: String,
  71. }
  72. impl DocumentSource {
  73. fn read(&self,) -> io::Result<BufReader<Box<Read>>> {
  74. Ok(match self {
  75. &DocumentSource::FromPipe => {
  76. BufReader::new(Box::new(io::stdin()))
  77. }
  78. &DocumentSource::FromFile(ref filepath) => {
  79. let read_file = try!(File::open(&filepath));
  80. BufReader::new(Box::new(read_file))
  81. }
  82. })
  83. }
  84. }