From 37f5d296805f12bcb32fa641f7149594506d7449 Mon Sep 17 00:00:00 2001
From: Jonathan Strong
Date: Tue, 10 Mar 2020 23:21:25 -0400
Subject: [PATCH] initial commit

---
 .gitignore  |  3 +++
 Cargo.toml  | 17 +++++++++++++++
 query.js    | 18 ++++++++++++++++
 src/main.rs | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 99 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Cargo.toml
 create mode 100644 query.js
 create mode 100644 src/main.rs

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ab14ca0
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+/target
+*.swp
+node_modules/
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..1f0f9f4
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,17 @@
+[package]
+name = "elasticlunr-indexer"
+version = "0.1.0"
+authors = ["Jonathan Strong "]
+edition = "2018"
+
+[[bin]]
+name = "indexer"
+path = "src/main.rs"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+elasticlunr-rs = "2.3"
+serde = { version = "1.0", features = ["derive"] }
+bytelines = "2.2"
+serde_json = "1.0"
diff --git a/query.js b/query.js
new file mode 100644
index 0000000..7b095ad
--- /dev/null
+++ b/query.js
@@ -0,0 +1,18 @@
+const fs = require('fs');
+const elasticlunr = require('elasticlunr'); // loaded but not referenced again in this script
+// Reference timestamp; every elapsed() call measures against it.
+const start = process.hrtime();
+// Log `msg` prefixed with the time elapsed since `start`.
+const elapsed = (msg) => {
+    let now = process.hrtime(start); // [seconds, nanoseconds] since `start`
+    console.info('[ %ds %dms elapsed ] %s', now[0], now[1] / 1000000, msg) // ns -> ms
+};
+// File to load; the commented-out line below names an alternative input.
+const inputFile = '../sample2.json';
+//const inputFile = 'elasticlunr-wiki-index.json';
+// Read and parse the file, logging the elapsed time around each step.
+elapsed('reading file ...');
+let rawData = fs.readFileSync(inputFile);
+elapsed('parsing json ...');
+let indexData = JSON.parse(rawData);
+elapsed('indexData.length = ' + indexData.length);
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..6247824
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,61 @@
+#![allow(unused)]
+use std::io::{self, prelude::*};
+use std::fs;
+use std::time::*;
+use std::env;
+use std::path::Path;
+use elasticlunr::{Index, Language};
+use serde::Deserialize;
+use bytelines::ByteLinesReader;
+
+/// One input document, deserialized from a single line of the input file.
+#[derive(Deserialize, Debug)]
+struct Record {
+    pub title: String,
+    pub body: String,
+    /// Passed to `Index::add_doc` as the document reference.
+    pub url: String,
+}
+
+/// Input file with one JSON `Record` per line; joined onto `$HOME`.
+const WIKIPEDIA_PATH: &str = "src/search/wiki-articles.json";
+/// Destination of the serialized index; joined onto `$HOME`.
+const OUTPUT_PATH: &str = "src/search/js/elasticlunr-wiki-index.json";
+
+/// Indexes the `title` and `body` of every record in the input file, writes the index
+/// as JSON to the output file, then prints the record count and the elapsed time.
+///
+/// # Errors
+/// Returns an `io::Error` if `HOME` cannot be read, if a file cannot be opened, read or
+/// written, or if a line fails to deserialize into a `Record` (`InvalidData`).
+fn main() -> Result<(), io::Error> {
+    let start = Instant::now();
+    let mut index = Index::new(&["title", "body"]);
+
+    // A missing `HOME` or input file is reported through the `Result`, not a panic.
+    let home = env::var("HOME")
+        .map_err(|e| io::Error::new(io::ErrorKind::NotFound, format!("HOME: {}", e)))?;
+    let home_dir = Path::new(&home);
+    let input_file = home_dir.join(WIKIPEDIA_PATH);
+    let rdr = io::BufReader::new(fs::File::open(&input_file).map_err(|e| {
+        io::Error::new(e.kind(), format!("cannot open {}: {}", input_file.display(), e))
+    })?);
+    let mut lines = rdr.byte_lines();
+    let mut n_indexed = 0;
+
+    while let Some(line_result) = lines.next() {
+        let line = line_result?;
+        let rec: Record = serde_json::from_slice(line).map_err(|e| {
+            io::Error::new(io::ErrorKind::InvalidData, format!("deserializing failed: {}", e))
+        })?;
+        index.add_doc(&rec.url, &[&rec.title, &rec.body]);
+        n_indexed += 1;
+    }
+
+    // hope you have plenty of ram! The whole index is serialized into one `String`.
+    let index_json: String = index.to_json();
+    // `fs::write` reports write errors that a dropped, unflushed `BufWriter` would swallow.
+    fs::write(home_dir.join(OUTPUT_PATH), index_json)?;
+    println!("indexed {} records in {:?}", n_indexed, start.elapsed());
+    Ok(())
+}