initial commit

4 years ago · 37f5d29680
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,3 @@
 /target
 *.swp
 node_modules/
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -0,0 +1,17 @@
 [package]
 name = "elasticlunr-indexer"
 version = "0.1.0"
 authors = ["Jonathan Strong <jonathan.strong@gmail.com>"]
 edition = "2018"

 [[bin]]
 name = "indexer"
 path = "src/main.rs"

 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

 [dependencies]
 elasticlunr-rs = "2.3"
 serde = { version = "1.0", features = ["derive"] }
 bytelines = "2.2"
 serde_json = "1.0"
--- a/query.js
+++ b/query.js
@@ -0,0 +1,18 @@
 const fs = require('fs');
 const elasticlunr = require('elasticlunr');

 // Reference timestamp taken when the script starts; every log line is timed against it.
 const start = process.hrtime();

 /**
  * Logs `msg` to the console, prefixed with the time elapsed since `start`.
  *
  * `process.hrtime(start)` returns a `[seconds, nanoseconds]` pair; the
  * nanosecond part is converted to milliseconds before printing.
  *
  * @param {string} msg - text printed after the elapsed-time prefix
  */
 const elapsed = (msg) => {
    const [seconds, nanoseconds] = process.hrtime(start);
    const milliseconds = nanoseconds / 1000000;
    console.info('[ %ds %dms elapsed ] %s', seconds, milliseconds, msg);
 };

 // JSON file to load; the commented-out line is an alternative input.
 const inputFile = '../sample2.json';
 //const inputFile = 'elasticlunr-wiki-index.json';

 // Stage 1: read the whole file synchronously into memory.
 elapsed('reading file ...');
 const rawData = fs.readFileSync(inputFile);

 // Stage 2: parse the raw contents as JSON.
 elapsed('parsing json ...');
 const indexData = JSON.parse(rawData);

 // Stage 3: report the `length` property of the parsed value.
 elapsed(`indexData.length = ${indexData.length}`);
--- a/src/main.rs
+++ b/src/main.rs
@@ -0,0 +1,61 @@
 #![allow(unused)]
 use std::io::{self, prelude::*};
 use std::fs;
 use std::time::*;
 use std::env;
 use std::path::Path;
 use elasticlunr::{Index, Language};
 use serde::Deserialize;
 use bytelines::ByteLinesReader;

 /// One input document, deserialized from a single JSON line of the input file.
 #[derive(Deserialize, Debug)]
 struct Record {
    /// Document title; indexed under the `"title"` field.
    pub title: String,
    /// Document body text; indexed under the `"body"` field.
    pub body: String,
    /// Document URL; passed to the index as the document reference.
    pub url: String,
 }

 /// Location of the input file (one JSON record per line), joined onto `$HOME`.
 const WIKIPEDIA_PATH: &str = "src/search/wiki-articles.json";
 /// Location the serialized index JSON is written to, joined onto `$HOME`.
 const OUTPUT_PATH: &str = "src/search/js/elasticlunr-wiki-index.json";

 /// Builds an elasticlunr search index from a line-delimited JSON file and
 /// writes the serialized index to disk.
 ///
 /// Input is read from `$HOME/`[`WIKIPEDIA_PATH`], one JSON-encoded [`Record`]
 /// per line. The finished index is written as a single JSON document to
 /// `$HOME/`[`OUTPUT_PATH`], and a summary line is printed to stdout.
 ///
 /// # Errors
 ///
 /// Returns an [`io::Error`] if `HOME` is not set, if the input file cannot be
 /// opened or read, if a line fails to deserialize into a [`Record`]
 /// (`InvalidData`), or if the output file cannot be created, written, or
 /// flushed.
 fn main() -> Result<(), io::Error> {
    let start = Instant::now();

    let mut index = Index::new(&["title", "body"]);

    // `var_os` rather than `var`: the home directory need not be valid UTF-8,
    // and a missing variable becomes an error instead of a panic.
    let home = env::var_os("HOME").ok_or_else(|| {
        io::Error::new(
            io::ErrorKind::NotFound,
            "HOME environment variable is not set",
        )
    })?;
    let home_dir = Path::new(&home);
    let input_file = home_dir.join(WIKIPEDIA_PATH);

    // Open directly instead of checking `exists()` first: this avoids a race
    // between the check and the open, and reports the failure (with the path)
    // through the returned error rather than a panic.
    let input = fs::File::open(&input_file).map_err(|e| {
        io::Error::new(
            e.kind(),
            format!("failed to open {}: {}", input_file.display(), e),
        )
    })?;
    let mut lines = io::BufReader::new(input).byte_lines();

    let mut n_indexed: u64 = 0;

    // `lines.next()` yields a byte slice borrowed from the reader (it is fed
    // straight to `from_slice`), so the loop is driven with `while let`
    // rather than a `for` loop over a std `Iterator`.
    while let Some(line_result) = lines.next() {
        let line = line_result?;
        let rec: Record = serde_json::from_slice(line).map_err(|e| {
            io::Error::new(
                io::ErrorKind::InvalidData,
                format!("deserializing failed: {}", e),
            )
        })?;
        // The URL is the document reference; title and body are supplied in
        // the same order as the field names given to `Index::new`.
        index.add_doc(&rec.url, &[&rec.title, &rec.body]);
        n_indexed += 1;
    }

    let output_file = home_dir.join(OUTPUT_PATH);
    let mut wtr = io::BufWriter::new(fs::File::create(&output_file)?);

    // hope you have plenty of ram! The whole index is serialized into one
    // in-memory string before anything is written.
    let index_json: String = index.to_json();

    wtr.write_all(index_json.as_bytes())?;
    // Flush explicitly: dropping a `BufWriter` also flushes, but any error
    // from that implicit flush is discarded and the output could be truncated
    // without the program noticing.
    wtr.flush()?;

    let took = start.elapsed();
    println!("indexed {} records in {:?}", n_indexed, took);
    Ok(())
 }