// jstrong/data-pipelines


			
							#![allow(unused_imports)]

#[macro_use]
extern crate slog;
#[macro_use]
extern crate markets;

use std::path::PathBuf;
use std::time::*;
use std::io::{self, prelude::*};
use std::fs;
use std::f64::NAN;
use structopt::StructOpt;
use serde::{Serialize, Deserialize};
use slog::Drain;
use pretty_toa::ThousandsSep;
use markets::crypto::{Exchange, Ticker, Side};
use pipelines::windows::WeightedMeanWindow;


// Equivalent to `panic!`, but without the ugly 'thread main panicked' yada yada:
// prints the formatted message to stderr and exits the process with status 1.
//
// Takes the same arguments as `eprintln!`, so both `fatal!("message")` and
// `fatal!("message: {}", value)` are accepted.
macro_rules! fatal { ($($args:tt)*) => {{
    eprintln!($($args)*);
    std::process::exit(1);
}}}

/// Number of input rows between progress log messages (2^20 = 1,048,576).
const PROGRESS_EVERY: usize = 1 << 20;
/// Nanoseconds per second; trade timestamps are unix nanoseconds.
const ONE_SECOND: u64 = 1_000 * 1_000 * 1_000;
/// Nanoseconds per hour.
const ONE_HOUR: u64 = 60 * 60 * ONE_SECOND;


// Command-line options. NOTE: the `///` comments on the fields below are
// user-visible, because structopt turns them into the `--help` text.
#[derive(Debug, StructOpt)]
struct Opt {
    /// Path to CSV file with trades data
    #[structopt(short = "f", long = "trades-csv")]
    #[structopt(parse(from_os_str))]
    trades_csv: PathBuf,

    /// Where to save the query results (CSV output)
    #[structopt(short = "o", long = "output-path")]
    #[structopt(parse(from_os_str))]
    output_path: PathBuf,

    /// Run the "hard" query: bmex/gdax price ratios over rolling 5, 15 and 60
    /// minute windows, written every 10 seconds, instead of the hourly ratio
    #[structopt(short = "z", long = "hard-mode")]
    hard_mode: bool,
}

/// One trade, as read from a row of the trades CSV.
///
/// Built either by serde (`row.deserialize(Some(&headers))`, which matches
/// fields to columns by header name) or by the `manual_deserialize_*`
/// functions, which read columns by position.
#[derive(Deserialize)]
struct Trade {
    /// Time of trade in unix nanoseconds
    pub time: u64,
    /// Exchange where trade executed
    pub exch: Exchange,
    /// Currency rate of trade (base/quote)
    pub ticker: Ticker,
    /// Price of trade, in quote denomination
    pub price: f64,
    /// Size/Volume of trade, in base denomination
    pub amount: f64,
}

/// Average rate of `n` events over `span`, in events per second.
///
/// Returns `0.0` when there were no events or when `span` is shorter than one
/// microsecond, so a near-zero duration never produces an absurd rate.
fn per_sec(n: usize, span: Duration) -> f64 {
    let too_short = span < Duration::from_micros(1);
    if n == 0 || too_short {
        0.0
    } else {
        let elapsed_secs: f64 = span.as_nanos() as f64 / 1e9f64;
        n as f64 / elapsed_secs
    }
}

/// Builds a [`Trade`] from a raw byte record without going through serde.
///
/// Columns are read by position: 0 = time, 1 = amount, 2 = exch, 3 = price,
/// 6 = ticker. Fields are handled in that order and the first problem found is
/// returned as a static message: a missing column, a number that fails to
/// parse, or an exchange/ticker code that is not in the lists below.
#[allow(dead_code)]
#[inline(always)]
fn manual_deserialize_bytes(row: &csv::ByteRecord) -> Result<Trade, &'static str> {
    let raw_time = row.get(0).ok_or("no time")?;
    let time: u64 = atoi::atoi(raw_time).ok_or("parsing time failed")?;

    let raw_amount = row.get(1).ok_or("no amount")?;
    let amount: f64 = lexical::parse(raw_amount).map_err(|_| "parsing amount failed")?;

    let raw_exch = row.get(2).ok_or("no exch")?;
    let exch = match raw_exch {
        b"bmex" => e!(bmex),
        b"bnce" => e!(bnce),
        b"btfx" => e!(btfx),
        b"gdax" => e!(gdax),
        b"okex" => e!(okex),
        b"bits" => e!(bits),
        b"plnx" => e!(plnx),
        b"krkn" => e!(krkn),
        _ => return Err("illegal exch"),
    };

    let raw_price = row.get(3).ok_or("no price")?;
    let price: f64 = lexical::parse(raw_price).map_err(|_| "parsing price failed")?;

    let raw_ticker = row.get(6).ok_or("no ticker")?;
    let ticker = match raw_ticker {
        b"btc_usd" => t!(btc-usd),
        b"eth_usd" => t!(eth-usd),
        b"ltc_usd" => t!(ltc-usd),
        b"etc_usd" => t!(etc-usd),
        b"bch_usd" => t!(bch-usd),
        b"xmr_usd" => t!(xmr-usd),
        b"usdt_usd" => t!(usdt-usd),
        _ => return Err("illegal ticker"),
    };

    Ok(Trade { time, amount, exch, price, ticker })
}

/// Builds a [`Trade`] from a string record without going through serde.
///
/// Columns are read by position: 0 = time, 1 = amount, 2 = exch, 3 = price,
/// 6 = ticker. Fields are handled in that order and the first problem found is
/// returned as a static message: a missing column, a number that fails to
/// parse, or an exchange/ticker code that is not in the lists below.
#[allow(dead_code)]
#[inline(always)]
fn manual_deserialize_str(row: &csv::StringRecord) -> Result<Trade, &'static str> {
    let raw_time = row.get(0).ok_or("no time")?;
    let time: u64 = atoi::atoi(raw_time.as_bytes()).ok_or("parsing time failed")?;

    let raw_amount = row.get(1).ok_or("no amount")?;
    let amount: f64 = lexical::parse(raw_amount).map_err(|_| "parsing amount failed")?;

    let raw_exch = row.get(2).ok_or("no exch")?;
    let exch = match raw_exch {
        "bmex" => e!(bmex),
        "bnce" => e!(bnce),
        "btfx" => e!(btfx),
        "gdax" => e!(gdax),
        "okex" => e!(okex),
        "bits" => e!(bits),
        "plnx" => e!(plnx),
        "krkn" => e!(krkn),
        _ => return Err("illegal exch"),
    };

    let raw_price = row.get(3).ok_or("no price")?;
    let price: f64 = lexical::parse(raw_price).map_err(|_| "parsing price failed")?;

    let raw_ticker = row.get(6).ok_or("no ticker")?;
    let ticker = match raw_ticker {
        "btc_usd" => t!(btc-usd),
        "eth_usd" => t!(eth-usd),
        "ltc_usd" => t!(ltc-usd),
        "etc_usd" => t!(etc-usd),
        "bch_usd" => t!(bch-usd),
        "xmr_usd" => t!(xmr-usd),
        "usdt_usd" => t!(usdt-usd),
        _ => return Err("illegal ticker"),
    };

    Ok(Trade { time, amount, exch, price, ticker })
}

/// Example of code used in discussion of increasing CSV parsing performance.
///
/// Reads every record of `rdr` into a reused `csv::ByteRecord`, converts it
/// with `manual_deserialize_bytes`, and returns the number of rows parsed.
///
/// # Panics
///
/// Panics if the header row does not have the column layout that
/// `manual_deserialize_bytes` assumes, or if a row's time is earlier than the
/// time of the row before it.
#[allow(dead_code)]
fn fast_parse_bytes<R: Read>(mut rdr: csv::Reader<R>) -> Result<usize, String> {
    // our data is ascii, so parsing with the slightly faster ByteRecord is fine
    let headers: csv::ByteRecord = rdr
        .byte_headers()
        .map_err(|e| format!("failed to parse CSV headers: {}", e))?
        .clone();

    // manual_deserialize_bytes reads columns by position, so verify that the
    // header row really has that layout before trusting it
    let expected_columns: [(usize, &[u8]); 5] = [
        (0, &b"time"[..]),
        (1, &b"amount"[..]),
        (2, &b"exch"[..]),
        (3, &b"price"[..]),
        (6, &b"ticker"[..]),
    ];
    for &(idx, name) in expected_columns.iter() {
        assert_eq!(headers.get(idx), Some(name));
    }

    let mut record = csv::ByteRecord::new();
    let mut rows_parsed = 0;
    let mut prev_time = 0;

    loop {
        let has_row = rdr.read_byte_record(&mut record).map_err(|e| {
            format!("reading row {} failed: {}", (rows_parsed + 1).thousands_sep(), e)
        })?;
        if !has_row {
            break;
        }

        let trade: Trade = manual_deserialize_bytes(&record).map_err(|e| {
            format!("deserializing row failed: {}\n\nFailing row:\n{:?}", e, record)
        })?;

        // rows are expected to arrive sorted by time
        assert!(trade.time >= prev_time);
        prev_time = trade.time;

        rows_parsed += 1;
    }

    Ok(rows_parsed)
}

fn hard_mode<R, W>(
    mut rdr: csv::Reader<R>,
    mut wtr: csv::Writer<W>,
    logger: &slog::Logger,
) -> Result<usize, String>
    where R: Read,
          W: Write
{
    let logger = logger.new(o!("hard-mode" => "challenge accepted"));
    info!(logger, "beginning hard mode");

    let headers: csv::StringRecord = rdr.headers().map_err(|e| format!("failed to parse CSV headers: {}", e))?.clone();
    let mut row = csv::StringRecord::new();

    // pull out first row to initialize query calculations
    rdr.read_record(&mut row).map_err(|e| format!("reading first row failed: {}", e))?;
    let trade: Trade = row.deserialize(Some(&headers))
        .map_err(|e| {
            format!("deserializing first row failed: {}\n\nFailing row:\n{:?}", e, row)
        })?;

    let mut cur_bucket = trade.time - (trade.time % (ONE_SECOND * 10)) + ONE_SECOND * 10;
    //let mut next_bucket = cur_bucket + ONE_SECOND * 10;
    let mut last_price: f64 = NAN;

    #[derive(Default, Clone)]
    struct Lookbacks<T> {
        pub p5: T,
        pub p15: T,
        pub p60: T,
    }

    let mut bprices: Lookbacks<f64> = Default::default();
    let mut gprices: Lookbacks<f64> = Default::default();

    let mut ratios: Lookbacks<f64> = Default::default();

    let mut bmex_windows: Lookbacks<WeightedMeanWindow> =
        Lookbacks { 
            p5:  WeightedMeanWindow::new(ONE_SECOND * 60 * 5 ),
            p15: WeightedMeanWindow::new(ONE_SECOND * 60 * 15),
            p60: WeightedMeanWindow::new(ONE_SECOND * 60 * 60),
        };
    let mut gdax_windows = bmex_windows.clone();


    #[inline(always)]
    fn do_purge(windows: &mut Lookbacks<WeightedMeanWindow>, prices: &mut Lookbacks<f64>, time: u64) {
        //if windows.p5.purge(time)  { prices.p5  = windows.p5 .checked_weighted_mean().unwrap_or(NAN); }
        //if windows.p15.purge(time) { prices.p15 = windows.p15.checked_weighted_mean().unwrap_or(NAN); }
        //if windows.p60.purge(time) { prices.p60 = windows.p60.checked_weighted_mean().unwrap_or(NAN); }
        windows.p5 .purge(time);
        windows.p15.purge(time);
        windows.p60.purge(time);
    }

    #[allow(unused)]
    #[inline(always)]
    fn do_update(windows: &mut Lookbacks<WeightedMeanWindow>, prices: &mut Lookbacks<f64>, time: u64, price: f64, amount: f64) {
        //prices.p5  = windows.p5 .update(time, price, amount).unwrap_or(NAN);
        //prices.p15 = windows.p15.update(time, price, amount).unwrap_or(NAN);
        //prices.p60 = windows.p60.update(time, price, amount).unwrap_or(NAN);

        windows.p5 .push(time, price, amount);
        windows.p15.push(time, price, amount);
        windows.p60.push(time, price, amount);
    }
   
    macro_rules! update { // in macro to avoid repeating code once outside loop, and again in loop body
        ($trade:ident) => {{
            match $trade.exch {
                e!(bmex) => {
                    do_update(&mut bmex_windows, &mut bprices, $trade.time, $trade.price, $trade.amount);
                    //do_purge(&mut gdax_windows, &mut gprices, $trade.time);
                    last_price = $trade.price;
                }

                e!(gdax) => {
                    do_update(&mut gdax_windows, &mut gprices, $trade.time, $trade.price, $trade.amount);
                    //do_purge(&mut bmex_windows, &mut bprices, $trade.time);
                    last_price = $trade.price;
                }
                
                _ => {}
            }
        }}
    }

    wtr.write_record(&[
        "time",
        //"last",
        //"bmex_5min",
        //"gdax_5min",
        //"n_bmex_p5",
        //"n_gdax_p5",
        "r5",
        "r15",
        "r60",
        //"n_bmex_p5",
        //"n_bmex_p15",
        //"n_bmex_p60",
        //"n_gdax_p5",
        //"n_gdax_p15",
        //"n_gdax_p60",
        //"gdax_p5_is_empty",
        //"gdax_p5_checked_weighted_mean",
        //"tradetime_minus_cur_bucket",
    ]).map_err(|e| format!("writing CSV headers to output file failed: {}", e))?;

    if trade.ticker == t!(btc-usd) { update!(trade); }

    let mut n = 0;
    let mut n_written = 0;

    while rdr.read_record(&mut row)
        .map_err(|e| {
            format!("reading row {} failed: {}", (n+1).thousands_sep(), e)
        })?
    {
        n += 1;

        let trade: Trade = row.deserialize(Some(&headers))
            .map_err(|e| {
                format!("deserializing row failed: {}\n\nFailing row:\n{:?}", e, row)
            })?;

        if trade.time > cur_bucket {
            debug!(logger, "about to purge";
                "n" => n,
                "n written" => n_written,
                "trade.time" => trade.time,
                "cur_bucket" => cur_bucket,
                "gdax p5 len" => gdax_windows.p5.len(),
                "gdax p5 wt avg" => gdax_windows.p5.weighted_mean(),
            );

            do_purge(&mut gdax_windows, &mut gprices, cur_bucket);
            do_purge(&mut bmex_windows, &mut bprices, cur_bucket);

            debug!(logger, "finished purge";
                "n" => n,
                "n written" => n_written,
                "trade.time" => trade.time,
                "cur_bucket" => cur_bucket,
                "gdax p5 len" => gdax_windows.p5.len(),
                "gdax p5 wt avg" => gdax_windows.p5.weighted_mean(),
            );


            ratios.p5  = bmex_windows.p5 .weighted_mean() / gdax_windows.p5 .weighted_mean();
            ratios.p15 = bmex_windows.p15.weighted_mean() / gdax_windows.p15.weighted_mean();
            ratios.p60 = bmex_windows.p60.weighted_mean() / gdax_windows.p60.weighted_mean();

            //ratios.p5  = bmex_windows.p5 .weighted_mean() / gdax_windows.p5 .weighted_mean();
            //ratios.p15 = bmex_windows.p15.weighted_mean() / gdax_windows.p15.weighted_mean();
            //ratios.p60 = bmex_windows.p60.weighted_mean() / gdax_windows.p60.weighted_mean();

            wtr.write_record(&[
                &format!("{}", cur_bucket),
                //&format!("{}", last_price),
                //&format!("{}", bmex_windows.p5.checked_weighted_mean().unwrap_or(NAN)),
                //&format!("{}", gdax_windows.p5.checked_weighted_mean().unwrap_or(NAN)),
                //&format!("{}", bmex_windows.p5.len()),
                //&format!("{}", gdax_windows.p5.len()),
                &format!("{}", ratios.p5),
                &format!("{}", ratios.p15),
                &format!("{}", ratios.p60),
                //&format!("{}", bmex_windows.p15.len()),
                //&format!("{}", gdax_windows.p60.len()),
                //&format!("{}", gdax_windows.p15.len()),
                //&format!("{}", gdax_windows.p15.len()),
                //&format!("{}", bmex_windows.p60.len()),
                //&format!("{}", bmex_windows.p5.is_empty()),
                //&format!("{:?}", bmex_windows.p5.checked_weighted_mean()),
                //&format!("{}", trade.time - cur_bucket),

            ]).map_err(|e| {
                format!("writing csv row failed: {}", e)
            })?;
            n_written += 1;
            cur_bucket += ONE_SECOND * 10;
            //cur_bucket = next_bucket;
            //next_bucket += ONE_SECOND * 10;
        }

        if trade.ticker == t!(btc-usd) { update!(trade); }
        
        if n % PROGRESS_EVERY == 0 {
            info!(logger, "calculating hard query";
                "n" => %n.thousands_sep(),
                "n_written" => %n_written.thousands_sep(),
                "ratios.p5" => ratios.p5,
                "ratios.p15" => ratios.p15,
                "ratios.p60" => ratios.p60,
            );
        }
    }

    info!(logger, "finished with hard query");

    Ok(n)
}

/// Parses the command-line options, opens the input and output CSV files, and
/// runs the selected query. Returns the number of input rows processed after
/// the first one.
///
/// Default query: for each hour, the ratio of the bmex to the gdax
/// amount-weighted mean btc-usd price, written as `time,ratio,bmex,gdax` rows.
/// Hours in which either exchange had no btc-usd trades are written as `NaN`.
/// The final, partial hour is intentionally not written. With `--hard-mode`
/// the work is delegated to `hard_mode` instead.
///
/// `start` is only used to report elapsed time in progress log messages.
///
/// # Errors
///
/// Returns a message if a file cannot be opened or created, if the input has
/// no data rows, if a row cannot be read or deserialized, or if the output
/// cannot be written or flushed.
///
/// # Panics
///
/// Panics if the input rows are not sorted by time. If the trades CSV path
/// does not exist, the process exits with status 1 instead of returning.
fn run(start: Instant, logger: &slog::Logger) -> Result<usize, String> {
    let opt = Opt::from_args();

    info!(logger, "initializing...";
        "trades-csv" => %opt.trades_csv.display(),
        "output-path" => %opt.output_path.display()
    );

    if ! opt.trades_csv.exists() {
        error!(logger, "path does not exist: {}", opt.trades_csv.display());
        fatal!("Error: path does not exist: {}", opt.trades_csv.display());
    }

    debug!(logger, "verified csv path exists"; "trades_csv" => %opt.trades_csv.display());

    let rdr = fs::File::open(&opt.trades_csv)
        .map_err(|e| format!("opening trades csv file failed: {} (tried to open {})", e, opt.trades_csv.display()))?;

    let rdr = io::BufReader::new(rdr);

    let mut rdr = csv::Reader::from_reader(rdr);

    // initializing --output-path CSV
    let wtr = fs::File::create(&opt.output_path)
        .map_err(|e| format!("creating output csv file failed: {} (tried to create {})", e, opt.output_path.display()))?;

    let wtr = io::BufWriter::new(wtr);

    let mut wtr = csv::Writer::from_writer(wtr);

    if opt.hard_mode { return hard_mode(rdr, wtr, logger) }

    wtr.write_record(&[
        "time",
        "ratio",
        "bmex",
        "gdax",
    ]).map_err(|e| format!("writing CSV headers to output file failed: {}", e))?;

    let headers: csv::StringRecord = rdr.headers().map_err(|e| format!("failed to parse CSV headers: {}", e))?.clone();
    let mut row = csv::StringRecord::new();

    // pull out first row to initialize query calculations
    let has_first_row = rdr.read_record(&mut row).map_err(|e| format!("reading first row failed: {}", e))?;
    if !has_first_row {
        return Err("trades csv contains no data rows".to_string());
    }
    let trade: Trade = row.deserialize(Some(&headers))
        .map_err(|e| {
            format!("deserializing first row failed: {}\n\nFailing row:\n{:?}", e, row)
        })?;

    let mut cur_hour = trade.time - trade.time % ONE_HOUR;
    let mut next_hour = cur_hour + ONE_HOUR;

    // The first row seeds the accumulators under the same rules as the loop
    // below: only btc-usd trades count, and every counted trade also bumps its
    // exchange's row counter. Otherwise an hour whose only bmex or gdax trade
    // is the first row would be reported as NaN, and a first row for another
    // ticker would leak into the btc-usd totals.
    let first_counts = trade.ticker == t!(btc-usd);
    let first_is_bmex = first_counts && trade.exch == e!(bmex);
    let first_is_gdax = first_counts && trade.exch == e!(gdax);

    let mut bmex_total = if first_is_bmex { trade.price * trade.amount } else { 0.0 };
    let mut bmex_amt = if first_is_bmex { trade.amount } else { 0.0 };
    let mut n_bmex = if first_is_bmex { 1 } else { 0 };

    let mut gdax_total = if first_is_gdax { trade.price * trade.amount } else { 0.0 };
    let mut gdax_amt = if first_is_gdax { trade.amount } else { 0.0 };
    let mut n_gdax = if first_is_gdax { 1 } else { 0 };

    let mut n = 0;
    let mut n_written = 0;
    let mut last_time = 0;

    while rdr.read_record(&mut row)
        .map_err(|e| {
            format!("reading row {} failed: {}", (n+1).thousands_sep(), e)
        })?
    {
        let trade: Trade = row.deserialize(Some(&headers))
            .map_err(|e| {
                format!("deserializing row failed: {}\n\nFailing row:\n{:?}", e, row)
            })?;

        n += 1;

        // verify data is sorted by time
        assert!(trade.time >= last_time);
        last_time = trade.time;

        if trade.ticker != t!(btc-usd) { continue }

        if trade.time >= next_hour {
            // `trade` is past the last hour bucket, so finalize/write last
            // hour results, and reset state for this hour

            if n_bmex == 0 || n_gdax == 0 {
                wtr.write_record(&[
                    &cur_hour.to_string(),
                    "NaN",
                    "NaN",
                    "NaN",
                ]).map_err(|e| format!("writing output row failed: {}", e))?;
            } else {
                let bmex_wt_avg = bmex_total / bmex_amt;
                let gdax_wt_avg = gdax_total / gdax_amt;
                let ratio = bmex_wt_avg / gdax_wt_avg;
                wtr.write_record(&[
                    &cur_hour.to_string(),
                    &ratio.to_string(),
                    &bmex_wt_avg.to_string(),
                    &gdax_wt_avg.to_string(),
                ]).map_err(|e| format!("writing output row failed: {}", e))?;
            }
            n_written += 1;

            // reset state
            bmex_total = 0.0;
            bmex_amt = 0.0;
            gdax_total = 0.0;
            gdax_amt = 0.0;
            n_bmex = 0;
            n_gdax = 0;

            cur_hour = next_hour;
            next_hour += ONE_HOUR;

            // if we are skipping hours in between the last and current row, we
            // need to write a NaN row for the hours that had no data
            while next_hour <= trade.time {
                wtr.write_record(&[
                    &cur_hour.to_string(),
                    "NaN",
                    "NaN",
                    "NaN",
                ]).map_err(|e| format!("writing output row failed: {}", e))?;

                n_written += 1;
                cur_hour = next_hour;
                next_hour += ONE_HOUR;
            }
        }

        match trade.exch {
            e!(bmex) => {
                bmex_total += trade.price * trade.amount;
                bmex_amt += trade.amount;
                n_bmex += 1;
            }

            e!(gdax) => {
                gdax_total += trade.price * trade.amount;
                gdax_amt += trade.amount;
                n_gdax += 1;
            }

            _ => {}
        }

        if n % PROGRESS_EVERY == 0 || (cfg!(debug_assertions) && n % (1024 * 96) == 0) {
            info!(logger, "parsing csv file";
                "n rows" => %n.thousands_sep(),
                "n written" => %n_written.thousands_sep(),
                "elapsed" => ?(Instant::now() - start),
            );
        }

        // debug builds only process roughly the first PROGRESS_EVERY rows
        if cfg!(debug_assertions) && n > PROGRESS_EVERY {
            warn!(logger, "debug mode: exiting early";
                "n rows" => %n.thousands_sep(),
                "n written" => %n_written.thousands_sep(),
                "elapsed" => ?(Instant::now() - start),
            );
            break
        }
    }

    // intentionally skipping handling the partial hour here

    info!(logger, "finished parsing CSV/calculating query. closing output file");

    // flush explicitly: errors from the flush that happens on drop are discarded
    wtr.flush().map_err(|e| format!("flushing output csv file failed: {}", e))?;

    Ok(n)
}

fn main() {
    let start = Instant::now();

    let decorator = slog_term::TermDecorator::new().stdout().force_color().build();
    let drain = slog_term::FullFormat::new(decorator).use_utc_timestamp().build().fuse();
    let drain = slog_async::Async::new(drain).chan_size(1024 * 64).thread_name("recv".into()).build().fuse();
    let logger = slog::Logger::root(drain, o!("version" => structopt::clap::crate_version!()));

    match run(start, &logger) {
        Ok(n) => {
            let took = Instant::now() - start;
            let took_secs = took.as_millis() as f64 / 1000.0;
            let took_str = format!("{}min, {:.1}sec", took.as_secs() / 60, took_secs % 60.0);

            info!(logger, "finished in {}", took_str;
                "n rows" => %n.thousands_sep(),
                "rows/sec" => &((per_sec(n, took) * 100.0).round() / 10.0).thousands_sep(),
            );
        }

        Err(e) => {
            crit!(logger, "run failed: {:?}", e);
            eprintln!("\n\nError: {}", e);
            std::thread::sleep(Duration::from_millis(100));
            std::process::exit(1);
        }
    }
}