
working csv with diagnostic cols

tags/v0.2.0
Jonathan Strong, 4 years ago
parent commit: ec3ef73e5c

2 changed files with 503 additions and 25 deletions:

  1. notebooks/plotting-initial-query-results-in-rust-reference-impl.ipynb (+337, -0)
  2. src/csv.rs (+166, -25)

+337 -0    notebooks/plotting-initial-query-results-in-rust-reference-impl.ipynb
(file diff suppressed because it is too large)

+166 -25   src/csv.rs

@@ -23,6 +23,8 @@ macro_rules! fatal { ($fmt:expr, $($args:tt)*) => {{
 }}}
 
 const PROGRESS_EVERY: usize = 1024 * 1024;
+const ONE_SECOND: u64 = 1_000_000_000;
+const ONE_HOUR: u64 = ONE_SECOND * 60 * 60;
 
 
 #[derive(Debug, StructOpt)]
@@ -40,27 +42,25 @@ struct Opt {
 
 #[derive(Deserialize)]
 struct Trade {
-    /// Unix nanoseconds
+    /// Time of trade in unix nanoseconds
     pub time: u64,
+    /// Exchange where trade executed
     pub exch: Exchange,
+    /// Currency rate of trade (base/quote)
     pub ticker: Ticker,
-    //pub side: Option<Side>,
+    /// Price of trade, in quote denomination
     pub price: f64,
+    /// Size/Volume of trade, in base denomination
     pub amount: f64,
 }
 
-/*
-struct HourSummary {
-    pub n_trades: usize,
-    pub
-*/
-
 fn per_sec(n: usize, span: Duration) -> f64 {
     if n == 0 || span < Duration::from_micros(1) { return 0.0 }
     let s: f64 = span.as_nanos() as f64 / 1e9f64;
     n as f64 / s
 }
 
+#[allow(dead_code)]
 #[inline(always)]
 fn manual_deserialize_bytes(row: &csv::ByteRecord) -> Result<Trade, &'static str> {
     let time: u64 = atoi::atoi(row.get(0).ok_or("no time")?)
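
Note on the Trade struct above: the run() loop below deserializes each row into it via serde, matching CSV header names to field names, so column order does not matter for that path. A minimal, self-contained sketch of that mechanism (hypothetical data; String standing in for the project-specific Exchange and Ticker types):

    use serde::Deserialize;

    #[derive(Deserialize)]
    struct TradeRow {
        time: u64,
        exch: String,   // stand-in for the project's Exchange type
        ticker: String, // stand-in for the project's Ticker type
        price: f64,
        amount: f64,
    }

    fn main() {
        // header names match the field names, so serde fills the struct
        // by name regardless of column order
        let data = "time,amount,exch,price,ticker\n1569888000000000000,0.5,bmex,9000.0,btc-usd\n";
        let mut rdr = csv::Reader::from_reader(data.as_bytes());
        for row in rdr.deserialize::<TradeRow>() {
            let trade = row.expect("deserializing row failed");
            assert_eq!(trade.ticker, "btc-usd");
        }
    }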
@@ -98,6 +98,7 @@ fn manual_deserialize_bytes(row: &csv::ByteRecord) -> Result<Trade, &'static str
     Ok(Trade { time, amount, exch, price, ticker })
 }
 
+#[allow(dead_code)]
 #[inline(always)]
 fn manual_deserialize_str(row: &csv::StringRecord) -> Result<Trade, &'static str> {
     let time: u64 = atoi::atoi(row.get(0).ok_or("no time")?.as_bytes())
@@ -135,6 +136,42 @@ fn manual_deserialize_str(row: &csv::StringRecord) -> Result<Trade, &'static str
     Ok(Trade { time, amount, exch, price, ticker })
 }
 
+/// Example of code used in discussion of increasing CSV parsing performance
+#[allow(dead_code)]
+fn fast_parse_bytes<R: Read>(mut rdr: csv::Reader<R>) -> Result<usize, String> {
+    // our data is ascii, so parsing with the slightly faster ByteRecord is fine
+    let headers: csv::ByteRecord = rdr.byte_headers().map_err(|e| format!("failed to parse CSV headers: {}", e))?.clone();
+    let mut row = csv::ByteRecord::new();
+
+    // manual_deserialize_bytes assumes the column order of the CSV,
+    // so here we verify that it actually matches that assumption
+    assert_eq!(headers.get(0), Some(&b"time"[..]));
+    assert_eq!(headers.get(1), Some(&b"amount"[..]));
+    assert_eq!(headers.get(2), Some(&b"exch"[..]));
+    assert_eq!(headers.get(3), Some(&b"price"[..]));
+    assert_eq!(headers.get(6), Some(&b"ticker"[..]));
+
+    let mut n = 0;
+    let mut last_time = 0;
+
+    while rdr.read_byte_record(&mut row)
+        .map_err(|e| {
+            format!("reading row {} failed: {}", (n+1).thousands_sep(), e)
+        })?
+    {
+        let trade: Trade = manual_deserialize_bytes(&row)
+            .map_err(|e| {
+                format!("deserializing row failed: {}\n\nFailing row:\n{:?}", e, row)
+            })?;
+        assert!(trade.time >= last_time);
+        last_time = trade.time;
+
+        n += 1;
+    }
+
+    Ok(n)
+}
+
 fn run(start: Instant, logger: &slog::Logger) -> Result<usize, String> {
     let opt = Opt::from_args();
 
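The positional asserts in fast_parse_bytes are what make by-index parsing safe: a file with reordered columns fails loudly instead of being parsed silently wrong. A standalone sketch of that failure mode, over a hypothetical file:

    use std::io::Cursor;

    fn main() {
        // hypothetical file whose first two columns are swapped
        let data = "amount,time,exch,price\n0.5,1,bmex,9000.0\n";
        let mut rdr = csv::Reader::from_reader(Cursor::new(data));
        let headers = rdr.byte_headers().expect("failed to parse CSV headers").clone();
        // the positional contract expects `time` in column 0, so this panics,
        // which is the point: by-index parsing must not run on reordered input
        assert_eq!(headers.get(0), Some(&b"time"[..]));
    }
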
@@ -157,44 +194,139 @@ fn run(start: Instant, logger: &slog::Logger) -> Result<usize, String> {
 
     let mut rdr = csv::Reader::from_reader(rdr);
 
-    // our data is ascii, so parsing with the slightly faster ByteRecord is fine
-    //let headers: csv::ByteRecord = rdr.byte_headers().map_err(|e| format!("failed to parse CSV headers: {}", e))?.clone();
-    //let mut row = csv::ByteRecord::new();
-    //assert_eq!(headers.get(0), Some(&b"time"[..]));
-    //assert_eq!(headers.get(1), Some(&b"amount"[..]));
-    //assert_eq!(headers.get(2), Some(&b"exch"[..]));
-    //assert_eq!(headers.get(3), Some(&b"price"[..]));
-    //assert_eq!(headers.get(6), Some(&b"ticker"[..]));
-
-    //let headers: csv::StringRecord = rdr.headers().map_err(|e| format!("failed to parse CSV headers: {}", e))?.clone();
+    // initializing --output-path CSV
+    let wtr = fs::File::create(&opt.output_path)
+        .map_err(|e| format!("creating output csv file failed: {} (tried to create {})", e, opt.output_path.display()))?;
+    let wtr = io::BufWriter::new(wtr);
+    let mut wtr = csv::Writer::from_writer(wtr);
+    wtr.write_record(&["time","ratio","bmex","gdax","n_bmex","n_gdax","bmex_amt","gdax_amt"]).map_err(|e| format!("writing CSV headers to output file failed: {}", e))?;
+
+    let headers: csv::StringRecord = rdr.headers().map_err(|e| format!("failed to parse CSV headers: {}", e))?.clone();
     let mut row = csv::StringRecord::new();
 
+    // pull out first row to initialize query calculations
+    rdr.read_record(&mut row).map_err(|e| format!("reading first row failed: {}", e))?;
+    let trade: Trade = row.deserialize(Some(&headers))
+        .map_err(|e| {
+            format!("deserializing first row failed: {}\n\nFailing row:\n{:?}", e, row)
+        })?;
+
+    let mut cur_hour = trade.time - trade.time % ONE_HOUR;
+    let mut next_hour = cur_hour + ONE_HOUR;
+
+    let mut bmex_total = if trade.exch == e!(bmex) { trade.price * trade.amount } else { 0.0 };
+    let mut bmex_amt = if trade.exch == e!(bmex) { trade.amount } else { 0.0 };
+    let mut n_bmex = 0;
+
+    let mut gdax_total = if trade.exch == e!(gdax) { trade.price * trade.amount } else { 0.0 };
+    let mut gdax_amt = if trade.exch == e!(gdax) { trade.amount } else { 0.0 };
+    let mut n_gdax = 0;
+
     let mut n = 0;
+    let mut n_written = 0;
     let mut last_time = 0;
 
-    //while rdr.read_byte_record(&mut row)
     while rdr.read_record(&mut row)
         .map_err(|e| {
             format!("reading row {} failed: {}", (n+1).thousands_sep(), e)
         })?
     {
-        //let trade: Trade = row.deserialize(Some(&headers))
-        //let trade: Trade = manual_deserialize_bytes(&row)
-        let trade: Trade = manual_deserialize_str(&row)
+        let trade: Trade = row.deserialize(Some(&headers))
            .map_err(|e| {
                format!("deserializing row failed: {}\n\nFailing row:\n{:?}", e, row)
            })?;
 
        n += 1;
 
+        if trade.ticker != t!(btc-usd) { continue }
+
        // verify data is sorted by time
-        debug_assert!(trade.time >= last_time);
+        assert!(trade.time >= last_time);
        last_time = trade.time;
 
+        if trade.time >= next_hour { // finalize last hour, and prepare for this hour
+            if n_bmex == 0 || n_gdax == 0 {
+                wtr.write_record(&[
+                    &format!("{}", cur_hour),
+                    "NaN",
+                    "NaN",
+                    "NaN",
+                    &format!("{}", n_bmex),
+                    &format!("{}", n_gdax),
+                    &format!("{}", bmex_amt),
+                    &format!("{}", gdax_amt),
+                ]).map_err(|e| format!("writing output row failed: {}", e))?;
+            } else {
+                let bmex_wt_avg = bmex_total / bmex_amt;
+                let gdax_wt_avg = gdax_total / gdax_amt;
+                let ratio = bmex_wt_avg / gdax_wt_avg;
+                wtr.write_record(&[
+                    &format!("{}", cur_hour),
+                    &format!("{}", ratio),
+                    &format!("{}", bmex_wt_avg),
+                    &format!("{}", gdax_wt_avg),
+                    &format!("{}", n_bmex),
+                    &format!("{}", n_gdax),
+                    &format!("{}", bmex_amt),
+                    &format!("{}", gdax_amt),
+                ]).map_err(|e| format!("writing output row failed: {}", e))?;
+            }
+            n_written += 1;
+
+            // reset state
+            bmex_total = 0.0;
+            bmex_amt = 0.0;
+            gdax_total = 0.0;
+            gdax_amt = 0.0;
+            n_bmex = 0;
+            n_gdax = 0;
+
+            cur_hour = next_hour;
+            next_hour += ONE_HOUR;
+
+            // if we are skipping hours in between the last and current row, we
+            // need to write a NaN row for the hours that had no data
+            while next_hour <= trade.time {
+                wtr.write_record(&[
+                    &format!("{}", cur_hour),
+                    "NaN",
+                    "NaN",
+                    "NaN",
+                    "0",
+                    "0",
+                    "0.0",
+                    "0.0",
+                ]).map_err(|e| format!("writing output row failed: {}", e))?;
+
+                n_written += 1;
+                cur_hour = next_hour;
+                next_hour += ONE_HOUR;
+            }
+        }
+
+        match trade.exch {
+            e!(bmex) => {
+                bmex_total += trade.price * trade.amount;
+                bmex_amt += trade.amount;
+                n_bmex += 1;
+            }
+
+            e!(gdax) => {
+                gdax_total += trade.price * trade.amount;
+                gdax_amt += trade.amount;
+                n_gdax += 1;
+            }
+
+            _ => {}
+        }
+
        if n % PROGRESS_EVERY == 0 || (cfg!(debug_assertions) && n % (1024 * 96) == 0) {
            info!(logger, "parsing csv file";
                "n rows" => %n.thousands_sep(),
+                "n written" => %n_written.thousands_sep(),
                "elapsed" => ?(Instant::now() - start),
            );
        }
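
A worked example of the hour bucketing and weighted-average arithmetic used in the loop above, as a standalone sketch with hypothetical values:

    const ONE_SECOND: u64 = 1_000_000_000;
    const ONE_HOUR: u64 = ONE_SECOND * 60 * 60;

    fn main() {
        // flooring a nanosecond timestamp to its containing hour
        let time: u64 = 3 * ONE_HOUR + 1_234 * ONE_SECOND;
        let cur_hour = time - time % ONE_HOUR;
        let next_hour = cur_hour + ONE_HOUR;
        assert_eq!(cur_hour, 3 * ONE_HOUR);
        assert_eq!(next_hour, 4 * ONE_HOUR);
        // a later trade at hour 6 would first finalize hour 3, then the
        // `while next_hour <= trade.time` loop emits NaN rows for hours 4 and 5

        // the per-hour value written as `bmex`/`gdax` is a volume-weighted
        // average price: sum(price * amount) / sum(amount)
        let total = 9000.0_f64 * 0.5 + 9010.0 * 1.5;
        let amt = 0.5_f64 + 1.5;
        assert!((total / amt - 9007.5).abs() < 1e-9);
    }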
@@ -202,12 +334,18 @@ fn run(start: Instant, logger: &slog::Logger) -> Result<usize, String> {
         if cfg!(debug_assertions) && n > PROGRESS_EVERY {
             warn!(logger, "debug mode: exiting early";
                 "n rows" => %n.thousands_sep(),
+                "n written" => %n_written.thousands_sep(),
                 "elapsed" => ?(Instant::now() - start),
             );
             break
         }
     }
 
+    // intentionally skipping the partial hour here
+    info!(logger, "finished parsing CSV/calculating query. closing output file");
+    drop(wtr);
+
     Ok(n)
 }


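One note on drop(wtr) above: dropping the csv::Writer (and the BufWriter under it) flushes any remaining buffered rows, but an I/O error during that final flush is discarded inside Drop. A more defensive variant, assuming the same wtr, would surface it:

    // flush explicitly so a failed final write surfaces as an error
    // instead of disappearing inside Drop
    wtr.flush().map_err(|e| format!("flushing output csv failed: {}", e))?;
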
@@ -222,7 +360,10 @@ fn main() {
     match run(start, &logger) {
         Ok(n) => {
             let took = Instant::now() - start;
-            info!(logger, "finished in {:?}", took;
+            let took_secs = took.as_millis() as f64 / 1000.0;
+            let took_str = format!("{}min, {:.1}sec", took.as_secs() / 60, took_secs % 60.0);
+
+            info!(logger, "finished in {}", took_str;
                 "n rows" => %n.thousands_sep(),
                 "rows/sec" => &((per_sec(n, took) * 100.0).round() / 10.0).thousands_sep(),
             );
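
A quick check of the new duration formatting with a hypothetical value: as_secs() / 60 yields whole minutes and took_secs % 60.0 the leftover seconds, so 125.3 seconds renders as "2min, 5.3sec".

    use std::time::Duration;

    fn main() {
        let took = Duration::from_millis(125_300); // hypothetical run time
        let took_secs = took.as_millis() as f64 / 1000.0;
        let took_str = format!("{}min, {:.1}sec", took.as_secs() / 60, took_secs % 60.0);
        assert_eq!(took_str, "2min, 5.3sec");
    }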

