|
- #![allow(unused_imports)]
- #![allow(unused_labels)]
-
- use std::str::FromStr;
- use std::time::{Instant, Duration};
- use std::{fs, io};
- use std::io::prelude::*;
- use std::str::from_utf8;
- use std::error::Error;
- use std::f64::NAN;
- use serde::{Serialize, Deserialize};
- use itertools_num::linspace;
- use std::collections::HashMap as Map;
-
/// Number of entries in [`LOGSPACE`].
const N: usize = 128;

/// Symmetric, roughly log-spaced table of signed time offsets, in ascending order.
///
/// The negative half mirrors the positive half. `main` only reads the first and
/// last entries, as the outer limits of the window it evaluates around each
/// event. Offsets are added to event timestamps, so they are presumably in
/// nanoseconds like those timestamps (confirm against the data).
const LOGSPACE: [i64; N] = [
    // negative offsets, largest magnitude first
    -2134300000000, -1854700000000, -1611800000000, -1400600000000,
    -1217200000000, -1057700000000, -919200000000, -798800000000,
    -694100000000, -603200000000, -524200000000, -455500000000,
    -395800000000, -344000000000, -298900000000, -259700000000,
    -225700000000, -196100000000, -170400000000, -148100000000,
    -128700000000, -111800000000, -97200000000, -84400000000,
    -73400000000, -63800000000, -55400000000, -48100000000,
    -41800000000, -36300000000, -31600000000, -27400000000,
    -23800000000, -20700000000, -18000000000, -15600000000,
    -13600000000, -11800000000, -10200000000, -8900000000,
    -7700000000, -6700000000, -5800000000, -5000000000,
    -4400000000, -3800000000, -3300000000, -2900000000,
    -2500000000, -2100000000, -1900000000, -1600000000,
    -1400000000, -1200000000, -1000000000, -900000000,
    -800000000, -700000000, -600000000, -500000000,
    -400000000, -300000000, -200000000, -100000000,
    // positive offsets, smallest magnitude first
    100000000, 200000000, 300000000, 400000000,
    500000000, 600000000, 700000000, 800000000,
    900000000, 1000000000, 1200000000, 1400000000,
    1600000000, 1900000000, 2100000000, 2500000000,
    2900000000, 3300000000, 3800000000, 4400000000,
    5000000000, 5800000000, 6700000000, 7700000000,
    8900000000, 10200000000, 11800000000, 13600000000,
    15600000000, 18000000000, 20700000000, 23800000000,
    27400000000, 31600000000, 36300000000, 41800000000,
    48100000000, 55400000000, 63800000000, 73400000000,
    84400000000, 97200000000, 111800000000, 128700000000,
    148100000000, 170400000000, 196100000000, 225700000000,
    259700000000, 298900000000, 344000000000, 395800000000,
    455500000000, 524200000000, 603200000000, 694100000000,
    798800000000, 919200000000, 1057700000000, 1217200000000,
    1400600000000, 1611800000000, 1854700000000, 2134300000000,
];
-
-
- #[derive(Deserialize)]
- struct Trade {
- pub time: i64,
- pub price: f64,
- pub amount: f64,
- }
-
- /// Use this to deserialize just the time column on the first pass through
- /// the events file.
- #[derive(Deserialize)]
- struct EventTime {
- pub time: i64,
- }
-
/// An event together with the per-bucket results computed for it.
#[derive(Debug, Clone, PartialEq)]
struct Event {
    /// Event timestamp, on the same clock as the trade timestamps.
    pub time: i64,
    /// One value per time bucket around `time`: the amount-weighted average
    /// trade price in that bucket, or NaN when no trade fell into it.
    pub data: Vec<f64>,
}
-
/// Converts a [`Duration`] into fractional seconds.
///
/// The whole-second part and the sub-second nanoseconds are converted to
/// `f64` separately and then added.
pub fn seconds(d: Duration) -> f64 {
    const NANOS_PER_SEC: f64 = 1_000_000_000.0;

    let whole = d.as_secs() as f64;
    let fraction = f64::from(d.subsec_nanos()) / NANOS_PER_SEC;
    whole + fraction
}
-
- fn main() -> Result<(), Box<dyn Error>> {
- let start = Instant::now();
- let args: clap::ArgMatches = clap::App::new("time-explorer")
- .version("0.1")
- .arg(clap::Arg::with_name("trades")
- .long("trades-csv")
- .short("t")
- .help("Path of csv with time (integer nanoseconds timestamp), \
- price (f64), and amount (f64) columns.")
- .takes_value(true)
- .required(true))
- .arg(clap::Arg::with_name("events")
- .long("events-csv")
- .short("e")
- .help("Path of csv file with a time (integer nanoseconds timestamp) as column 0, \
- along with any other metadata columns that will be included in results")
- .takes_value(true)
- .required(true))
- .arg(clap::Arg::with_name("output")
- .long("output-file")
- .short("o")
- .help("Path to save results csv to")
- .takes_value(true)
- .required(true))
- .arg(clap::Arg::with_name("verbose")
- .long("verbose")
- .short("v"))
- .arg(clap::Arg::with_name("n-periods")
- .long("n-periods")
- .short("n")
- .help("Controls how many time buckets are evaluated")
- .takes_value(true)
- .default_value("50"))
- .get_matches();
-
- let verbose = args.is_present("verbose");
-
- if verbose { println!("{:>8.2}s reading...", seconds(Instant::now() - start)); }
-
- let trades_csv = args.value_of("trades").unwrap();
- let events_csv = args.value_of("events").unwrap();
- let output = args.value_of("output").unwrap();
- let n: &str = args.value_of("n-periods").unwrap();
- let n: usize = usize::from_str(n)?;
-
- let trades_csv =
- fs::OpenOptions::new()
- .read(true)
- .open(trades_csv)?;
-
- let mut times: Vec<i64> = Vec::with_capacity(8192);
- let mut amounts: Vec<f64> = Vec::with_capacity(8192);
- let mut totals: Vec<f64> = Vec::with_capacity(8192);
-
- #[cfg(feature = "super-fast-csv-parsing")]
- {
- // lookout below! MANY unwraps in here
-
- // note: this code NOT part of original time-explorer. this code is what
- // I was referring to in the "fine print" note where it says "With 10
- // minutes work (knowing what I know today), I was able to get CSV parsing
- // down to 3.46sec"
-
- let mut rdr = csv::Reader::from_reader(io::BufReader::new(rdr));
- let headers = rdr.byte_headers().unwrap().clone();
- let mut row = csv::ByteRecord::new();
- let mut col_index: [usize; 3] = [
- headers.iter().position(|x| x == b"time").unwrap(),
- headers.iter().position(|x| x == b"amount").unwrap(),
- headers.iter().position(|x| x == b"price").unwrap(),
- ];
-
- while rdr.read_byte_record(&mut row).unwrap() {
- times.push(atoi::atoi(row.get(col_index[0]).unwrap()).unwrap());
-
- let amount: f64 = lexical::parse(row.get(col_index[1]).unwrap()).unwrap();
- let price: f64 = lexical::parse(row.get(col_index[2]).unwrap()).unwrap();
-
- totals.push(price * amount);
- amounts.push(amount);
- }
- }
-
- #[cfg(not(feature = "super-fast-csv-parsing"))]
- {
- // this is what was originally in time-explorer
-
- let mut trades: Vec<Trade> =
- csv::Reader::from_reader(trades_csv)
- .deserialize()
- .map(|x| x.unwrap())
- .collect();
-
- trades.sort_by_key(|k| k.time);
-
- for Trade { time, price, amount } in trades {
- times.push(time);
- totals.push(price * amount);
- amounts.push(amount);
- }
- }
-
- if verbose { println!("{:>8.2}s finished parsing trades csv (times.len() = {}) ...", seconds(Instant::now() - start), times.len()); }
-
- let mut events: Vec<Event> = {
- let events_csv =
- fs::OpenOptions::new()
- .read(true)
- .open(events_csv)?;
-
- csv::Reader::from_reader(events_csv)
- .deserialize()
- .map(|t: Result<EventTime, _>| {
- let EventTime { time } = t.unwrap();
- //let data = [0.0; N - 1];
- let data = vec![0.0; n - 1];
- Event { time, data }
- }).collect()
- };
-
- assert!(!events.is_empty());
-
- events.sort_by_key(|k| k.time);
-
- let mut cursor: usize = 0;
- let mut truncate_events = None;
-
- let buckets: Vec<i64> =
- linspace(LOGSPACE[0] as f64, LOGSPACE[N - 1] as f64, n)
- .map(|x| x as i64)
- .collect();
-
- if verbose { println!("{:>8.2}s calculating...", seconds(Instant::now() - start)); }
-
- let mut n_incomplete_buckets = 0;
- let mut n_skipped_buckets = 0;
- let mut n_time_buckets = 0;
-
- 'a: for (i, event) in events.iter_mut().enumerate() {
-
- let mut min_time: i64 = event.time + buckets[0];
- let mut max_time: i64 = event.time + buckets[1];
-
- 'oops: while times[cursor] > min_time && cursor > 0 { cursor -= 1; }
- n_incomplete_buckets += (times[cursor] > min_time) as usize;
- n_skipped_buckets += (times[cursor] > max_time) as usize;
-
- // find the beginning if there are gaps
- 'b: while times[cursor] < min_time {
- if cursor >= times.len() - 1 {
- truncate_events = Some(i);
- break 'a
- } else {
- cursor += 1
- }
- }
-
- let mut j: usize = cursor;
-
- 'c: for k in 0..(n - 2) {
- let mut wsum: f64 = 0.0;
- let mut w: f64 = 0.0;
-
- 'd: while j < times.len() - 1 && times[j] < max_time {
- wsum += totals[j];
- w += amounts[j];
- j += 1;
- }
-
- event.data[k] = if w > 0.0 { wsum / w } else { NAN };
-
- min_time = max_time;
- max_time = event.time + buckets[k + 2];
- n_time_buckets += 1;
- }
-
- if i % 512 == 0 {
- assert!(max_time > min_time);
- if verbose {
- //let n_nan = event.data.iter().filter(|x| !x.is_finite()).count();
- println!("{:>8.2}s No. {:>5} {:>12.2}, {:>12.2}, {:>12.2} ...", //, {:>12.2}, {:>12.2}, {:>12.2} ...",
- //cursor={}, j={}, times[cursor]={}, n_nan={}, max_time-min_time={}",
- seconds(Instant::now() - start), i,
- event.data[0], event.data[20], event.data[40]); //, event.data[60], event.data[80], event.data[100]);
- //min_time, max_time, cursor,
- //j, times[cursor], n_nan, max_time-min_time);
- }
- }
- }
-
- assert!(truncate_events.is_none()); // for now
-
- if verbose { println!("{:>8.2} writing... (n_time_buckets={}, n_incomplete_buckets={}, n_skipped_buckets={})", seconds(Instant::now() - start), n_time_buckets, n_incomplete_buckets, n_skipped_buckets); }
-
- // we have to read this again because I could not figure out ownership problems
- let events_csv =
- fs::OpenOptions::new()
- .read(true)
- .open(events_csv)?;
-
- let mut events_csv = csv::Reader::from_reader(events_csv);
-
- let output_csv =
- fs::OpenOptions::new()
- .write(true)
- .create(true)
- .truncate(true)
- .open(output)?;
-
- let mut wtr = csv::Writer::from_writer(output_csv);
-
- let data_cols: Vec<i64> = {
- let mut xs = vec![0; n - 1];
- for i in 0..(n - 1) {
- xs[i] = (buckets[i] + buckets[i + 1]) / 2;
- }
- xs
- };
-
- {
- let headers = events_csv.byte_headers()?;
- for col in headers.iter() {
- wtr.write_field(col)?;
- }
- for col in data_cols.iter() {
- wtr.write_field(&format!("{}", col))?;
- }
- wtr.write_record(None::<&[u8]>)?;
- }
-
- let mut record = csv::ByteRecord::new();
-
- for event in events {
- if !events_csv.read_byte_record(&mut record)? { panic!("failed to read from events csv") }
- for meta in record.iter() {
- wtr.write_field(meta)?;
- }
- for val in event.data.iter() {
- wtr.write_field(&format!("{}", val))?;
- }
- wtr.write_record(None::<&[u8]>)?;
- }
-
-
- if verbose { println!("{:>8.2} finished.", seconds(Instant::now() - start)); }
-
- Ok(())
- }
-
- /*
- def to_tframe(version, df, trades, start):
- d = {'bid': {}, 'ask': {}}
- cursor = 0
- n = 0
- n_periods = 40
- xs = np.concatenate([periods(n_periods)[:0:-1] * -1, periods(n_periods)]) * 1000000 # mult to convert to nanos
- mask = df['version'] == version
- #my_trades = sorted(list(zip(df.loc[mask].index, df.loc[mask, 'side'], df.loc[mask, 'gid'])))
- my_trades = sorted(list(zip(df.loc[mask].index.values.astype(np.int64), df.loc[mask, 'side'], df.loc[mask, 'gid'])))
- #idx = trades.index
- idx = trades.index.values.astype(np.int64)
- amts = trades['amount']
- totals = trades['total']
- assert len(idx) == len(amts)
- assert len(idx) == len(totals)
- for tm, side, gid in my_trades:
- print '{} to_tfame {} {} (cursor = {})'.format(time.time() - start, version, n, cursor)
- #min_time = tm + timedelta(milliseconds=xs[0])
- #max_time = tm + timedelta(milliseconds=xs[1])
- min_time = tm + xs[0]
- max_time = tm + xs[1]
- if idx[cursor] > min_time:
- print 'warning: idx[cursor] ({}) > min_time ({})'.format(idx[cursor], min_time)
- while idx[cursor] > min_time and cursor > 0:
- cursor -= 1
- else:
- while idx[cursor] < min_time and cursor < len(idx) - 1:
- cursor += 1
- i = 1
- j = cursor
- d[side][gid] = {}
- while i < len(xs) - 1:
- wsum = 0.0
- w = 0.0
- while idx[j] < max_time:
- wsum += totals[j]
- w += amts[j]
- j += 1
- if w > 0.0:
- d[side][gid][xs[i]] = wsum / w
- else:
- d[side][gid][xs[i]] = np.nan
- i += 1
- min_time = max_time
- #max_time = tm + timedelta(milliseconds=xs[i])
- max_time = tm + xs[i]
- n += 1
- d['bid'] = sort_cols(pd.DataFrame.from_dict(d['bid'], orient='index'))
- d['ask'] = sort_cols(pd.DataFrame.from_dict(d['ask'], orient='index'))
- #yield (version, d)
- return d
- */
|