You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

verifying-hard-query-output-of-reference-impl.ipynb 16KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 1,
  6. "metadata": {},
  7. "outputs": [
  8. {
  9. "name": "stdout",
  10. "output_type": "stream",
  11. "text": [
  12. "CPU times: user 1min 8s, sys: 7.72 s, total: 1min 16s\n",
  13. "Wall time: 1min 16s\n",
  14. "<class 'pandas.core.frame.DataFrame'>\n",
  15. "RangeIndex: 92331988 entries, 0 to 92331987\n",
  16. "Data columns (total 7 columns):\n",
  17. "time int64\n",
  18. "amount float64\n",
  19. "exch object\n",
  20. "price float64\n",
  21. "server_time int64\n",
  22. "side object\n",
  23. "ticker object\n",
  24. "dtypes: float64(2), int64(2), object(3)\n",
  25. "memory usage: 4.8+ GB\n"
  26. ]
  27. }
  28. ],
  29. "source": [
  30. "%time df = pd.read_csv('/xfs/sample.csv')\n",
  31. "df.info()"
  32. ]
  33. },
  34. {
  35. "cell_type": "code",
  36. "execution_count": 2,
  37. "metadata": {},
  38. "outputs": [
  39. {
  40. "data": {
  41. "text/html": [
  42. "<div>\n",
  43. "<style scoped>\n",
  44. " .dataframe tbody tr th:only-of-type {\n",
  45. " vertical-align: middle;\n",
  46. " }\n",
  47. "\n",
  48. " .dataframe tbody tr th {\n",
  49. " vertical-align: top;\n",
  50. " }\n",
  51. "\n",
  52. " .dataframe thead th {\n",
  53. " text-align: right;\n",
  54. " }\n",
  55. "</style>\n",
  56. "<table border=\"1\" class=\"dataframe\">\n",
  57. " <thead>\n",
  58. " <tr style=\"text-align: right;\">\n",
  59. " <th></th>\n",
  60. " <th>time</th>\n",
  61. " <th>amount</th>\n",
  62. " <th>exch</th>\n",
  63. " <th>price</th>\n",
  64. " <th>server_time</th>\n",
  65. " <th>side</th>\n",
  66. " <th>ticker</th>\n",
  67. " </tr>\n",
  68. " </thead>\n",
  69. " <tbody>\n",
  70. " <tr>\n",
  71. " <td>0</td>\n",
  72. " <td>1561939200002479372</td>\n",
  73. " <td>1.4894</td>\n",
  74. " <td>bnce</td>\n",
  75. " <td>292.7000</td>\n",
  76. " <td>1561939199919000064</td>\n",
  77. " <td>NaN</td>\n",
  78. " <td>eth_usd</td>\n",
  79. " </tr>\n",
  80. " <tr>\n",
  81. " <td>1</td>\n",
  82. " <td>1561939200011035644</td>\n",
  83. " <td>0.0833</td>\n",
  84. " <td>btfx</td>\n",
  85. " <td>10809.0000</td>\n",
  86. " <td>1561939199927000064</td>\n",
  87. " <td>bid</td>\n",
  88. " <td>btc_usd</td>\n",
  89. " </tr>\n",
  90. " <tr>\n",
  91. " <td>2</td>\n",
  92. " <td>1561939200011055712</td>\n",
  93. " <td>0.8333</td>\n",
  94. " <td>btfx</td>\n",
  95. " <td>10809.0000</td>\n",
  96. " <td>1561939199927000064</td>\n",
  97. " <td>bid</td>\n",
  98. " <td>btc_usd</td>\n",
  99. " </tr>\n",
  100. " <tr>\n",
  101. " <td>3</td>\n",
  102. " <td>1561939200019037617</td>\n",
  103. " <td>0.0831</td>\n",
  104. " <td>bnce</td>\n",
  105. " <td>10854.1000</td>\n",
  106. " <td>1561939199935000064</td>\n",
  107. " <td>NaN</td>\n",
  108. " <td>btc_usd</td>\n",
  109. " </tr>\n",
  110. " <tr>\n",
  111. " <td>4</td>\n",
  112. " <td>1561939200026450471</td>\n",
  113. " <td>0.1250</td>\n",
  114. " <td>okex</td>\n",
  115. " <td>123.2100</td>\n",
  116. " <td>1561939200026450432</td>\n",
  117. " <td>ask</td>\n",
  118. " <td>ltc_usd</td>\n",
  119. " </tr>\n",
  120. " </tbody>\n",
  121. "</table>\n",
  122. "</div>"
  123. ],
  124. "text/plain": [
  125. " time amount exch price server_time side \\\n",
  126. "0 1561939200002479372 1.4894 bnce 292.7000 1561939199919000064 NaN \n",
  127. "1 1561939200011035644 0.0833 btfx 10809.0000 1561939199927000064 bid \n",
  128. "2 1561939200011055712 0.8333 btfx 10809.0000 1561939199927000064 bid \n",
  129. "3 1561939200019037617 0.0831 bnce 10854.1000 1561939199935000064 NaN \n",
  130. "4 1561939200026450471 0.1250 okex 123.2100 1561939200026450432 ask \n",
  131. "\n",
  132. " ticker \n",
  133. "0 eth_usd \n",
  134. "1 btc_usd \n",
  135. "2 btc_usd \n",
  136. "3 btc_usd \n",
  137. "4 ltc_usd "
  138. ]
  139. },
  140. "execution_count": 2,
  141. "metadata": {},
  142. "output_type": "execute_result"
  143. }
  144. ],
  145. "source": [
  146. "df.head()"
  147. ]
  148. },
  149. {
  150. "cell_type": "code",
  151. "execution_count": 7,
  152. "metadata": {},
  153. "outputs": [
  154. {
  155. "data": {
  156. "text/plain": [
  157. "(True, True, True)"
  158. ]
  159. },
  160. "execution_count": 7,
  161. "metadata": {},
  162. "output_type": "execute_result"
  163. }
  164. ],
  165. "source": [
  166. "SECOND = int(1e9)\n",
  167. "\n",
  168. "example_time = 1567295920000000000\n",
  169. "\n",
  170. "last_5min = (df['time'] > example_time - SECOND * 60 * 5) & (df['time'] <= example_time)\n",
  171. "last_15min = (df['time'] > example_time - SECOND * 60 * 15) & (df['time'] <= example_time)\n",
  172. "last_60min = (df['time'] > example_time - SECOND * 60 * 60) & (df['time'] <= example_time)\n",
  173. "of_btc_usd = df['ticker'] == 'btc_usd'\n",
  174. "of_gdax = df['exch'] == 'gdax'\n",
  175. "of_bmex = df['exch'] == 'bmex'\n",
  176. "\n",
  177. "g5 = last_5min & of_btc_usd & of_gdax\n",
  178. "b5 = last_5min & of_btc_usd & of_bmex\n",
  179. "g15 = last_15min & of_btc_usd & of_gdax\n",
  180. "b15 = last_15min & of_btc_usd & of_bmex\n",
  181. "g60 = last_60min & of_btc_usd & of_gdax\n",
  182. "b60 = last_60min & of_btc_usd & of_bmex\n",
  183. "\n",
  184. "ratio_5min = ((df.loc[b5, 'price'] * df.loc[b5, 'amount']).sum() / df.loc[b5, 'amount'].sum()) / ((df.loc[g5, 'price'] * df.loc[g5, 'amount']).sum() / df.loc[g5, 'amount'].sum())\n",
  185. "ratio_15min = ((df.loc[b15, 'price'] * df.loc[b15, 'amount']).sum() / df.loc[b15, 'amount'].sum()) / ((df.loc[g15, 'price'] * df.loc[g15, 'amount']).sum() / df.loc[g15, 'amount'].sum())\n",
  186. "ratio_60min = ((df.loc[b60, 'price'] * df.loc[b60, 'amount']).sum() / df.loc[b60, 'amount'].sum()) / ((df.loc[g60, 'price'] * df.loc[g60, 'amount']).sum() / df.loc[g60, 'amount'].sum())\n",
  187. "\n",
  188. "abs(ratio_5min - 1.000474060563638) < 1e-6, abs(ratio_15min - 1.0005019306061411) < 1e-6, abs(ratio_60min - 1.0002338013889658) < 1e-6"
  189. ]
  190. },
  191. {
  192. "cell_type": "code",
  193. "execution_count": 8,
  194. "metadata": {},
  195. "outputs": [
  196. {
  197. "data": {
  198. "text/html": [
  199. "<div>\n",
  200. "<style scoped>\n",
  201. " .dataframe tbody tr th:only-of-type {\n",
  202. " vertical-align: middle;\n",
  203. " }\n",
  204. "\n",
  205. " .dataframe tbody tr th {\n",
  206. " vertical-align: top;\n",
  207. " }\n",
  208. "\n",
  209. " .dataframe thead th {\n",
  210. " text-align: right;\n",
  211. " }\n",
  212. "</style>\n",
  213. "<table border=\"1\" class=\"dataframe\">\n",
  214. " <thead>\n",
  215. " <tr style=\"text-align: right;\">\n",
  216. " <th></th>\n",
  217. " <th>time</th>\n",
  218. " <th>last</th>\n",
  219. " <th>bmex_5min</th>\n",
  220. " <th>gdax_5min</th>\n",
  221. " <th>n_bmex_p5</th>\n",
  222. " <th>n_gdax_p5</th>\n",
  223. " <th>r5</th>\n",
  224. " <th>r15</th>\n",
  225. " <th>r60</th>\n",
  226. " </tr>\n",
  227. " </thead>\n",
  228. " <tbody>\n",
  229. " <tr>\n",
  230. " <td>0</td>\n",
  231. " <td>1561939210000000000</td>\n",
  232. " <td>10758.5800</td>\n",
  233. " <td>10760.7205</td>\n",
  234. " <td>10760.4593</td>\n",
  235. " <td>22</td>\n",
  236. " <td>28</td>\n",
  237. " <td>1.0000</td>\n",
  238. " <td>1.0000</td>\n",
  239. " <td>1.0000</td>\n",
  240. " </tr>\n",
  241. " <tr>\n",
  242. " <td>1</td>\n",
  243. " <td>1561939220000000000</td>\n",
  244. " <td>10770.0000</td>\n",
  245. " <td>10763.6811</td>\n",
  246. " <td>10761.2528</td>\n",
  247. " <td>230</td>\n",
  248. " <td>75</td>\n",
  249. " <td>1.0002</td>\n",
  250. " <td>1.0002</td>\n",
  251. " <td>1.0002</td>\n",
  252. " </tr>\n",
  253. " <tr>\n",
  254. " <td>2</td>\n",
  255. " <td>1561939230000000000</td>\n",
  256. " <td>10758.0100</td>\n",
  257. " <td>10761.8843</td>\n",
  258. " <td>10760.1596</td>\n",
  259. " <td>418</td>\n",
  260. " <td>120</td>\n",
  261. " <td>1.0002</td>\n",
  262. " <td>1.0002</td>\n",
  263. " <td>1.0002</td>\n",
  264. " </tr>\n",
  265. " <tr>\n",
  266. " <td>3</td>\n",
  267. " <td>1561939240000000000</td>\n",
  268. " <td>10752.5000</td>\n",
  269. " <td>10757.6829</td>\n",
  270. " <td>10760.3630</td>\n",
  271. " <td>507</td>\n",
  272. " <td>147</td>\n",
  273. " <td>0.9998</td>\n",
  274. " <td>0.9998</td>\n",
  275. " <td>0.9998</td>\n",
  276. " </tr>\n",
  277. " <tr>\n",
  278. " <td>4</td>\n",
  279. " <td>1561939250000000000</td>\n",
  280. " <td>10772.6900</td>\n",
  281. " <td>10757.5702</td>\n",
  282. " <td>10763.0840</td>\n",
  283. " <td>537</td>\n",
  284. " <td>191</td>\n",
  285. " <td>0.9995</td>\n",
  286. " <td>0.9995</td>\n",
  287. " <td>0.9995</td>\n",
  288. " </tr>\n",
  289. " </tbody>\n",
  290. "</table>\n",
  291. "</div>"
  292. ],
  293. "text/plain": [
  294. " time last bmex_5min gdax_5min n_bmex_p5 n_gdax_p5 \\\n",
  295. "0 1561939210000000000 10758.5800 10760.7205 10760.4593 22 28 \n",
  296. "1 1561939220000000000 10770.0000 10763.6811 10761.2528 230 75 \n",
  297. "2 1561939230000000000 10758.0100 10761.8843 10760.1596 418 120 \n",
  298. "3 1561939240000000000 10752.5000 10757.6829 10760.3630 507 147 \n",
  299. "4 1561939250000000000 10772.6900 10757.5702 10763.0840 537 191 \n",
  300. "\n",
  301. " r5 r15 r60 \n",
  302. "0 1.0000 1.0000 1.0000 \n",
  303. "1 1.0002 1.0002 1.0002 \n",
  304. "2 1.0002 1.0002 1.0002 \n",
  305. "3 0.9998 0.9998 0.9998 \n",
  306. "4 0.9995 0.9995 0.9995 "
  307. ]
  308. },
  309. "execution_count": 8,
  310. "metadata": {},
  311. "output_type": "execute_result"
  312. }
  313. ],
  314. "source": [
  315. "ref = pd.read_csv('../var/hard.csv')\n",
  316. "ref.head()"
  317. ]
  318. },
  319. {
  320. "cell_type": "code",
  321. "execution_count": 12,
  322. "metadata": {},
  323. "outputs": [
  324. {
  325. "data": {
  326. "text/plain": [
  327. "array([1567295750000000000, 1567295760000000000, 1567295770000000000, 1567295780000000000, 1567295790000000000, 1567295800000000000, 1567295810000000000, 1567295820000000000, 1567295830000000000,\n",
  328. " 1567295840000000000, 1567295850000000000, 1567295860000000000, 1567295870000000000, 1567295880000000000, 1567295890000000000, 1567295900000000000, 1567295910000000000, 1567295920000000000,\n",
  329. " 1567295930000000000, 1567295940000000000, 1567295950000000000, 1567295960000000000, 1567295970000000000, 1567295980000000000, 1567295990000000000])"
  330. ]
  331. },
  332. "execution_count": 12,
  333. "metadata": {},
  334. "output_type": "execute_result"
  335. }
  336. ],
  337. "source": [
  338. "ref['time'].tail(25).values"
  339. ]
  340. },
  341. {
  342. "cell_type": "code",
  343. "execution_count": 13,
  344. "metadata": {},
  345. "outputs": [
  346. {
  347. "name": "stdout",
  348. "output_type": "stream",
  349. "text": [
  350. "finished in 487.8sec\n"
  351. ]
  352. }
  353. ],
  354. "source": [
  355. "import time\n",
  356. "\n",
  357. "start = time.time()\n",
  358. "rows = []\n",
  359. "\n",
  360. "for example_time in ref['time'].tail(25).values:\n",
  361. " last_5min = (df['time'] > example_time - SECOND * 60 * 5) & (df['time'] <= example_time)\n",
  362. " last_15min = (df['time'] > example_time - SECOND * 60 * 15) & (df['time'] <= example_time)\n",
  363. " last_60min = (df['time'] > example_time - SECOND * 60 * 60) & (df['time'] <= example_time)\n",
  364. " of_btc_usd = df['ticker'] == 'btc_usd'\n",
  365. " of_gdax = df['exch'] == 'gdax'\n",
  366. " of_bmex = df['exch'] == 'bmex'\n",
  367. "\n",
  368. " g5 = last_5min & of_btc_usd & of_gdax\n",
  369. " b5 = last_5min & of_btc_usd & of_bmex\n",
  370. " g15 = last_15min & of_btc_usd & of_gdax\n",
  371. " b15 = last_15min & of_btc_usd & of_bmex\n",
  372. " g60 = last_60min & of_btc_usd & of_gdax\n",
  373. " b60 = last_60min & of_btc_usd & of_bmex\n",
  374. "\n",
  375. " ratio_5min = ((df.loc[b5, 'price'] * df.loc[b5, 'amount']).sum() / df.loc[b5, 'amount'].sum()) / ((df.loc[g5, 'price'] * df.loc[g5, 'amount']).sum() / df.loc[g5, 'amount'].sum())\n",
  376. " ratio_15min = ((df.loc[b15, 'price'] * df.loc[b15, 'amount']).sum() / df.loc[b15, 'amount'].sum()) / ((df.loc[g15, 'price'] * df.loc[g15, 'amount']).sum() / df.loc[g15, 'amount'].sum())\n",
  377. " ratio_60min = ((df.loc[b60, 'price'] * df.loc[b60, 'amount']).sum() / df.loc[b60, 'amount'].sum()) / ((df.loc[g60, 'price'] * df.loc[g60, 'amount']).sum() / df.loc[g60, 'amount'].sum())\n",
  378. " rows.append(dict(example_time=example_time, r5=ratio_5min, r15=ratio_15min, r60=ratio_60min))\n",
  379. " \n",
  380. "took = time.time() - start\n",
  381. "print('finished in {:.1f}sec'.format(took))"
  382. ]
  383. },
  384. {
  385. "cell_type": "code",
  386. "execution_count": 19,
  387. "metadata": {},
  388. "outputs": [
  389. {
  390. "data": {
  391. "text/plain": [
  392. "3.342554545733013"
  393. ]
  394. },
  395. "execution_count": 19,
  396. "metadata": {},
  397. "output_type": "execute_result"
  398. }
  399. ],
  400. "source": [
  401. "hypothetical_full_took = (took / 25) * 5401808\n",
  402. "hypothetical_full_took / 60 / 60 / 24 / 365"
  403. ]
  404. },
  405. {
  406. "cell_type": "code",
  407. "execution_count": 20,
  408. "metadata": {},
  409. "outputs": [
  410. {
  411. "data": {
  412. "text/plain": [
  413. "0.1016643329480867"
  414. ]
  415. },
  416. "execution_count": 20,
  417. "metadata": {},
  418. "output_type": "execute_result"
  419. }
  420. ],
  421. "source": [
  422. "92331988 / 908204336"
  423. ]
  424. },
  425. {
  426. "cell_type": "code",
  427. "execution_count": 21,
  428. "metadata": {},
  429. "outputs": [
  430. {
  431. "data": {
  432. "text/plain": [
  433. "19.513984975814818"
  434. ]
  435. },
  436. "execution_count": 21,
  437. "metadata": {},
  438. "output_type": "execute_result"
  439. }
  440. ],
  441. "source": [
  442. "took / 25"
  443. ]
  444. },
  445. {
  446. "cell_type": "code",
  447. "execution_count": 24,
  448. "metadata": {},
  449. "outputs": [
  450. {
  451. "data": {
  452. "text/plain": [
  453. "('105,410,800.2', 105410800.15423629)"
  454. ]
  455. },
  456. "execution_count": 24,
  457. "metadata": {},
  458. "output_type": "execute_result"
  459. }
  460. ],
  461. "source": [
  462. "'{:,.1f}'.format(hypothetical_full_took), hypothetical_full_took"
  463. ]
  464. },
  465. {
  466. "cell_type": "code",
  467. "execution_count": 32,
  468. "metadata": {},
  469. "outputs": [
  470. {
  471. "data": {
  472. "text/plain": [
  473. "r5_delta 0.000000000002368\n",
  474. "r15_delta 0.000000000010704\n",
  475. "r60_delta 0.000000000005513\n",
  476. "dtype: object"
  477. ]
  478. },
  479. "execution_count": 32,
  480. "metadata": {},
  481. "output_type": "execute_result"
  482. }
  483. ],
  484. "source": [
  485. "(pd.DataFrame(rows).join(ref.set_index('time'), on='example_time', rsuffix='_rust')\n",
  486. " .assign(r5_delta=lambda df: abs(df['r5'] - df['r5_rust']))\n",
  487. " .assign(r15_delta=lambda df: abs(df['r15'] - df['r15_rust']))\n",
  488. " .assign(r60_delta=lambda df: abs(df['r60'] - df['r60_rust']))\n",
  489. ")[['r5_delta','r15_delta','r60_delta']].max(axis=0).map(lambda x: '{:.15f}'.format(x))"
  490. ]
  491. }
  492. ],
  493. "metadata": {
  494. "kernelspec": {
  495. "display_name": "Python 3",
  496. "language": "python",
  497. "name": "python3"
  498. },
  499. "language_info": {
  500. "codemirror_mode": {
  501. "name": "ipython",
  502. "version": 3
  503. },
  504. "file_extension": ".py",
  505. "mimetype": "text/x-python",
  506. "name": "python",
  507. "nbconvert_exporter": "python",
  508. "pygments_lexer": "ipython3",
  509. "version": "3.7.5"
  510. }
  511. },
  512. "nbformat": 4,
  513. "nbformat_minor": 2
  514. }