|
| 1 | +use std::fs::File; |
| 2 | +use std::sync::{Arc, Mutex}; |
| 3 | +use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; |
| 4 | +use parquet::file::reader::{FileReader, SerializedFileReader}; |
| 5 | +use tracing::{debug, info}; |
| 6 | +use crate::backtest::data::{Data, DataPtr, POD}; |
| 7 | +use crate::prelude::Event; |
| 8 | + |
| 9 | +use super::NpyDTyped; |
| 10 | +use arrow_array::{UInt64Array, Int64Array, Float64Array}; |
| 11 | +use rayon::prelude::*; |
| 12 | + |
| 13 | + |
| 14 | +pub fn read_parquet_file<D: NpyDTyped + Clone>(filepath: &str) -> std::io::Result<Data<D>> { |
| 15 | + let batch_size = 1024 * 1024; |
| 16 | + let events_capacity = 150_000_000; |
| 17 | + |
| 18 | + let file = File::open(filepath)?; |
| 19 | + let builder = ParquetRecordBatchReaderBuilder::try_new(file) |
| 20 | + .unwrap() |
| 21 | + .with_batch_size(batch_size); |
| 22 | + let reader = builder.build().unwrap(); |
| 23 | + let events = Arc::new(Mutex::new(Vec::with_capacity(events_capacity))); |
| 24 | + |
| 25 | + // If we use parallel loading here, we need to re-sort by exch_ts in order. |
| 26 | + // This is because exch_ts and local_ts are sorted in chronological order. |
| 27 | + reader.into_iter().par_bridge().for_each(|maybe_batch| { |
| 28 | + let batch = maybe_batch.unwrap(); |
| 29 | + |
| 30 | + let ev_col = batch.column(0).as_any().downcast_ref::<UInt64Array>().unwrap(); |
| 31 | + let exch_ts_col = batch.column(1).as_any().downcast_ref::<Int64Array>().unwrap(); |
| 32 | + let local_ts_col = batch.column(2).as_any().downcast_ref::<Int64Array>().unwrap(); |
| 33 | + let px_col = batch.column(3).as_any().downcast_ref::<Float64Array>().unwrap(); |
| 34 | + let qty_col = batch.column(4).as_any().downcast_ref::<Float64Array>().unwrap(); |
| 35 | + let order_id_col = batch.column(5).as_any().downcast_ref::<UInt64Array>().unwrap(); |
| 36 | + let ival_col = batch.column(6).as_any().downcast_ref::<Int64Array>().unwrap(); |
| 37 | + let fval_col = batch.column(7).as_any().downcast_ref::<Float64Array>().unwrap(); |
| 38 | + |
| 39 | + let mut local_events: Vec<Event> = Vec::with_capacity(batch.num_rows()); |
| 40 | + for row in 0..batch.num_rows() { |
| 41 | + local_events.push(Event { |
| 42 | + ev: ev_col.value(row), |
| 43 | + exch_ts: exch_ts_col.value(row), |
| 44 | + local_ts: local_ts_col.value(row), |
| 45 | + px: px_col.value(row), |
| 46 | + qty: qty_col.value(row), |
| 47 | + order_id: order_id_col.value(row), |
| 48 | + ival: ival_col.value(row), |
| 49 | + fval: fval_col.value(row), |
| 50 | + }); |
| 51 | + } |
| 52 | + debug!("Read {} events", local_events.len()); |
| 53 | + let mut events = events.lock().unwrap(); |
| 54 | + events.extend(local_events); |
| 55 | + }); |
| 56 | + |
| 57 | + let mut events = events.lock().unwrap(); |
| 58 | + events.par_sort_by_key(|event| event.exch_ts); |
| 59 | + let data_ptr = DataPtr::new(events.len() * std::mem::size_of::<D>()); |
| 60 | + |
| 61 | + // Copy events to DataPtr |
| 62 | + unsafe { |
| 63 | + std::ptr::copy_nonoverlapping( |
| 64 | + events.as_ptr() as *const u8, |
| 65 | + data_ptr.ptr as *mut u8, |
| 66 | + events.len() * std::mem::size_of::<D>() |
| 67 | + ); |
| 68 | + } |
| 69 | + |
| 70 | + let data = unsafe { Data::from_data_ptr(data_ptr, 0) }; |
| 71 | + Ok(data) |
| 72 | +} |
0 commit comments