Skip to content
This repository was archived by the owner on Feb 5, 2025. It is now read-only.

Commit a610912

Browse files
committed
Clean up and document the API
1 parent abbd033 commit a610912

7 files changed

Lines changed: 81 additions & 57 deletions

File tree

Source/santad/Logs/EndpointSecurity/ParquetLogger/column_builder.rs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ pub struct ColumnBuilder {
1818
}
1919

2020
impl ColumnBuilder {
21+
/// Create a new column builder. Providing invalid options will result in
22+
/// failures on push, not immediately.
2123
pub fn new(
2224
page_size: usize,
2325
descriptor: Descriptor,
@@ -31,6 +33,12 @@ impl ColumnBuilder {
3133
}
3234
}
3335

36+
/// Compress and return the buffered pages so they can be written to a file.
37+
///
38+
/// WARNING: due to parquet2's iterator-centric design, it's necessary to
39+
/// drain this iterator before calling push again. Otherwise, it's undefined
40+
/// which row group the newly written data will end up, and it could even be
41+
/// dropped.
3442
pub fn drain<'a>(&'a mut self) -> DynStreamingIterator<'a, CompressedPage, Error> {
3543
let pages: Vec<Result<Page>> = self
3644
.pages
@@ -45,10 +53,16 @@ impl ColumnBuilder {
4553
DynStreamingIterator::new(compressor)
4654
}
4755

56+
/// Append the value to the most recent page. If the page is full, create a
57+
/// new one.
4858
pub fn push(&mut self, value: Value) -> Result<()> {
4959
self.page_builder(value.dyn_size())?.push(value)
5060
}
5161

62+
/// Return the most recent, partially built page. If the page can't fit the
63+
/// size_hint without going over page_size, a new page is created.
64+
///
65+
/// This call can only fail if the schema is invalid.
5266
pub fn page_builder(&mut self, size_hint: usize) -> Result<&mut PageBuilder> {
5367
let last_page = match self.pages.last_mut() {
5468
Some(page) => page,
@@ -71,10 +85,13 @@ impl ColumnBuilder {
7185
Ok(self.pages.last_mut().unwrap())
7286
}
7387

88+
/// Returns the current size of the column in bytes, as a sum of the sizes
89+
/// of all buffered pages.
7490
pub fn size(&self) -> usize {
7591
self.pages.iter().map(|page| page.size()).sum()
7692
}
7793

94+
/// Returns the current number of buffered values.
7895
pub fn count(&self) -> usize {
7996
self.pages.iter().map(|page| page.count()).sum()
8097
}

Source/santad/Logs/EndpointSecurity/ParquetLogger/cpp_api.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -135,10 +135,11 @@ fn table_push_string(table: &mut Table, column_no: usize, value: &CxxString) ->
135135
table.push(column_no, Value::Bytes(value.as_bytes()))
136136
}
137137

138-
fn table_flush(table: &mut Table) -> Result<(usize), Error> {
138+
fn table_flush(table: &mut Table) -> Result<usize, Error> {
139139
table.flush()
140140
}
141141

142-
fn table_end(mut table: Box<Table>) -> Result<u64, Error> {
143-
table.end()
142+
fn table_end(table: Box<Table>) -> Result<u64, Error> {
143+
let (n, _writer) = table.end()?;
144+
Ok(n)
144145
}

Source/santad/Logs/EndpointSecurity/ParquetLogger/page_builder.rs

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@ use parquet2::{
1111
};
1212
use std::cmp::PartialOrd;
1313

14-
// A page builder serializes primitive values into bytes and appends them to a
15-
// page (buffer). Implementations are provided for NativeType and &[u8] (byte
16-
// array).
14+
/// A page builder serializes primitive values into bytes and appends them to a
15+
/// page (buffer). Implementations are provided for NativeType and &[u8] (byte
16+
/// array).
1717
pub struct PageBuilder {
1818
page_builder: InnerBuilder,
1919
}
@@ -93,11 +93,8 @@ enum InnerBuilder {
9393
F64(NativePage<f64>),
9494
}
9595

96-
// A page builder serializes primitive values into bytes and appends them to a
97-
// page (buffer). Implementations are provided for NativeType and &[u8] (byte
98-
// array).
99-
100-
// Builds a page of variable length by arrays. Used for strings and other blobs.
96+
/// Builds a page of variable length by arrays. Used for strings and other
97+
/// blobs.
10198
pub struct ByteArrayPage {
10299
buffer: Vec<u8>,
103100
count: usize,
@@ -151,8 +148,8 @@ impl ByteArrayPage {
151148
}
152149
}
153150

154-
// A page of numbers using plain encoding. This is implemented (and fast) for
155-
// most native numeric types. (Int96 isn't used at the moment.)
151+
/// A page of numbers using plain encoding. This is implemented (and fast) for
152+
/// most native numeric types. (Int96 isn't used at the moment.)
156153
pub struct NativePage<T: NativeType + PartialOrd> {
157154
buffer: Vec<u8>,
158155
count: usize,

Source/santad/Logs/EndpointSecurity/ParquetLogger/parquet_logger.rs

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,48 +1,48 @@
11
//! This package provides an opinionated API for producing a Parquet file
22
//! containing a simple table. It's intended to be easy to use from both Rust
33
//! and C++ code, and uses Cxx to expose a C++ API. (See cpp_api.rs.)
4-
//!
4+
//!
55
//! We take the following simplifying assumptions:
6-
//!
6+
//!
77
//! * All fields are always required (no NULLs).
88
//! * All fields are simple types: integers, floats and strings.
99
//! * All files are brotli-compressed.
10-
//!
10+
//!
1111
//! The API provides reasonable defaults for many of the knobs Parquet exposes,
1212
//! and doesn't allow overriding most of them. This is intentional - the goal is
1313
//! to be as simple to use as possible.
14-
//!
14+
//!
1515
//! To get started from C++, look at cpp_api.rs. To get started from Rust, look
1616
//! at the Table type in table.rs.
17-
//!
17+
//!
1818
//! # Implementation Notes
19-
//!
19+
//!
2020
//! The API is implemented on top of parquet2, a minimal reimplamentation of the
2121
//! official arrow crate. We chose parquet2 for its simplicity, compilation
2222
//! speed and lack of unsafe code. (The official arrow project is extremely
2323
//! large and depends on Boost in C++.)
24-
//!
24+
//!
2525
//! The code structure roughly mirrors that of a parquet file:
26-
//!
26+
//!
2727
//! * Table: represents a parquet file, which consists of one or more row
2828
//! groups.
2929
//! * ColumnBuilder: represents a column chunk in a row group.
3030
//! * PageBuilder: represents a data page in a column chunk.
3131
//! * Value: represents a single scalar (number or byte blob) in a data page.
32-
//!
32+
//!
3333
//! Correctness, including of types, is enforced at runtime. Value, rather than
3434
//! being a generic type, is an enumeration (discriminated union) that can hold
3535
//! any of the supported types. This is done for two reasons:
36-
//!
36+
//!
3737
//! 1. It makes the code eaiser to understand - multiple layers of generic
3838
//! traits are required for static type checking of column chunks and pages.
3939
//! 2. The Table type must expose a runtime-generic way of setting a cell in a
4040
//! column, and this is the most common way of using the API, so any savings
4141
//! gained from static type checking would be bypassed by the most common
4242
//! code path anyway.
43-
//!
43+
//!
4444
//! # Future Work
45-
//!
45+
//!
4646
//! * Support fixed-length byte arrays.
4747
//! * Reimplement FileWriter to use an arena-style buffer instead of nested
4848
//! iterators.
@@ -67,6 +67,9 @@ pub extern "C" fn parquet2_1337_bloom_filter_contains(x: i64) -> bool {
6767
bloom_filter::is_in_set(&bits, bloom_filter::hash_native(x))
6868
}
6969

70+
// This is the main public API.
71+
pub use crate::table::Table;
72+
7073
#[cfg(test)]
7174
mod test {
7275
use crate::{
@@ -89,7 +92,6 @@ mod test {
8992
version: Version::V1,
9093
},
9194
compression_options: CompressionOptions::Brotli(Some(BrotliLevel::try_new(5).unwrap())),
92-
// compression_options: CompressionOptions::Uncompressed,
9395
page_size: 1024,
9496
};
9597

@@ -117,9 +119,8 @@ mod test {
117119
.expect("push failed");
118120
}
119121
table.flush().expect("flush failed");
120-
table.end().expect("end failed");
122+
let (_, writer) = table.end().expect("end failed");
121123

122-
let (_schema, writer, _options) = table.into_inner();
123124
let writer = if let Writer::Memory(w) = writer {
124125
w
125126
} else {

Source/santad/Logs/EndpointSecurity/ParquetLogger/table.rs

Lines changed: 29 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,6 @@ pub struct Options {
1818
/// easy to construct.)
1919
pub struct Table {
2020
columns: Vec<ColumnBuilder>,
21-
schema: SchemaDescriptor,
22-
options: Options,
2321
writer: Writer,
2422
}
2523

@@ -36,18 +34,22 @@ impl Table {
3634
)
3735
})
3836
.collect::<Vec<_>>();
39-
Self {
40-
columns,
41-
schema,
42-
options,
43-
writer,
44-
}
37+
Self { columns, writer }
4538
}
4639

40+
/// Pushes a value to the column. The column number is the index the column
41+
/// had in the schema. The type of Value must match the type of the column
42+
/// as specified in new.
43+
///
44+
/// It is not necessary to push values in order of column number. It is also
45+
/// not required to push one row at a time (it's fine to write column by
46+
/// column). However, the same number of values must be pushed to each
47+
/// column by the time flush is called.
4748
pub fn push(&mut self, column_no: usize, value: Value) -> Result<()> {
4849
self.columns[column_no].push(value)
4950
}
5051

52+
/// Convenience method for pushing a row of values at a time. See push.
5153
pub fn push_row<'a, I>(&mut self, values: I) -> Result<()>
5254
where
5355
I: Iterator<Item = Value<'a>>,
@@ -58,6 +60,7 @@ impl Table {
5860
Ok(())
5961
}
6062

63+
/// Convenience method for pushing a column of values at a time. See push
6164
pub fn push_column<'a, I>(&mut self, column_no: usize, values: I) -> Result<()>
6265
where
6366
I: Iterator<Item = Value<'a>>,
@@ -68,15 +71,15 @@ impl Table {
6871
Ok(())
6972
}
7073

71-
// Checks that the table is well-formed and returns the number of rows in
72-
// the buffer.
73-
pub fn validate(&self) -> Result<(usize)> {
74+
/// Checks that the table is well-formed and returns the number of rows in
75+
/// the buffer.
76+
pub fn validate(&self) -> Result<usize> {
7477
match self.columns.len() {
7578
0 => Err(parquet2::error::Error::OutOfSpec("No columns".to_string())),
7679
_ => {
7780
let n = self.columns[0].count();
7881
if self.columns.iter().all(|column| column.count() == n) {
79-
Ok((n))
82+
Ok(n)
8083
} else {
8184
Err(parquet2::error::Error::OutOfSpec(
8285
"Column counts don't match".to_string(),
@@ -86,25 +89,30 @@ impl Table {
8689
}
8790
}
8891

89-
// Flushes a rowg group to the writer and returns the number of rows
90-
// flushed. Does nothing if no rows are buffered.
91-
pub fn flush(&mut self) -> Result<(usize)> {
92+
/// Flushes a rowg group to the writer and returns the number of rows
93+
/// flushed. Does nothing if no rows are buffered.
94+
pub fn flush(&mut self) -> Result<usize> {
9295
match self.validate() {
93-
Ok(0) => Ok((0)),
96+
Ok(0) => Ok(0),
9497
Ok(n) => {
9598
write_row_group(&mut self.writer, &mut self.columns)?;
96-
Ok((n))
99+
Ok(n)
97100
}
98101
Err(e) => Err(e),
99102
}
100103
}
101104

102-
pub fn end(&mut self) -> Result<u64> {
105+
/// Flushes all buffered data and ends the file, writing the footer.
106+
pub fn end(mut self) -> Result<(u64, Writer)> {
103107
self.flush()?;
104-
self.writer.end()
108+
let n = self.writer.end()?;
109+
Ok((n, self.writer))
105110
}
106111

107-
pub fn into_inner(self) -> (SchemaDescriptor, Writer, Options) {
108-
(self.schema, self.writer, self.options)
112+
/// Return the total buffered size of the table in bytes. This does not
113+
/// count bytes already written to disk, or any metadata and header and
114+
/// footer.
115+
pub fn size(&self) -> usize {
116+
self.columns.iter().map(|column| column.size()).sum()
109117
}
110118
}

Source/santad/Logs/EndpointSecurity/ParquetLogger/value.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
use parquet2::error::Result;
22

3-
// A value can be written to a page in a column chunk in a row group in a
4-
// parquet file in the house that Jack built.
5-
//
6-
// Parquet only supports a handful of physical types. In addition to what's
7-
// listed in this enum, parquet supports int96 and fixed length arrays, which
8-
// are not yet implemented here.
3+
/// A value can be written to a page in a column chunk in a row group in a
4+
/// parquet file in the house that Jack built.
5+
///
6+
/// Parquet only supports a handful of physical types. In addition to what's
7+
/// listed in this enum, parquet supports int96 and fixed length arrays, which
8+
/// are not yet implemented here.
99
pub enum Value<'a> {
1010
I32(i32),
1111
I64(i64),

Source/santad/Logs/EndpointSecurity/ParquetLogger/writer.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@ use parquet2::{
99

1010
use crate::column_builder::ColumnBuilder;
1111

12-
// Wraps the FileWriter for Table to allow constructing the latter from C++.
13-
// (FileWriter is generic, but Table cannot be.)
12+
/// Wraps the FileWriter for Table to allow constructing the latter from C++.
13+
/// (FileWriter is generic, but Table cannot be.)
1414
pub enum Writer {
1515
Memory(FileWriter<Vec<u8>>),
1616
File(FileWriter<File>),

0 commit comments

Comments
 (0)