@@ -100,12 +100,24 @@ message ColumnStatistics {
100100 optional CollectionStatistics collectionStatistics = 12 ;
101101}
102102
103+ message RowIndexEntry { // one entry of a RowIndex: a list of positions plus optional statistics
104+ repeated uint64 positions = 1 [packed =true ]; // packed list of unsigned 64-bit values; NOTE(review): presumably stream seek positions for this entry - confirm against the ORC spec
105+ optional ColumnStatistics statistics = 2 ; // optional; NOTE(review): presumably statistics for the rows covered by this entry - confirm
106+ }
107+
108+ message RowIndex { // container for a sequence of RowIndexEntry messages
109+ repeated RowIndexEntry entry = 1 ; // zero or more entries; NOTE(review): presumably one per row group, in row order - confirm
110+ }
111+
103112message BloomFilter { // serialized bloom filter: hash-function count plus its bit set
104113 optional uint32 numHashFunctions = 1 ; // number of hash functions used by the filter
105114 repeated fixed64 bitset = 2 ; // bit set as 64-bit words (no packed option is set on this field)
106115 optional bytes utf8bitset = 3 ; // bit set as raw bytes; NOTE(review): presumably the variant used when strings are hashed as UTF-8 - confirm against the ORC spec
107116}
108117
118+ message BloomFilterIndex { // container for a sequence of BloomFilter messages
119+ repeated BloomFilter bloomFilter = 1 ; // zero or more filters; NOTE(review): presumably one per row group, parallel to RowIndex.entry - confirm
120+ }
109121
110122message Stream {
111123 // if you add new index stream kinds, you need to make sure to update
@@ -258,12 +270,23 @@ message Metadata {
258270 repeated StripeStatistics stripeStats = 1 ;
259271}
260272
273+ // In ORC v2 (and for encrypted columns in v1), each column has
274+ // its column statistics written separately.
275+ message ColumnarStripeStatistics { // per-column list of statistics, one element per stripe
276+ // one value for each stripe in the file
277+ repeated ColumnStatistics colStats = 1 ; // NOTE(review): presumably ordered by stripe index - confirm
278+ }
279+
261280enum EncryptionAlgorithm { // identifies the algorithm used for encryption
262281 UNKNOWN_ENCRYPTION = 0 ; // used for detecting future algorithms
263282 AES_CTR_128 = 1 ; // NOTE(review): name suggests AES in CTR mode with a 128-bit key - confirm
264283 AES_CTR_256 = 2 ; // NOTE(review): name suggests AES in CTR mode with a 256-bit key - confirm
265284}
266285
286+ message FileStatistics { // container for a sequence of ColumnStatistics messages
287+ repeated ColumnStatistics column = 1 ; // zero or more entries; NOTE(review): presumably one per column, file-wide - confirm
288+ }
289+
267290// How was the data masked? This isn't necessary for reading the file, but
268291// is documentation about how the file was written.
269292message DataMask {
0 commit comments