@@ -6,8 +6,11 @@ use deltalake_core::protocol::DeltaOperation;
6
6
7
7
mod simple_checkpoint {
8
8
use deltalake_core:: * ;
9
+ use parquet:: basic:: Encoding ;
10
+ use parquet:: file:: reader:: { FileReader , SerializedFileReader } ;
9
11
use pretty_assertions:: assert_eq;
10
- use std:: fs;
12
+ use regex:: Regex ;
13
+ use std:: fs:: { self , File } ;
11
14
use std:: path:: { Path , PathBuf } ;
12
15
13
16
#[ tokio:: test]
@@ -31,6 +34,9 @@ mod simple_checkpoint {
31
34
let checkpoint_path = log_path. join ( "00000000000000000005.checkpoint.parquet" ) ;
32
35
assert ! ( checkpoint_path. as_path( ) . exists( ) ) ;
33
36
37
+ // Check that the checkpoint does use run length encoding
38
+ assert_column_rle_encoding ( checkpoint_path, true ) ;
39
+
34
40
// _last_checkpoint should exist and point to the correct version
35
41
let version = get_last_checkpoint_version ( & log_path) ;
36
42
assert_eq ! ( 5 , version) ;
@@ -42,6 +48,9 @@ mod simple_checkpoint {
42
48
let checkpoint_path = log_path. join ( "00000000000000000010.checkpoint.parquet" ) ;
43
49
assert ! ( checkpoint_path. as_path( ) . exists( ) ) ;
44
50
51
+ // Check that the checkpoint does use run length encoding
52
+ assert_column_rle_encoding ( checkpoint_path, true ) ;
53
+
45
54
// _last_checkpoint should exist and point to the correct version
46
55
let version = get_last_checkpoint_version ( & log_path) ;
47
56
assert_eq ! ( 10 , version) ;
@@ -53,6 +62,77 @@ mod simple_checkpoint {
53
62
assert_eq ! ( 12 , files. count( ) ) ;
54
63
}
55
64
65
+ #[ tokio:: test]
66
+ async fn checkpoint_run_length_encoding_test ( ) {
67
+ let table_location = "../test/tests/data/checkpoints" ;
68
+ let table_path = PathBuf :: from ( table_location) ;
69
+ let log_path = table_path. join ( "_delta_log" ) ;
70
+
71
+ // Delete checkpoint files from previous runs
72
+ cleanup_checkpoint_files ( log_path. as_path ( ) ) ;
73
+
74
+ // Load the delta table
75
+ let base_table = deltalake_core:: open_table ( table_location) . await . unwrap ( ) ;
76
+
77
+ // Set the table properties to disable run length encoding
78
+ // this alters table version and should be done in a more principled way
79
+ let table = DeltaOps ( base_table)
80
+ . set_tbl_properties ( )
81
+ . with_properties ( std:: collections:: HashMap :: < String , String > :: from ( [ (
82
+ "delta-rs.checkpoint.useRunLengthEncoding" . to_string ( ) ,
83
+ "false" . to_string ( ) ,
84
+ ) ] ) )
85
+ . await
86
+ . unwrap ( ) ;
87
+
88
+ // Write a checkpoint
89
+ checkpoints:: create_checkpoint ( & table, None ) . await . unwrap ( ) ;
90
+
91
+ // checkpoint should exist
92
+ let checkpoint_path = log_path. join ( "00000000000000000013.checkpoint.parquet" ) ;
93
+ assert ! ( checkpoint_path. as_path( ) . exists( ) ) ;
94
+
95
+ // Check that the checkpoint does not use run length encoding
96
+ assert_column_rle_encoding ( checkpoint_path, false ) ;
97
+
98
+ // _last_checkpoint should exist and point to the correct version
99
+ let version = get_last_checkpoint_version ( & log_path) ;
100
+ assert_eq ! ( table. version( ) , version) ;
101
+
102
+ // delta table should load just fine with the checkpoint in place
103
+ let table_result = deltalake_core:: open_table ( table_location) . await . unwrap ( ) ;
104
+ let table = table_result;
105
+ let files = table. get_files_iter ( ) . unwrap ( ) ;
106
+ assert_eq ! ( 12 , files. count( ) ) ;
107
+ }
108
+
109
+ fn assert_column_rle_encoding ( file_path : PathBuf , should_be_rle : bool ) {
110
+ let file = File :: open ( & file_path) . unwrap ( ) ;
111
+ let reader = SerializedFileReader :: new ( file) . unwrap ( ) ;
112
+ let meta = reader. metadata ( ) ;
113
+ let mut found_rle = false ;
114
+
115
+ for i in 0 ..meta. num_row_groups ( ) {
116
+ let row_group = meta. row_group ( i) ;
117
+ for j in 0 ..row_group. num_columns ( ) {
118
+ let column_chunk: & parquet:: file:: metadata:: ColumnChunkMetaData =
119
+ row_group. column ( j) ;
120
+
121
+ for encoding in column_chunk. encodings ( ) {
122
+ if * encoding == Encoding :: RLE_DICTIONARY {
123
+ found_rle = true ;
124
+ }
125
+ }
126
+ }
127
+ }
128
+
129
+ if should_be_rle {
130
+ assert ! ( found_rle, "Expected RLE_DICTIONARY encoding" ) ;
131
+ } else {
132
+ assert ! ( !found_rle, "Expected no RLE_DICTIONARY encoding" ) ;
133
+ }
134
+ }
135
+
56
136
fn get_last_checkpoint_version ( log_path : & Path ) -> i64 {
57
137
let last_checkpoint_path = log_path. join ( "_last_checkpoint" ) ;
58
138
assert ! ( last_checkpoint_path. as_path( ) . exists( ) ) ;
@@ -69,15 +149,22 @@ mod simple_checkpoint {
69
149
}
70
150
71
151
fn cleanup_checkpoint_files ( log_path : & Path ) {
72
- let paths = fs:: read_dir ( log_path) . unwrap ( ) ;
73
- for d in paths. flatten ( ) {
74
- let path = d. path ( ) ;
75
-
76
- if path. file_name ( ) . unwrap ( ) == "_last_checkpoint"
77
- || ( path. extension ( ) . is_some ( ) && path. extension ( ) . unwrap ( ) == "parquet" )
78
- {
79
- fs:: remove_file ( path) . unwrap ( ) ;
152
+ let re = Regex :: new ( r"^(\d{20})\.json$" ) . unwrap ( ) ;
153
+ for entry in fs:: read_dir ( log_path) . unwrap ( ) . flatten ( ) {
154
+ let path = entry. path ( ) ;
155
+ let filename = match path. file_name ( ) . and_then ( |n| n. to_str ( ) ) {
156
+ Some ( name) => name,
157
+ None => continue ,
158
+ } ;
159
+
160
+ if let Some ( caps) = re. captures ( filename) {
161
+ if let Ok ( num) = caps[ 1 ] . parse :: < u64 > ( ) {
162
+ if num <= 12 {
163
+ continue ;
164
+ }
165
+ }
80
166
}
167
+ let _ = fs:: remove_file ( path) ;
81
168
}
82
169
}
83
170
}
0 commit comments