1010/// - `dst_chunk`: The index to the chunk where the data will be copied to.
1111/// - `src_chunk`: The index to the chunk where the data will be copied from.
1212/// - `chunk_size`: The size of the chunks.
///
/// `dst` is grown with zeroes when it is too short to hold the destination
/// chunk; it is never shrunk.
///
/// # Panics
/// Panics if `src` holds fewer than `(src_chunk + 1) * chunk_size` elements.
fn copy_chunk(src: &[usize], src_chunk: usize, dst: &mut Vec<usize>, dst_chunk: usize, chunk_size: usize)
{
    let src_offset = src_chunk * chunk_size;
    let dst_offset = dst_chunk * chunk_size;
    let dst_end = dst_offset + chunk_size;

    // Borrow the source chunk first, so an out-of-range source panics before
    // the destination is touched.
    let chunk = &src[src_offset..src_offset + chunk_size];

    // Make sure the destination can hold the complete chunk.
    if dst.len() < dst_end {
        dst.resize(dst_end, 0);
    }

    dst[dst_offset..dst_end].copy_from_slice(chunk);
}
2328
24- fn test_chunk ( column : & Vec < usize > , a : usize , b : usize , chunk_size : usize ) -> bool
29+ fn test_chunk ( src : & Vec < usize > , src_chunk : usize , dst : & Vec < usize > , dst_chunk : usize , chunk_size : usize ) -> bool
2530{
26- let a_offset = a * chunk_size;
27- let b_offset = b * chunk_size;
31+ let src_offset = src_chunk * chunk_size;
32+ let dst_offset = dst_chunk * chunk_size;
2833
2934 for i in 0 ..chunk_size {
30- if column [ a_offset + i] != column [ b_offset + i] {
35+ if dst [ dst_offset + i] != src [ src_offset + i] {
3136 return false ;
3237 }
3338 }
@@ -38,11 +43,12 @@ fn test_chunk(column: &Vec<usize>, a: usize, b: usize, chunk_size: usize) -> boo
3843/// `src_chunk`.
3944///
4045/// # Arguments
41- /// - `column`: The vector to deduplicate.
46+ /// - `src`: The data source.
47+ /// - `src_chunk`: The chunk-index to the source-chunk.
48+ /// - `dst`: The deduplicated table.
4249/// - `dst_chunk`: The chunk-index to the destination-chunk where the
4350/// source-chunk will be copied to; if all chunks from
4451/// `0..dst_chunk` are unequal to source-chunk.
45- /// - `src_chunk`: The chunk-index to the source-chunk.
4652/// - `chunk_size`: The size of the chunks.
4753///
4854/// # Returns
@@ -52,11 +58,11 @@ fn test_chunk(column: &Vec<usize>, a: usize, b: usize, chunk_size: usize) -> boo
5258/// The `dst_chunk` is returned so that you can directly copy the source-chunk
5359/// to the correct position.
5460///
55- fn test_chunks ( column : & Vec < usize > , dst_chunk : usize , src_chunk : usize , chunk_size : usize ) -> usize
61+ fn test_chunks ( src : & Vec < usize > , src_chunk : usize , dst : & Vec < usize > , dst_chunk : usize , chunk_size : usize ) -> usize
5662{
57- for a_chunk in 0 ..dst_chunk {
58- if test_chunk ( column , a_chunk , src_chunk , chunk_size) {
59- return a_chunk ;
63+ for i in 0 ..dst_chunk {
64+ if test_chunk ( src , src_chunk , dst , i , chunk_size) {
65+ return i ;
6066 }
6167 }
6268 return dst_chunk;
@@ -69,32 +75,30 @@ fn test_chunks(column: &Vec<usize>, dst_chunk: usize, src_chunk: usize, chunk_si
6975/// - `chunk_size` : The size of the chunks.
7076///
7177/// # Returns
72- /// The index-table.
78+ /// The tuple `(deduplicated-data-table, index-table)`.
7379///
7581/// To find a specific entry `i` in the deduplicated data table:
7682/// - `chunk_nr = min(i / chunk_size, index_table.len() - 1)`
7783/// - `offset = index_table[chunk_nr] * chunk_size + i % chunk_size`
7884/// - `entry = dedup_table[offset]`
7884///
79- pub fn dedup ( column : & mut Vec < usize > , chunk_size : usize ) -> Vec < usize >
85+ pub fn dedup ( column : & Vec < usize > , chunk_size : usize ) -> ( Vec < usize > , Vec < usize > )
8086{
8187 let num_chunks = 0x110000 / chunk_size;
8288 let mut index_table = Vec :: < usize > :: new ( ) ;
89+ let mut dedup_table = Vec :: < usize > :: new ( ) ;
8390
8491 // Deduplicating the column table and create an index table.
8592 let mut dst_chunk = 0 ;
8693 for src_chunk in 0 ..num_chunks {
87- let found_chunk = test_chunks ( column, dst_chunk , src_chunk , chunk_size) ;
94+ let found_chunk = test_chunks ( column, src_chunk , & dedup_table , dst_chunk , chunk_size) ;
8895 index_table. push ( found_chunk) ;
8996 if found_chunk == dst_chunk {
90- copy_chunk ( column, dst_chunk , src_chunk , chunk_size) ;
97+ copy_chunk ( column, src_chunk , & mut dedup_table , dst_chunk , chunk_size) ;
9198 dst_chunk += 1 ;
9299 }
93100 }
94101
95- // Truncate the column, after deduplicating the column.
96- column. truncate ( dst_chunk * chunk_size) ;
97-
98102 // Truncate the index_table, so that the last value is not repeating.
99103 if let Some ( & last_value) = index_table. last ( ) {
100104 loop {
@@ -112,7 +116,42 @@ pub fn dedup(column: &mut Vec<usize>, chunk_size: usize) -> Vec<usize>
112116 index_table. push ( last_value) ;
113117 }
114118
115- return index_table;
119+ return ( dedup_table, index_table) ;
120+ }
121+
122+ pub fn dedup_best_fit ( column : & Vec < usize > ) -> ( Vec < usize > , usize , Vec < usize > , usize , usize )
123+ {
124+ let chunk_sizes = vec ! [
125+ 32 as usize ,
126+ 64 as usize ,
127+ 128 as usize ,
128+ 256 as usize ,
129+ 512 as usize ,
130+ ] ;
131+
132+ let mut best_chunk_size: usize = 0 ;
133+ let mut best_dedup = Vec :: < usize > :: new ( ) ;
134+ let mut best_index = Vec :: < usize > :: new ( ) ;
135+ let mut best_dedup_bits: usize = 0 ;
136+ let mut best_index_bits: usize = 0 ;
137+ let mut best_byte_len: usize = 0 ;
138+ for chunk_size in chunk_sizes {
139+ let ( dedup, index) = dedup ( & column, chunk_size) ;
140+ let dedup_bits = get_width ( & dedup) ;
141+ let index_bits = get_width ( & index) ;
142+
143+ let byte_len = dedup. len ( ) * dedup_bits + index. len ( ) * index_bits;
144+ if best_byte_len == 0 || byte_len < best_byte_len {
145+ best_chunk_size = chunk_size;
146+ best_dedup = dedup;
147+ best_index = index;
148+ best_dedup_bits = dedup_bits;
149+ best_index_bits = index_bits;
150+ best_byte_len = byte_len;
151+ }
152+ }
153+
154+ return ( best_dedup, best_dedup_bits, best_index, best_index_bits, best_chunk_size) ;
116155}
117156
118157pub fn map_str_to_int < ' a > ( order : & mut Vec < String > , op : impl Fn ( usize ) -> & ' a String ) -> Vec < usize >
@@ -166,3 +205,8 @@ pub fn compress(input: &Vec<usize>, num_bits: usize) -> Vec<u8>
166205 return r;
167206}
168207
/// Returns the number of bits needed to store the largest value of `input`.
///
/// # Arguments
/// - `input`: The values to measure.
///
/// # Returns
/// The bit-width of the maximum value: `0` for an empty slice or when all
/// values are `0`, `1` for a maximum of `1`, `2` for `2..=3`, `3` for `4..=7`
/// and so on, up to `usize::BITS` for `usize::MAX`.
///
pub fn get_width(input: &[usize]) -> usize
{
    let max = input.iter().copied().max().unwrap_or(0);

    // Counting the significant bits directly avoids `max + 1` and
    // `next_power_of_two()`, both of which overflow for very large values.
    (usize::BITS - max.leading_zeros()) as usize
}
0 commit comments