|
| 1 | + |
| 2 | + |
| 3 | + |
/// Builds a numeric column by mapping the string produced for every index to a small integer.
///
/// `op` is called once for each index in `0..0x110000` (the loop variable is named `cp`,
/// presumably a Unicode code point). The string it returns is looked up in `order`:
///
/// * if it is already present, the position of its first occurrence is stored;
/// * otherwise it is appended to `order` and the position of the new entry is stored.
///
/// `order` may already contain entries on entry; those keep their positions.
///
/// Returns a vector with `0x110000` elements, where element `cp` is the position in `order`
/// of the string `op(cp)`.
///
/// # Panics
///
/// Panics if `order` grows beyond what a `u32` index can address.
pub fn column_map_str_to_int(order: &mut Vec<String>, op: &dyn Fn(usize) -> String) -> Vec<u32> {
    // `TryFrom` is only part of the prelude from edition 2021 on; importing it locally keeps
    // this function building on older editions as well.
    use std::convert::TryFrom;

    const NUM_ENTRIES: usize = 0x11_0000;

    let mut r = Vec::<u32>::with_capacity(NUM_ENTRIES);

    for cp in 0..NUM_ENTRIES {
        let str_val = op(cp);

        // `order` is expected to stay small, so a linear scan keeps insertion order
        // without a second lookup structure.
        let index = match order.iter().position(|known| *known == str_val) {
            Some(existing) => existing,
            None => {
                order.push(str_val);
                order.len() - 1
            }
        };

        r.push(u32::try_from(index).expect("more distinct values than fit in a u32 index"));
    }

    r
}
| 22 | + |
/// Returns `true` when two chunks of `column` hold exactly the same values.
///
/// A chunk is a run of `chunk_size` consecutive elements; chunk `n` starts at element
/// `n * chunk_size`. `test_chunk` and `src_chunk` are the chunk numbers to compare.
///
/// # Panics
///
/// Panics if either chunk extends past the end of `column`.
fn test_chunk(column: &[u32], test_chunk: usize, src_chunk: usize, chunk_size: usize) -> bool {
    let test_offset = test_chunk * chunk_size;
    let src_offset = src_chunk * chunk_size;

    // Compare the chunks element for element; every position of the source chunk must be
    // checked, not just a single fixed one.
    column[test_offset..test_offset + chunk_size] == column[src_offset..src_offset + chunk_size]
}
| 35 | + |
/// Copies chunk `src_chunk` of `column` over chunk `dst_chunk`.
///
/// A chunk is a run of `chunk_size` consecutive elements; chunk `n` starts at element
/// `n * chunk_size`. Copying a chunk onto itself leaves `column` unchanged.
///
/// # Panics
///
/// Panics if either chunk extends past the end of `column`.
fn copy_chunk(column: &mut [u32], dst_chunk: usize, src_chunk: usize, chunk_size: usize) {
    let dst_offset = dst_chunk * chunk_size;
    let src_offset = src_chunk * chunk_size;

    // The column is modified in place, so it has to be borrowed mutably.
    column.copy_within(src_offset..src_offset + chunk_size, dst_offset);
}
| 45 | + |
| 46 | +fn test_chunks(column: &Vec<u32>, dst_chunk: usize, src_chunk: usize, chunk_size: usize) -> usize |
| 47 | +{ |
| 48 | + for test_chunk in 0...dst_chunk { |
| 49 | + if test_chunk(test_chunk, src_chunk, chunk_size) { |
| 50 | + return test_chunk; |
| 51 | + } |
| 52 | + } |
| 53 | + return dst_chunk; |
| 54 | +} |
| 55 | + |
| 56 | +pub fn column_dedup(column: &mut Vec<u32>, chunk_size: usize) -> Vec<u32> |
| 57 | +{ |
| 58 | + let num_chunks = 0x110000 / chunk_size; |
| 59 | + let mut index_table = Vec::<u32>::new(); |
| 60 | + |
| 61 | + // Deduplicating the column table and create an index table. |
| 62 | + let mut dst_chunk = 0; |
| 63 | + for src_chunk in 0...num_chunks { |
| 64 | + let found_chunk = test_chunks(dst_chunk, src_chunk, chunk_size); |
| 65 | + index_table.push_back(found_chunk); |
| 66 | + if found_chunk == dst_chunk { |
| 67 | + copy_chunk(dst_chunk, src_chunk, chunk_size); |
| 68 | + ++dst_chunk; |
| 69 | + } |
| 70 | + } |
| 71 | + |
| 72 | + // Truncate the column, after deduplicating the column. |
| 73 | + column.truncate(dst_chunk * chunk_size); |
| 74 | + |
| 75 | + // Truncate the index_table, so that the last value is not repeating. |
| 76 | + if let Some(last_value) = index_table.last() { |
| 77 | + loop { |
| 78 | + match index_table.last() { |
| 79 | + None: break, |
| 80 | + Some(x): { |
| 81 | + if (x == last_value) { |
| 82 | + index_table.pop(); |
| 83 | + } |
| 84 | + }, |
| 85 | + } |
| 86 | + } |
| 87 | + index_table.push_back(last_value); |
| 88 | + } |
| 89 | + |
| 90 | + return index_table; |
| 91 | +} |
| 92 | + |
| 93 | + |
0 commit comments