[annotation] Use two bytes per hash_tracked column instead of one.

ml-ebs-ext · ml-ebs-ext · commit a8d7ec1411d0 · 2025-07-30T16:13:47.000+02:00
diff --git a/R/review_structures.R b/R/review_structures.R
@@ -1,4 +1,7 @@
 # nolint start
+
+BYTES_PER_TRACKED_HASH <- 2L
+
 SH <- local({ # _S_erialization _H_elpers
   get_UTC_time_in_seconds <- function() as.numeric(structure(Sys.time(), tzone = 'UTC'))
   double_to_raw <- function(v) writeBin(v, con = raw(0), endian = 'little', useBytes = TRUE)
@@ -36,10 +39,12 @@ SH <- local({ # _S_erialization _H_elpers
   ..ref_hash_tracked <- function(row) {
     n_col <- length(row)
     
-    res <- raw(n_col)
+    res <- raw(BYTES_PER_TRACKED_HASH * n_col)
     for(i_col in seq(n_col)){      
       col_indices <- (((i_col-1) + hash_tracked_offsets) %% n_col) + 1
-      res[[i_col]] <- ..ref_hash_tracked_inner(row[col_indices])[[1]] # most significant byte
+      first <- BYTES_PER_TRACKED_HASH * (i_col-1) + 1
+      last <- BYTES_PER_TRACKED_HASH * i_col
+      res[first:last] <- ..ref_hash_tracked_inner(row[col_indices])[1:BYTES_PER_TRACKED_HASH] # most significant bytes
       i_col <- i_col + 1
     }
     
@@ -63,9 +68,9 @@ SH <- local({ # _S_erialization _H_elpers
     res <- list()   
     for (i_col in seq_len(n_col)) {
       col_indices <- (((i_col - 1) + hash_tracked_offsets) %% n_col) + 1
-      res[[i_col]] <- vectorized_hash_row(df[col_indices], algo = "xxh32")[1,] # most significant byte    
+      res[[i_col]] <- vectorized_hash_row(df[col_indices], algo = "xxh32")[1:BYTES_PER_TRACKED_HASH,] # most significant bytes
     }
-    res <- matrix(unlist(res), nrow = ncol(df), ncol = nrow(df), byrow = TRUE)  
+    res <- do.call(rbind, res)
     return(res)
   }
 
@@ -195,7 +200,7 @@ RS_compute_base_memory <- function(df_id, df, id_vars, tracked_vars){
   ; if(any(duplicated(id_hashes, MARGIN = 2))) return(simpleCondition("Found duplicated IDs"))
 
   tracked_hashes <- SH$hash_tracked(df[tracked_vars])
-  ; if(!identical(dim(tracked_hashes), c(length(tracked_vars), nrow(df)))) 
+  ; if(!identical(dim(tracked_hashes), c(BYTES_PER_TRACKED_HASH*length(tracked_vars), nrow(df)))) 
     return(simpleCondition("Internal error in tracked_vars hash preparation"))
   
   # NOTE: We choose a serialization scheme with a well-known encoding. This avoid security concerns over 
@@ -244,7 +249,7 @@ RS_parse_base <- function(contents){
   row_count <- readBin(con, integer(), 1L)
  
   id_hashes <- SH$read_hashes_from_con(con, row_count, 16L)
-  tracked_hashes <- SH$read_hashes_from_con(con, row_count, length(tracked_vars))
+  tracked_hashes <- SH$read_hashes_from_con(con, row_count, BYTES_PER_TRACKED_HASH*length(tracked_vars))
   
   empty_read <- readBin(con, raw(), 1L)
   ; if(length(empty_read) > 0) return(simpleCondition("Too much hash data"))
@@ -282,7 +287,7 @@ RS_compute_delta_memory <- function(state, df){
   tracked_vars <- state$tracked_vars
   # FIXME: (LUIS): Ask Miguel about the postlude
   tracked_hashes <- (SH$hash_tracked(df[tracked_vars]) |> c() |> 
-                       array(dim = c(length(tracked_vars), nrow(df))))
+                       array(dim = c(BYTES_PER_TRACKED_HASH*length(tracked_vars), nrow(df))))
 
   # Assert against removal of rows
   local({
@@ -353,11 +358,11 @@ RS_parse_delta <- function(contents, tracked_var_count){
   domain <- SH$read_string_from_con(con)
   new_row_count <- readBin(con, integer(), 1L)
   new_id_hashes <- SH$read_hashes_from_con(con, new_row_count, 16L)
-  new_tracked_hashes <- SH$read_hashes_from_con(con, new_row_count, tracked_var_count)
+  new_tracked_hashes <- SH$read_hashes_from_con(con, new_row_count, BYTES_PER_TRACKED_HASH*tracked_var_count)
   modified_row_count <- NA_integer_
   modified_row_indices <- SH$read_integer_vector_from_con(con)
   modified_row_count <- length(modified_row_indices)
-  modified_tracked_hashes <- SH$read_hashes_from_con(con, modified_row_count, tracked_var_count)
+  modified_tracked_hashes <- SH$read_hashes_from_con(con, modified_row_count, BYTES_PER_TRACKED_HASH*tracked_var_count)
 
   empty_read <- readBin(con, raw(), 1L)
   ; if(length(empty_read) > 0) return(simpleCondition("Too much hash data"))
@@ -466,6 +471,7 @@ RS_load <- function(base, deltas){
   base_timestamp <- res$timestamp
   for(delta in deltas){
     state_delta <- RS_parse_delta(contents = delta, tracked_var_count = length(res[["tracked_vars"]]))
+    if(inherits(state_delta, "simpleCondition")) return(state_delta)
     
     if(!identical(state_delta$generation, res$generation+1L))
       return(simpleCondition(paste("Wrong generation marker. Should be", res$generation+1L)))
diff --git a/tests/testthat/test-hash_tracked.R b/tests/testthat/test-hash_tracked.R
@@ -1,13 +1,8 @@
 test_that("SH$hash_tracked exhibits almost no false negatives and few false positives", {
-  hash_df <- function(df, tracked_vars) {
-    hashes <- SH$hash_tracked(df[tracked_vars])
-    return(hashes)
-  }
- 
   # TODO(miguel): Refactor and move next to hash_tracked into SH  
   report_changes <- function(df, h0, verbose = FALSE){
     res <- list()
-    h1 <- hash_df(df, tracked_vars = colnames(df))
+    h1 <- SH$hash_tracked(df[colnames(df)])
     
     offsets <- c(0, 2, 3)
     
@@ -17,6 +12,7 @@ test_that("SH$hash_tracked exhibits almost no false negatives and few false posi
       prev <- as.integer(h0[,i_row])
       cur <- as.integer(h1[,i_row])
       diff <- (prev != cur)
+      diff <- apply(matrix(diff, ncol = BYTES_PER_TRACKED_HASH, byrow = TRUE), 1, any)
       evidence <- integer(n_col)
       for(i in seq_len(n_col)){
         v <- diff[[i]]
@@ -56,7 +52,7 @@ test_that("SH$hash_tracked exhibits almost no false negatives and few false posi
   }
   
   stress <- function(df, test_count, changes_per_test){
-    hashes <- hash_df(df, tracked_vars = colnames(df))
+    hashes <- SH$hash_tracked(df[colnames(df)])
     
     n_row <- nrow(df)
     n_col <- ncol(df)
@@ -116,7 +112,7 @@ test_that("SH$hash_tracked exhibits almost no false negatives and few false posi
           false_negatives <- append(false_negatives, tail)
         }
         if(i_rep <= length(reported_changes)) {
-          if(length(false_positives) == 0)  false_positive_first_delta <- expected_changes
+          if(length(false_positives) == 0) false_positive_first_delta <- expected_changes
           tail <- reported_changes[i_rep:length(reported_changes)]
           false_positives <- append(false_positives, tail)
         }
diff --git a/vignettes/data_review.Rmd b/vignettes/data_review.Rmd
@@ -146,7 +146,7 @@ If we take a hypothetical "xyz" domain, `dv.listings` will store the following f
 - m `tracked_vars` variable types (see "Variable type encoding" below)
 - 1 row count
 - p (1 per "xyz" row) `hash_id(xyz[id_vars])`
-- p (1 per "xyz" row, *m* bytes long) `hash_tracked(xyz[tracked_vars])`
+- p (1 per "xyz" row, 2\*m bytes long) `hash_tracked(xyz[tracked_vars])`
 </details>
 
 <details><summary>`xyz_001.delta` (one per domain dataset update)</summary>
@@ -158,10 +158,10 @@ If we take a hypothetical "xyz" domain, `dv.listings` will store the following f
 - 1 domain string ("xyz")
 - 1 count of new rows
 - n (1 per *new* "xyz" row) `hash_id(xyz[id_vars])`
-- n (1 per *new* "xyz" row, *m* bytes long) `hash_tracked(xyz[tracked_vars])`
+- n (1 per *new* "xyz" row, 2\*m bytes long) `hash_tracked(xyz[tracked_vars])`
 - 1 count of modified rows
 - p (1 per *modified* "xyz" row) row index
-- p (1 per *modified* "xyz" row, *m* bytes long) `hash_tracked(xyz[tracked_vars])`
+- p (1 per *modified* "xyz" row, 2\*m bytes long) `hash_tracked(xyz[tracked_vars])`
 </details>
 
 <details><summary>`xyz_<ROLE>.review` (one per domain and ROLE)</summary>
@@ -180,7 +180,7 @@ If we take a hypothetical "xyz" domain, `dv.listings` will store the following f
 
 (Row indices refer to indices in the stored base+delta matrix, which is append-only. These _canonical_ indices are as good as identifiers).
 
-The dominant factor governing the size of these files is the length of a hash, which is 16 bytes, as we discuss in the "Hashing" session below. Row indices and delta timestamps can be encoded in 4 bytes. Review indices take up 1 byte each. Estimating an upper bound of 1 million rows per dataset, a `.base` file would take around 32 MiB. A comprehensive `.review` file for such a dataset would take around 9 MiB.
+The dominant factor governing the size of these files is the length of a hash, which is 16 bytes in the case of `hash_id` and 2\*m bytes (m being the number of tracked columns) in the case of `hash_tracked`, as we discuss in the "Hashing" section below. Row indices and delta timestamps can be encoded in 4 bytes. Review indices take up 1 byte each. Estimating an upper bound of 1 million rows per dataset, a `.base` file tracking 8 variables would take around 32 MiB. A comprehensive `.review` file for such a dataset would take around 9 MiB.
 
 These file structures are designed so that they start with a short heterogeneous header that reiterates the information that can be gleaned from the file name. The rest of the records are all homogeneous and of known size. That allows to load them into memory without the need for expensive parsing.
 
@@ -232,36 +232,36 @@ We opt instead for 128 bits, which makes the possibility of a collision extremel
 #### Hashing of tracked variables (`hash_tracked()`)
 We could apply the same reasoning behind the choice of the `hash_id()` function to the hashing of the variable parts of each row. We instead propose a more complex hashing scheme to provide partial information about *which variables of a row have been altered* when its hash changes.
 
-Each hash value is *m* bytes long, where *m* is the number of variables tracked of a given dataset. Each of those bytes is an independent hash of three of the tracked variables of a dataset row. Each variable, in turn, contributes to three of the *m* byte-sized hashes. This mixing of variables makes it harder for an external adversarial observer of the `.base` and `.delta` files to brute-force the original values of the dataset by looking for collisions with the computed hash values.
+Each hash value is 2\*m bytes long, where *m* is the number of tracked variables of a given dataset. Each of the *m* byte pairs is an independent hash of three of the tracked variables of a dataset row. Each variable, in turn, contributes to three of the *m* two-byte hashes. This mixing of variables makes it harder for an external adversarial observer of the `.base` and `.delta` files to brute-force the original values of the dataset by looking for collisions with the computed hash values.
 
-To compute which variables contribute to which hash byte, we use the following scheme:
+To compute which variables contribute to which hash byte pair, we use the following scheme:
 
-  - Byte *n*: Variables (*n*+0)%*n*, (*n*+2)%*n* and (*n*+3)%*n*
+  - Byte pair *n*: Variables (*n*+0)%*m*, (*n*+2)%*m* and (*n*+3)%*m*
 
 Where `%` indicates the remainder of the integer division.
 
 So, for a input dataset with seven tracked variables (zero through six), this would mean:
 
-  - Byte 0: Variables 0, 2 and 3
-  - Byte 1: Variables 1, 3 and 4
-  - Byte 2: Variables 2, 4 and 5
-  - Byte 3: Variables 3, 5 and 6
-  - Byte 4: Variables 4, 6 and 0
-  - Byte 5: Variables 5, 0 and 1
-  - Byte 6: Variables 6, 1 and 2
+  - Byte pair 0: Variables 0, 2 and 3
+  - Byte pair 1: Variables 1, 3 and 4
+  - Byte pair 2: Variables 2, 4 and 5
+  - Byte pair 3: Variables 3, 5 and 6
+  - Byte pair 4: Variables 4, 6 and 0
+  - Byte pair 5: Variables 5, 0 and 1
+  - Byte pair 6: Variables 6, 1 and 2
 
-This scheme creates a unique mixtures of variables. Take, for instance, variable 0. It is combined with variables 2 and 3 on the zeroth byte, with variables 4 and 6 on the fourth byte and with 1 and 5 for the fifth byte.
+This scheme creates unique mixtures of variables. Take, for instance, variable 0. It is combined with variables 2 and 3 on the zeroth byte pair, with variables 4 and 6 on the fourth byte pair and with variables 1 and 5 on the fifth byte pair.
 
-Each of these bytes is computed by:
+Each of these byte pairs is computed by:
 
 - Taking the three values to hash.
 - Serializing them to text and concatenating them using the non-ASCII byte separator `1D` (also known as "group separator").
-- Computing the `xxh32` hash and returning its most significant byte.
+- Computing the `xxh32` hash and returning its two most significant bytes.
 
 Informal testing (refer to `tests/testthat/tests-hash_tracked.R` for more details) of this hashing scheme shows the following properties:
 
 - It's capable of identifying up to four modified variables per row (after that, it's preferable to give up and notify the whole row as modified).
-- It has a very low **false negative rate** (a variable is modified without it being notified as such) of one for every 8 million row updates.
+- It has a very low **false negative rate** (a variable is modified without it being notified as such).
 - It has a low **false positive rate** (a variable that retains its value is notified as modified). This only happens when there are actual changes to a row.
 
 False positives are not critical, as they ask reviewers to consider a larger set of variables when re-reviewing a row that has been altered.