Merge pull request #27 from Boehringer-Ingelheim/303747-replace-apply-for-bug

zsigmas · web-flow · commit c595979435dc · 2025-07-29T12:54:49.000+02:00
bug fix regarding apply character padding
diff --git a/R/review_structures.R b/R/review_structures.R
@@ -15,14 +15,14 @@ SH <- local({ # _S_erialization _H_elpers
     return(res)
   }
 
-  hash_id <- function(row) {
+  ..ref_hash_id <- function(row) {
     input <- paste(row, collapse = '\1D')
     input <- charToRaw(input)
     res <- xxhashlite::xxhash_raw(input, as_raw = TRUE)
     return(res)
   }
   
-  hash_tracked_inner <- function(row) {
+..ref_hash_tracked_inner <- function(row) {
     # FIXME: Ensure that precision of numeric values does not affect serialization
     #        Maybe by using a string hex representation of their binary contents
     input <- paste(row, collapse = '\1D')
@@ -33,19 +33,42 @@ SH <- local({ # _S_erialization _H_elpers
   
   hash_tracked_offsets <- c(0, 2, 3)
   
-  hash_tracked <- function(row) {
+  ..ref_hash_tracked <- function(row) {
     n_col <- length(row)
     
     res <- raw(n_col)
-    for(i_col in seq(n_col)){
+    for(i_col in seq(n_col)){      
       col_indices <- (((i_col-1) + hash_tracked_offsets) %% n_col) + 1
-      res[[i_col]] <- hash_tracked_inner(row[col_indices])[[1]] # most significant byte
+      res[[i_col]] <- ..ref_hash_tracked_inner(row[col_indices])[[1]] # most significant byte
       i_col <- i_col + 1
     }
     
     return(res)
   }
 
+  # Hash every row of `df` (columns cast with as.character, joined with '\1D'); returns a raw matrix, one column per row.
+  vectorized_hash_row <- function(df, algo = "xxh128") {
+    row_strings <- do.call(function(...) paste(..., sep = "\1D"), lapply(df, as.character))
+    # Hash each row exactly once and reuse the result for both the dimensions and the data.
+    hashes <- lapply(row_strings, function(x) xxhashlite::xxhash_raw(charToRaw(x), as_raw = TRUE, algo = algo))
+    # Digest length is read off the first hash; with zero rows as.raw() keeps the result a 0 x 0 raw matrix.
+    hash_len <- if (length(hashes) > 0) length(hashes[[1]]) else 0
+    matrix(as.raw(unlist(hashes)), nrow = hash_len, ncol = length(hashes))
+  }
+
+  hash_id <- vectorized_hash_row
+
+  # Tracked hash per cell: raw matrix with ncol(df) rows and nrow(df) columns, built from the hash_tracked_offsets columns.
+  hash_tracked <- function(df) {
+    n_col <- ncol(df)
+    if (n_col == 0 || nrow(df) == 0) return(matrix(raw(0), nrow = n_col, ncol = nrow(df))) # empty input: nothing to hash
+    res <- lapply(seq_len(n_col), function(i_col) {
+      col_indices <- (((i_col - 1) + hash_tracked_offsets) %% n_col) + 1 # offsets wrap around the column count
+      vectorized_hash_row(df[col_indices], algo = "xxh32")[1, ] # most significant byte
+    })
+    matrix(unlist(res), nrow = n_col, ncol = nrow(df), byrow = TRUE)
+  }
+
   read_string_from_con <- function(con){
     res <- NULL
     n <- readBin(con, integer(), 1L)
@@ -85,6 +108,10 @@ SH <- local({ # _S_erialization _H_elpers
       integer_vector_to_raw = integer_vector_to_raw,
       hash_id = hash_id,
       hash_tracked = hash_tracked,
+      ..ref = list(
+        hash_id = ..ref_hash_id,
+        hash_tracked = ..ref_hash_tracked
+      ),
       read_string_from_con = read_string_from_con,
       read_character_vector_from_con = read_character_vector_from_con,
       read_integer_vector_from_con = read_integer_vector_from_con,
@@ -149,7 +176,7 @@ RS_parse_data_frame_variable_types <- function(v){
 }
 
 RS_compute_id_hashes <- function(df, id_vars){
-  return(apply(df[id_vars], 1, SH$hash_id, simplify = TRUE)) # coerces all types to be the same (character?)
+  return(SH$hash_id(df[id_vars]))
 }
 
 RS_compute_base_memory <- function(df_id, df, id_vars, tracked_vars){
@@ -167,7 +194,7 @@ RS_compute_base_memory <- function(df_id, df, id_vars, tracked_vars){
   ; if(!identical(dim(id_hashes), c(16L, nrow(df)))) return(simpleCondition("Internal error in id_vars hash preparation"))
   ; if(any(duplicated(id_hashes, MARGIN = 2))) return(simpleCondition("Found duplicated IDs"))
 
-  tracked_hashes <- apply(df[tracked_vars], 1, SH$hash_tracked, simplify = TRUE)
+  tracked_hashes <- SH$hash_tracked(df[tracked_vars])
   ; if(!identical(dim(tracked_hashes), c(length(tracked_vars), nrow(df)))) 
     return(simpleCondition("Internal error in tracked_vars hash preparation"))
   
@@ -253,9 +280,10 @@ RS_compute_delta_memory <- function(state, df){
   id_hashes <- RS_compute_id_hashes(df, id_vars) |> c() |> array(dim = c(16L, nrow(df)))
 
   tracked_vars <- state$tracked_vars
-  tracked_hashes <- (apply(df[tracked_vars], 1, SH$hash_tracked, simplify = TRUE) |> c() |> 
+  # FIXME: (LUIS): Ask Miguel about the postlude
+  tracked_hashes <- (SH$hash_tracked(df[tracked_vars]) |> c() |> 
                        array(dim = c(length(tracked_vars), nrow(df))))
-  
+
   # Assert against removal of rows
   local({
     merged <- cbind(id_hashes, state$id_hashes, deparse.level = 0)
diff --git a/tests/testthat/test-for_apply_bug.R b/tests/testthat/test-for_apply_bug.R
@@ -0,0 +1,112 @@
+# 303747-replace-apply-for-bug
+# 
+# When apply is used to iterate over the rows of a data.frame, it first coerces all columns to a common
+# representation. In the cases below the common representation is character, so the whole df is converted to
+# character before the loop starts. Surprisingly, the conversion pads values: because the widest element in b is two
+# characters wide, b is cast to a character vector whose elements all have width 2, left-padded with spaces.
+#
+# # apply(as.data.frame(list(a = c(1L,2L), b = c(2L, 20L), c = "B")), 1, function(row) paste(row, collapse = "%%"))
+# # [1] "1%% 2%%B" "2%%20%%B" # Notice the space before the two, when one element is of at least width 2
+#
+# # apply(as.data.frame(list(a = c(1L,2L), b = c(2L, 0L), c = "B")), 1, function(row) paste0(row, collapse = "%%"))
+# # [1] "1%%2%%B" "2%%0%%B" # Notice how the space is not there when all elements are of width 1
+#
+# # apply(as.data.frame(list(a = c(1L,2L), b = c(2L, 100L), c = "B")), 1, function(row) paste0(row, collapse = "%%"))
+# # [1] "1%%  2%%B" "2%%100%%B" # Notice the two spaces when widest element is of length three
+#
+# Also fails for 
+# # apply(as.data.frame(list(a = c(1.21,2.2), c = "B")), 1, function(row) paste0(row, collapse = "%%")) # See trailing 0 in 2.2
+# # apply(as.data.frame(list(a = c(1.2,2.2), c = "B")), 1, function(row) paste0(row, collapse = "%%")) 
+#
+# When calculating hashes the following call was used
+# # apply(df, 1, SH$hash_tracked, simplify = TRUE)
+# Therefore the hash returned for a row could differ even when that row in df was unchanged, because the padded
+# string representation of a value depends on the other rows of df.
+#
+
+# Save the current RNG state for later restoration, then reseed from the clock so each run draws different random data
+if (!exists(".Random.seed", envir = .GlobalEnv)) runif(1)  # ensure seed exists (drawing one number creates it)
+old_seed <- .Random.seed
+on.exit(assign(".Random.seed", old_seed, envir = .GlobalEnv), add = TRUE)
+set.seed(Sys.time()) # NOTE(review): the clock-based seed is never logged, so a failing run cannot be replayed
+
+generate_random_df <- function(n_rows, equal_length = FALSE) {
+  # Returns a data.frame with n_rows rows and the columns num, int, date, log, factor and char.
+  # With equal_length = TRUE the num, int, factor and char values are generated with a fixed number of characters.
+  alphabet <- c(letters, LETTERS)
+  longest <- 20
+  # One random string: its length is drawn first (1..20, or always 20 when equal_length), then its letters.
+  one_string <- function() {
+    n_chars <- if (equal_length) longest else sample(1:longest, 1)
+    paste0(sample(alphabet, n_chars, replace = TRUE), collapse = "")
+  }
+  random_strings <- function(n) replicate(n, one_string())
+
+  if (equal_length) {
+    # Two-digit whole part plus a fractional part between 0.01 and 0.09; two-digit integers.
+    num <- sample(10:99, n_rows, replace = TRUE) + sample(1:9, n_rows, replace = TRUE) / 100
+    int <- as.integer(sample(10:99, n_rows, replace = TRUE))
+  } else {
+    num <- rnorm(n_rows, mean = 100)
+    int <- as.integer(sample(1:100, n_rows, replace = TRUE))
+  }
+
+  # Dates are sampled from 2000-01-01 up to and including 2020-12-31.
+  first_day <- as.Date("2000-01-01")
+  n_days <- as.numeric(as.Date("2020-12-31") - first_day)
+  data.frame(
+    num = num,
+    int = int,
+    date = first_day + sample(0:n_days, n_rows, TRUE),
+    log = sample(c(TRUE, FALSE), n_rows, replace = TRUE),
+    factor = factor(random_strings(n_rows)),
+    char = random_strings(n_rows)
+  )
+}
+
+
+# Double-programming reference for the previous hash functions: cast columns with as.character, then apply `fun` per row
+fixed_apply_hash <- function(df, fun) {
+  columns_as_text <- lapply(df, as.character)
+  n_rows <- length(columns_as_text[[1]])
+  apply(matrix(unlist(columns_as_text), nrow = n_rows, ncol = length(columns_as_text)), 1, fun)
+}
+
+
+test_that(
+  "apply, fixed_lapply and hash_id are identical when rows of each column have the same width", {
+  # Same-width data: apply()'s character coercion should not pad anything, so all three must agree.
+  random_df <- generate_random_df(100, equal_length = TRUE)
+  ref_hash_id <- SH$`..ref`$hash_id
+  via_apply <- apply(random_df, 1, ref_hash_id, simplify = TRUE)
+  via_fixed_apply <- fixed_apply_hash(random_df, ref_hash_id)
+  expect_identical(via_apply, via_fixed_apply)
+  expect_identical(via_apply, SH$hash_id(random_df))
+})
+
+test_that(
+  "fixed_lapply and hash_id are identical when rows of each column have different width", {
+  # Plain apply() is not compared here: with mixed widths its padding changes the hashed strings.
+  random_df <- generate_random_df(100, equal_length = FALSE)
+  expected <- fixed_apply_hash(random_df, SH$`..ref`$hash_id)
+  expect_identical(expected, SH$hash_id(random_df))
+})
+
+test_that(
+  "apply, fixed_lapply and hash_tracked are identical when rows of each column have the same width", {
+  # Same-width data: apply()'s character coercion should not pad anything, so all three must agree.
+  random_df <- generate_random_df(100, equal_length = TRUE)
+  ref_hash_tracked <- SH$`..ref`$hash_tracked
+  via_apply <- apply(random_df, 1, ref_hash_tracked, simplify = TRUE)
+  via_fixed_apply <- fixed_apply_hash(random_df, ref_hash_tracked)
+  expect_identical(via_apply, via_fixed_apply)
+  expect_identical(via_apply, SH$hash_tracked(random_df))
+})
+
+test_that(
+  "fixed_lapply and hash_tracked are identical when rows of each column have different width", {
+  # Plain apply() is not compared here: with mixed widths its padding changes the hashed strings.
+  random_df <- generate_random_df(100, equal_length = FALSE)
+  expected <- fixed_apply_hash(random_df, SH$`..ref`$hash_tracked)
+  expect_identical(expected, SH$hash_tracked(random_df))
+})
diff --git a/tests/testthat/test-hash_tracked.R b/tests/testthat/test-hash_tracked.R
@@ -1,6 +1,6 @@
 test_that("SH$hash_tracked exhibits almost no false negatives and few false positives", {
   hash_df <- function(df, tracked_vars) {
-    hashes <- apply(df[tracked_vars], 1, SH$hash_tracked, simplify = TRUE) # coerces all types to be the same (character?)
+    hashes <- SH$hash_tracked(df[tracked_vars])
     return(hashes)
   }
  

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`test_that("SH$hash_tracked exhibits almost no false negatives and few false positives", {`
`2`	`2`	`hash_df <- function(df, tracked_vars) {`
`3`		`- hashes <- apply(df[tracked_vars], 1, SH$hash_tracked, simplify = TRUE) # coerces all types to be the same (character?)`
	`3`	`+ hashes <- SH$hash_tracked(df[tracked_vars])`
`4`	`4`	`return(hashes)`
`5`	`5`	`}`
`6`	`6`