Merge pull request #24 from Boehringer-Ingelheim/feat/dataset_update_checks

ml-ebs-ext · web-flow · commit 606866b3f237 · 2025-06-19T10:39:05.000+02:00
[annotation] Store dataframe variable types in `.base` file.
diff --git a/R/review_structures.R b/R/review_structures.R
@@ -105,6 +105,33 @@ RS_hash_data_frame <- function(df){
   return(res)
 }
 
+RS_compute_data_frame_variable_types <- function(df, vars){ # Encode the type of each `vars` column of `df` as a single raw byte
+  res <- raw(length(vars)) # Preallocated result: one type code per entry of `vars`, in the same order
+  for(i_var in seq_along(vars)){
+    v <- 0 # 0 means "no type matched yet"; it is rejected by the assertion further down
+    var <- df[[vars[[i_var]]]] # Column looked up by name; presumably NULL if absent, which matches no branch below
+    # Class-based checks (Date, POSIXct, POSIXlt) are tried first, followed by the is.*() type predicates.
+    # The order of this comparison matters, because e.g. POSIXct variables are also numeric.
+    # They go from more to less restrictive. NOTE(review): is.numeric() is documented as FALSE for Date/POSIXt; confirm.
+    if(inherits(var, "Date")) v <- 1
+    else if(inherits(var, "POSIXct")) v <- 2
+    else if(inherits(var, "POSIXlt")) v <- 3
+    else if(is.logical(var)) v <- 10
+    else if(is.factor(var)) v <- 11
+    else if(is.integer(var)) v <- 13
+    else if(is.numeric(var)) v <- 14
+    else if(is.complex(var)) v <- 15
+    else if(is.character(var)) v <- 16
+    else if(is.raw(var)) v <- 24
+    # A column that matched none of the branches above still has v == 0 and fails this assertion.
+    checkmate::assert_true(v != 0)
+    
+    res[[i_var]] <- as.raw(v) # Every code assigned above fits in one byte
+  }
+  
+  return(res) # Raw vector of length(vars)
+}
+
 RS_compute_base_memory <- function(df_id, df, id_vars, tracked_vars){
   checkmate::assert_string(df_id, min.chars = 1, max.chars = 65535)
   checkmate::assert_data_frame(df)
@@ -133,17 +160,19 @@ RS_compute_base_memory <- function(df_id, df, id_vars, tracked_vars){
   #       loss of data in case something goes wrong, as we can look for the last correct byte on any given
   #       file, discard the remainder bytes and end up in a consistent state.
   res <- c(
-    charToRaw("LISTBASE"),                          # file magic code
-    as.raw(0),                                      # format version number
-    as.raw(0),                                      # generation marker
-    SH$double_to_raw(SH$get_UTC_time_in_seconds()), # timestamp
-    df_hash,                                        # complete hash of input data.frame
-    SH$string_to_raw(df_id),                        # domain string
-    SH$character_vector_to_raw(id_vars),            # identifier vars
-    SH$character_vector_to_raw(tracked_vars),       # tracked vars
-    SH$integer_to_raw(nrow(df)),                    # row count
-    id_hashes,                                      # one hash of id_vars per row
-    tracked_hashes                                  # one hash of tracked_vars per row
+    charToRaw("LISTBASE"),                                          # file magic code
+    as.raw(0),                                                      # format version number
+    as.raw(0),                                                      # generation marker
+    SH$double_to_raw(SH$get_UTC_time_in_seconds()),                 # timestamp
+    df_hash,                                                        # complete hash of input data.frame
+    SH$string_to_raw(df_id),                                        # domain string
+    SH$character_vector_to_raw(id_vars),                            # identifier vars (names)
+    as.raw(RS_compute_data_frame_variable_types(df, id_vars)),      # identifier vars (types)
+    SH$character_vector_to_raw(tracked_vars),                       # tracked vars (names)
+    as.raw(RS_compute_data_frame_variable_types(df, tracked_vars)), # tracked vars (types)
+    SH$integer_to_raw(nrow(df)),                                    # row count
+    id_hashes,                                                      # one hash of id_vars per row
+    tracked_hashes                                                  # one hash of tracked_vars per row
   )
   
   return(res)
@@ -164,7 +193,9 @@ RS_parse_base <- function(contents){
   contents_hash <- readBin(con, raw(), 16L)
   domain_string <- SH$read_string_from_con(con)
   id_vars <- SH$read_character_vector_from_con(con)
+  id_var_types <- readBin(con, raw(), length(id_vars))
   tracked_vars <- SH$read_character_vector_from_con(con)
+  tracked_var_types <- readBin(con, raw(), length(tracked_vars))
   row_count <- readBin(con, integer(), 1L)
  
   id_hashes <- SH$read_hashes_from_con(con, row_count, 16L)
diff --git a/vignettes/review_design_notes.Rmd b/vignettes/review_design_notes.Rmd
@@ -122,9 +122,9 @@ There will use a small collection of files for each input dataset configured for
   - 1 complete hash of "ae" data.frame
   - 1 domain string ("ae")
   - n `id_vars` column names
-  - **MISSING**: n `id_vars` column types
+  - n `id_vars` column types (see "Variable type encoding" below)
   - m `tracked_vars` column names
-  - **MISSING**: m `tracked_vars` column types
+  - m `tracked_vars` column types (see "Variable type encoding" below)
   - 1 row count
   - p (1 per "ae" row) `hash_id(ae[id_vars])`
   - p (1 per "ae" row, *m* bytes long) `hash_tracked(ae[tracked_vars])`
@@ -160,6 +160,26 @@ These file structures are designed so that they start with a short heterogeneous
 
 These files won't benefit much from compression since their main content (the hashes) is by construction statistically indistinguishable from noise.
 
+#### Variable type encoding
+The two "variable type" `.base` fields are encoded as single bytes that take the following values:
+
+- Date: 1
+- POSIXct: 2
+- POSIXlt: 3
+- Logical: 10
+- Factor: 11
+- Integer: 13
+- Numeric: 14
+- Complex: 15
+- Character: 16
+- Raw: 24
+
+Most of these values are taken from the base R `SEXPTYPE` enum definition (see `src/include/Rinternals.h` in any recent R source distribution).
+
+The values assigned to time types are arbitrary, because they are S3 objects and thus lack dedicated `SEXPTYPE` values.
+
+The type of a `factor()` variable is not fully defined by it being tagged as such, since the levels and their internal encoding are also part of the type. For purposes of hashing, the review feature of `dv.listings` treats the content of factor columns as `character()` by mapping their values to their assigned string-like representation. This feature is also indifferent to a factor being ordered.
+
 ## Hashing
 We store hashes for the values of `id_vars` and `tracked_vars` dataset columns. These hashes serve as content IDs.