Skip to content

Commit 606866b

Browse files
authored
Merge pull request #24 from Boehringer-Ingelheim/feat/dataset_update_checks
[annotation] Store dataframe variable types in `.base` file.
2 parents 4966c1d + 7fe154b commit 606866b

2 files changed

Lines changed: 64 additions & 13 deletions

File tree

R/review_structures.R

Lines changed: 42 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,33 @@ RS_hash_data_frame <- function(df){
105105
return(res)
106106
}
107107

108+
# Encode the type of each column of `df` named in `vars` as a single byte.
#
# Arguments:
#   df:   data.frame-like object whose columns are looked up by name via `[[`.
#   vars: names of the columns to classify.
#
# Returns a raw vector with one element per entry of `vars`, in the same order.
# Byte values: Date = 1, POSIXct = 2, POSIXlt = 3, logical = 10, factor = 11,
# integer = 13, numeric = 14, complex = 15, character = 16, raw = 24.
#
# Fails the `checkmate` assertion if a column matches none of the tested types.
RS_compute_data_frame_variable_types <- function(df, vars){
109+
res <- raw(length(vars))
110+
for(i_var in seq_along(vars)){
111+
# 0 means "no type matched yet"; it is rejected by the assertion further down
v <- 0
112+
var <- df[[vars[[i_var]]]]
113+
114+
# The order of this comparison matters, because e.g. POSIXct variables are also numeric
115+
# They go from more to less restrictive
116+
if(inherits(var, "Date")) v <- 1
117+
else if(inherits(var, "POSIXct")) v <- 2
118+
else if(inherits(var, "POSIXlt")) v <- 3
119+
else if(is.logical(var)) v <- 10
120+
else if(is.factor(var)) v <- 11
121+
else if(is.integer(var)) v <- 13
122+
else if(is.numeric(var)) v <- 14
123+
else if(is.complex(var)) v <- 15
124+
else if(is.character(var)) v <- 16
125+
else if(is.raw(var)) v <- 24
126+
127+
# v is still 0 here only when none of the branches above matched
checkmate::assert_true(v != 0)
128+
129+
res[[i_var]] <- as.raw(v)
130+
}
131+
132+
return(res)
133+
}
134+
108135
RS_compute_base_memory <- function(df_id, df, id_vars, tracked_vars){
109136
checkmate::assert_string(df_id, min.chars = 1, max.chars = 65535)
110137
checkmate::assert_data_frame(df)
@@ -133,17 +160,19 @@ RS_compute_base_memory <- function(df_id, df, id_vars, tracked_vars){
133160
# loss of data in case something goes wrong, as we can look for the last correct byte on any given
134161
# file, discard the remainder bytes and end up in a consistent state.
135162
res <- c(
136-
charToRaw("LISTBASE"), # file magic code
137-
as.raw(0), # format version number
138-
as.raw(0), # generation marker
139-
SH$double_to_raw(SH$get_UTC_time_in_seconds()), # timestamp
140-
df_hash, # complete hash of input data.frame
141-
SH$string_to_raw(df_id), # domain string
142-
SH$character_vector_to_raw(id_vars), # identifier vars
143-
SH$character_vector_to_raw(tracked_vars), # tracked vars
144-
SH$integer_to_raw(nrow(df)), # row count
145-
id_hashes, # one hash of id_vars per row
146-
tracked_hashes # one hash of tracked_vars per row
163+
charToRaw("LISTBASE"), # file magic code
164+
as.raw(0), # format version number
165+
as.raw(0), # generation marker
166+
SH$double_to_raw(SH$get_UTC_time_in_seconds()), # timestamp
167+
df_hash, # complete hash of input data.frame
168+
SH$string_to_raw(df_id), # domain string
169+
SH$character_vector_to_raw(id_vars), # identifier vars (names)
170+
as.raw(RS_compute_data_frame_variable_types(df, id_vars)), # identifier vars (types)
171+
SH$character_vector_to_raw(tracked_vars), # tracked vars (names)
172+
as.raw(RS_compute_data_frame_variable_types(df, tracked_vars)), # tracked vars (types)
173+
SH$integer_to_raw(nrow(df)), # row count
174+
id_hashes, # one hash of id_vars per row
175+
tracked_hashes # one hash of tracked_vars per row
147176
)
148177

149178
return(res)
@@ -164,7 +193,9 @@ RS_parse_base <- function(contents){
164193
contents_hash <- readBin(con, raw(), 16L)
165194
domain_string <- SH$read_string_from_con(con)
166195
id_vars <- SH$read_character_vector_from_con(con)
196+
id_var_types <- readBin(con, raw(), length(id_vars))
167197
tracked_vars <- SH$read_character_vector_from_con(con)
198+
tracked_var_types <- readBin(con, raw(), length(tracked_vars))
168199
row_count <- readBin(con, integer(), 1L)
169200

170201
id_hashes <- SH$read_hashes_from_con(con, row_count, 16L)

vignettes/review_design_notes.Rmd

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -122,9 +122,9 @@ There will use a small collection of files for each input dataset configured for
122122
- 1 complete hash of "ae" data.frame
123123
- 1 domain string ("ae")
124124
- n `id_vars` column names
125-
- **MISSING**: n `id_vars` column types
125+
- n `id_vars` column types (see "Variable type encoding" below)
126126
- m `tracked_vars` column names
127-
- **MISSING**: m `tracked_vars` column types
127+
- m `tracked_vars` column types (see "Variable type encoding" below)
128128
- 1 row count
129129
- p (1 per "ae" row) `hash_id(ae[id_vars])`
130130
- p (1 per "ae" row, *m* bytes long) `hash_tracked(ae[tracked_vars])`
@@ -160,6 +160,26 @@ These file structures are designed so that they start with a short heterogeneous
160160

161161
These files won't benefit much from compression since their main content (the hashes) is by construction statistically indistinguishable from noise.
162162

163+
#### Variable type encoding
164+
The two "variable type" `.base` fields hold one byte per variable. Each byte takes one of the following values:
165+
166+
- Date: 1
167+
- POSIXct: 2
168+
- POSIXlt: 3
169+
- Logical: 10
170+
- Factor: 11
171+
- Integer: 13
172+
- Numeric: 14
173+
- Complex: 15
174+
- Character: 16
175+
- Raw: 24
176+
177+
Most of these values are taken from the base R `SEXPTYPE` enum definition (see `src/include/Rinternals.h` on any recent R source distribution).
178+
179+
The values assigned to the date and time types (`Date`, `POSIXct`, `POSIXlt`) are arbitrary, because those types are S3 classes and thus lack dedicated `SEXPTYPE` values.
180+
181+
The type of a `factor()` variable is not fully defined by it being tagged as such, since the levels and their internal encoding are also part of the type. For purposes of hashing, the review feature of `dv.listings` treats the content of factor columns as `character()` by mapping each value to its assigned string-like representation. This feature is also indifferent to a factor being ordered.
182+
163183
## Hashing
164184
We store hashes for the values of `id_vars` and `tracked_vars` dataset columns. These hashes serve as content IDs.
165185

0 commit comments

Comments
 (0)