Skip to content

Commit f649008

Browse files
committed
[annotation] Store dataframe variable types in .base file.
1 parent 4966c1d commit f649008

2 files changed

Lines changed: 62 additions & 13 deletions

File tree

R/review_structures.R

Lines changed: 40 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,31 @@ RS_hash_data_frame <- function(df){
105105
return(res)
106106
}
107107

108+
RS_compute_data_frame_variable_types <- function(df, vars){
  # Computes the single-byte type code of each variable of `df` listed in `vars`.
  #
  # Arguments:
  #   df:   data.frame (anything indexable with `[[`) holding the variables.
  #   vars: names of the variables to encode, in the order the codes are returned.
  #
  # Returns a raw vector of length `length(vars)` holding one type code per variable:
  #   Date: 1, POSIXct: 2, POSIXlt: 3, logical: 10, factor: 11, integer: 13,
  #   numeric: 14, complex: 15, character: 16, raw: 24
  #
  # Raises an error naming the offending variable when it is absent from `df`
  # or when its type is not one of the above.
  compute_type_code <- function(var_name){
    column <- df[[var_name]]

    # `[[` returns NULL for an absent column; report that explicitly instead of
    # letting it surface as an "unsupported type" failure.
    if(is.null(column)){
      stop(
        sprintf("Variable '%s' is not present in the data.frame", as.character(var_name)),
        call. = FALSE
      )
    }

    # Order matters: class-based checks go first, `is.factor` precedes `is.integer`
    # because factors are stored as integers, and `is.integer` precedes `is.numeric`
    # because `is.numeric` is also TRUE for integers.
    code <- 0L
    if(inherits(column, "Date")) code <- 1L
    else if(inherits(column, "POSIXct")) code <- 2L
    else if(inherits(column, "POSIXlt")) code <- 3L
    else if(is.logical(column)) code <- 10L
    else if(is.factor(column)) code <- 11L
    else if(is.integer(column)) code <- 13L
    else if(is.numeric(column)) code <- 14L
    else if(is.complex(column)) code <- 15L
    else if(is.character(column)) code <- 16L
    else if(is.raw(column)) code <- 24L

    if(code == 0L){
      stop(
        sprintf(
          "Variable '%s' has unsupported type '%s'",
          as.character(var_name), paste(class(column), collapse = "/")
        ),
        call. = FALSE
      )
    }

    return(code)
  }

  # USE.NAMES = FALSE keeps the result unnamed, matching a plain raw vector.
  res <- as.raw(vapply(vars, compute_type_code, integer(1), USE.NAMES = FALSE))

  return(res)
}
132+
108133
RS_compute_base_memory <- function(df_id, df, id_vars, tracked_vars){
109134
checkmate::assert_string(df_id, min.chars = 1, max.chars = 65535)
110135
checkmate::assert_data_frame(df)
@@ -133,17 +158,19 @@ RS_compute_base_memory <- function(df_id, df, id_vars, tracked_vars){
133158
# loss of data in case something goes wrong, as we can look for the last correct byte on any given
134159
# file, discard the remainder bytes and end up in a consistent state.
135160
res <- c(
136-
charToRaw("LISTBASE"), # file magic code
137-
as.raw(0), # format version number
138-
as.raw(0), # generation marker
139-
SH$double_to_raw(SH$get_UTC_time_in_seconds()), # timestamp
140-
df_hash, # complete hash of input data.frame
141-
SH$string_to_raw(df_id), # domain string
142-
SH$character_vector_to_raw(id_vars), # identifier vars
143-
SH$character_vector_to_raw(tracked_vars), # tracked vars
144-
SH$integer_to_raw(nrow(df)), # row count
145-
id_hashes, # one hash of id_vars per row
146-
tracked_hashes # one hash of tracked_vars per row
161+
charToRaw("LISTBASE"), # file magic code
162+
as.raw(0), # format version number
163+
as.raw(0), # generation marker
164+
SH$double_to_raw(SH$get_UTC_time_in_seconds()), # timestamp
165+
df_hash, # complete hash of input data.frame
166+
SH$string_to_raw(df_id), # domain string
167+
SH$character_vector_to_raw(id_vars), # identifier vars (names)
168+
as.raw(RS_compute_data_frame_variable_types(df, id_vars)), # identifier vars (types)
169+
SH$character_vector_to_raw(tracked_vars), # tracked vars (names)
170+
as.raw(RS_compute_data_frame_variable_types(df, tracked_vars)), # tracked vars (types)
171+
SH$integer_to_raw(nrow(df)), # row count
172+
id_hashes, # one hash of id_vars per row
173+
tracked_hashes # one hash of tracked_vars per row
147174
)
148175

149176
return(res)
@@ -164,7 +191,9 @@ RS_parse_base <- function(contents){
164191
contents_hash <- readBin(con, raw(), 16L)
165192
domain_string <- SH$read_string_from_con(con)
166193
id_vars <- SH$read_character_vector_from_con(con)
194+
id_var_types <- readBin(con, raw(), length(id_vars))
167195
tracked_vars <- SH$read_character_vector_from_con(con)
196+
tracked_var_types <- readBin(con, raw(), length(tracked_vars))
168197
row_count <- readBin(con, integer(), 1L)
169198

170199
id_hashes <- SH$read_hashes_from_con(con, row_count, 16L)

vignettes/review_design_notes.Rmd

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -122,9 +122,9 @@ There will use a small collection of files for each input dataset configured for
122122
- 1 complete hash of "ae" data.frame
123123
- 1 domain string ("ae")
124124
- n `id_vars` column names
125-
- **MISSING**: n `id_vars` column types
125+
- n `id_vars` column types (see "Variable type encoding" below)
126126
- m `tracked_vars` column names
127-
- **MISSING**: m `tracked_vars` column types
127+
- m `tracked_vars` column types (see "Variable type encoding" below)
128128
- 1 row count
129129
- p (1 per "ae" row) `hash_id(ae[id_vars])`
130130
- p (1 per "ae" row, *m* bytes long) `hash_tracked(ae[tracked_vars])`
@@ -160,6 +160,26 @@ These file structures are designed so that they start with a short heterogeneous
160160

161161
These files won't benefit much from compression since their main content (the hashes) is by construction statistically indistinguishable from noise.
162162

163+
#### Variable type encoding
164+
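Each of the two "variable type" `.base` fields is encoded as a sequence of single bytes, one per variable and in the same order as the corresponding column names. Each byte takes one of the following values: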
165+
166+
- Date: 1
167+
- POSIXct: 2
168+
- POSIXlt: 3
169+
- Logical: 10
170+
- Factor: 11
171+
- Integer: 13
172+
- Numeric: 14
173+
- Complex: 15
174+
- Character: 16
175+
- Raw: 24
176+
177+
Most of these values are taken from the base R `SEXPTYPE` enum definition (see `src/include/Rinternals.h` in any recent R source distribution).
178+
179+
The values assigned to time types are arbitrary, because they are S3 objects and thus lack dedicated `SEXPTYPE` values.
180+
181+
The type of a `factor()` variable is not fully defined by it being tagged as such, since the levels and their internal encoding are also part of the type. For purposes of hashing, the review feature of `dv.listings` treats the content of factor columns as `character()` by mapping their values to their assigned string-like representation. This feature is also indifferent to whether a factor is ordered.
182+
163183
## Hashing
164184
We store hashes for the values of `id_vars` and `tracked_vars` dataset columns. These hashes serve as content IDs.
165185

0 commit comments

Comments
 (0)