Skip to content

Commit a8d7ec1

Browse files
committed
[annotation] Use two bytes per hash_tracked column instead of one.
1 parent cece2e4 commit a8d7ec1

3 files changed

Lines changed: 37 additions & 35 deletions

File tree

R/review_structures.R

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
# nolint start
2+
3+
BYTES_PER_TRACKED_HASH <- 2L
4+
25
SH <- local({ # _S_erialization _H_elpers
36
get_UTC_time_in_seconds <- function() as.numeric(structure(Sys.time(), tzone = 'UTC'))
47
double_to_raw <- function(v) writeBin(v, con = raw(0), endian = 'little', useBytes = TRUE)
@@ -36,10 +39,12 @@ SH <- local({ # _S_erialization _H_elpers
3639
..ref_hash_tracked <- function(row) {
3740
n_col <- length(row)
3841

39-
res <- raw(n_col)
42+
res <- raw(BYTES_PER_TRACKED_HASH * n_col)
4043
for(i_col in seq(n_col)){
4144
col_indices <- (((i_col-1) + hash_tracked_offsets) %% n_col) + 1
42-
res[[i_col]] <- ..ref_hash_tracked_inner(row[col_indices])[[1]] # most significant byte
45+
first <- BYTES_PER_TRACKED_HASH * (i_col-1) + 1
46+
last <- BYTES_PER_TRACKED_HASH * i_col
47+
res[first:last] <- ..ref_hash_tracked_inner(row[col_indices])[1:BYTES_PER_TRACKED_HASH] # most significant bytes
4348
i_col <- i_col + 1
4449
}
4550

@@ -63,9 +68,9 @@ SH <- local({ # _S_erialization _H_elpers
6368
res <- list()
6469
for (i_col in seq_len(n_col)) {
6570
col_indices <- (((i_col - 1) + hash_tracked_offsets) %% n_col) + 1
66-
res[[i_col]] <- vectorized_hash_row(df[col_indices], algo = "xxh32")[1,] # most significant byte
71+
res[[i_col]] <- vectorized_hash_row(df[col_indices], algo = "xxh32")[1:BYTES_PER_TRACKED_HASH,] # most significant bytes
6772
}
68-
res <- matrix(unlist(res), nrow = ncol(df), ncol = nrow(df), byrow = TRUE)
73+
res <- do.call(rbind, res)
6974
return(res)
7075
}
7176

@@ -195,7 +200,7 @@ RS_compute_base_memory <- function(df_id, df, id_vars, tracked_vars){
195200
; if(any(duplicated(id_hashes, MARGIN = 2))) return(simpleCondition("Found duplicated IDs"))
196201

197202
tracked_hashes <- SH$hash_tracked(df[tracked_vars])
198-
; if(!identical(dim(tracked_hashes), c(length(tracked_vars), nrow(df))))
203+
; if(!identical(dim(tracked_hashes), c(BYTES_PER_TRACKED_HASH*length(tracked_vars), nrow(df))))
199204
return(simpleCondition("Internal error in tracked_vars hash preparation"))
200205

201206
# NOTE: We choose a serialization scheme with a well-known encoding. This avoids security concerns over
@@ -244,7 +249,7 @@ RS_parse_base <- function(contents){
244249
row_count <- readBin(con, integer(), 1L)
245250

246251
id_hashes <- SH$read_hashes_from_con(con, row_count, 16L)
247-
tracked_hashes <- SH$read_hashes_from_con(con, row_count, length(tracked_vars))
252+
tracked_hashes <- SH$read_hashes_from_con(con, row_count, BYTES_PER_TRACKED_HASH*length(tracked_vars))
248253

249254
empty_read <- readBin(con, raw(), 1L)
250255
; if(length(empty_read) > 0) return(simpleCondition("Too much hash data"))
@@ -282,7 +287,7 @@ RS_compute_delta_memory <- function(state, df){
282287
tracked_vars <- state$tracked_vars
283288
# FIXME: (LUIS): Ask Miguel about the postlude
284289
tracked_hashes <- (SH$hash_tracked(df[tracked_vars]) |> c() |>
285-
array(dim = c(length(tracked_vars), nrow(df))))
290+
array(dim = c(BYTES_PER_TRACKED_HASH*length(tracked_vars), nrow(df))))
286291

287292
# Assert against removal of rows
288293
local({
@@ -353,11 +358,11 @@ RS_parse_delta <- function(contents, tracked_var_count){
353358
domain <- SH$read_string_from_con(con)
354359
new_row_count <- readBin(con, integer(), 1L)
355360
new_id_hashes <- SH$read_hashes_from_con(con, new_row_count, 16L)
356-
new_tracked_hashes <- SH$read_hashes_from_con(con, new_row_count, tracked_var_count)
361+
new_tracked_hashes <- SH$read_hashes_from_con(con, new_row_count, BYTES_PER_TRACKED_HASH*tracked_var_count)
357362
modified_row_count <- NA_integer_
358363
modified_row_indices <- SH$read_integer_vector_from_con(con)
359364
modified_row_count <- length(modified_row_indices)
360-
modified_tracked_hashes <- SH$read_hashes_from_con(con, modified_row_count, tracked_var_count)
365+
modified_tracked_hashes <- SH$read_hashes_from_con(con, modified_row_count, BYTES_PER_TRACKED_HASH*tracked_var_count)
361366

362367
empty_read <- readBin(con, raw(), 1L)
363368
; if(length(empty_read) > 0) return(simpleCondition("Too much hash data"))
@@ -466,6 +471,7 @@ RS_load <- function(base, deltas){
466471
base_timestamp <- res$timestamp
467472
for(delta in deltas){
468473
state_delta <- RS_parse_delta(contents = delta, tracked_var_count = length(res[["tracked_vars"]]))
474+
if(inherits(state_delta, "simpleCondition")) return(state_delta)
469475

470476
if(!identical(state_delta$generation, res$generation+1L))
471477
return(simpleCondition(paste("Wrong generation marker. Should be", res$generation+1L)))

tests/testthat/test-hash_tracked.R

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,8 @@
11
test_that("SH$hash_tracked exhibits almost no false negatives and few false positives", {
2-
hash_df <- function(df, tracked_vars) {
3-
hashes <- SH$hash_tracked(df[tracked_vars])
4-
return(hashes)
5-
}
6-
72
# TODO(miguel): Refactor and move next to hash_tracked into SH
83
report_changes <- function(df, h0, verbose = FALSE){
94
res <- list()
10-
h1 <- hash_df(df, tracked_vars = colnames(df))
5+
h1 <- SH$hash_tracked(df[colnames(df)])
116

127
offsets <- c(0, 2, 3)
138

@@ -17,6 +12,7 @@ test_that("SH$hash_tracked exhibits almost no false negatives and few false posi
1712
prev <- as.integer(h0[,i_row])
1813
cur <- as.integer(h1[,i_row])
1914
diff <- (prev != cur)
15+
diff <- apply(matrix(diff, ncol = BYTES_PER_TRACKED_HASH, byrow = TRUE), 1, any)
2016
evidence <- integer(n_col)
2117
for(i in seq_len(n_col)){
2218
v <- diff[[i]]
@@ -56,7 +52,7 @@ test_that("SH$hash_tracked exhibits almost no false negatives and few false posi
5652
}
5753

5854
stress <- function(df, test_count, changes_per_test){
59-
hashes <- hash_df(df, tracked_vars = colnames(df))
55+
hashes <- SH$hash_tracked(df[colnames(df)])
6056

6157
n_row <- nrow(df)
6258
n_col <- ncol(df)
@@ -116,7 +112,7 @@ test_that("SH$hash_tracked exhibits almost no false negatives and few false posi
116112
false_negatives <- append(false_negatives, tail)
117113
}
118114
if(i_rep <= length(reported_changes)) {
119-
if(length(false_positives) == 0) false_positive_first_delta <- expected_changes
115+
if(length(false_positives) == 0) false_positive_first_delta <- expected_changes
120116
tail <- reported_changes[i_rep:length(reported_changes)]
121117
false_positives <- append(false_positives, tail)
122118
}

vignettes/data_review.Rmd

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@ If we take a hypothetical "xyz" domain, `dv.listings` will store the following f
146146
- m `tracked_vars` variable types (see "Variable type encoding" below)
147147
- 1 row count
148148
- p (1 per "xyz" row) `hash_id(xyz[id_vars])`
149-
- p (1 per "xyz" row, *m* bytes long) `hash_tracked(xyz[tracked_vars])`
149+
- p (1 per "xyz" row, 2\*m bytes long) `hash_tracked(xyz[tracked_vars])`
150150
</details>
151151

152152
<details><summary>`xyz_001.delta` (one per domain dataset update)</summary>
@@ -158,10 +158,10 @@ If we take a hypothetical "xyz" domain, `dv.listings` will store the following f
158158
- 1 domain string ("xyz")
159159
- 1 count of new rows
160160
- n (1 per *new* "xyz" row) `hash_id(xyz[id_vars])`
161-
- n (1 per *new* "xyz" row, *m* bytes long) `hash_tracked(xyz[tracked_vars])`
161+
- n (1 per *new* "xyz" row, 2\*m bytes long) `hash_tracked(xyz[tracked_vars])`
162162
- 1 count of modified rows
163163
- p (1 per *modified* "xyz" row) row index
164-
- p (1 per *modified* "xyz" row, *m* bytes long) `hash_tracked(xyz[tracked_vars])`
164+
- p (1 per *modified* "xyz" row, 2\*m bytes long) `hash_tracked(xyz[tracked_vars])`
165165
</details>
166166

167167
<details><summary>`xyz_<ROLE>.review` (one per domain and ROLE)</summary>
@@ -180,7 +180,7 @@ If we take a hypothetical "xyz" domain, `dv.listings` will store the following f
180180

181181
(Row indices refer to indices in the stored base+delta matrix, which is append-only. These _canonical_ indices are as good as identifiers).
182182

183-
The dominant factor governing the size of these files is the length of a hash, which is 16 bytes, as we discuss in the "Hashing" session below. Row indices and delta timestamps can be encoded in 4 bytes. Review indices take up 1 byte each. Estimating an upper bound of 1 million rows per dataset, a `.base` file would take around 32 MiB. A comprehensive `.review` file for such a dataset would take around 9 MiB.
183+
The dominant factor governing the size of these files is the length of a hash, which is 16 bytes in the case of `hash_id` and 2\*m bytes (m being the number of tracked columns) in the case of `hash_tracked`, as we discuss in the "Hashing" section below. Row indices and delta timestamps can be encoded in 4 bytes. Review indices take up 1 byte each. Estimating an upper bound of 1 million rows per dataset, a `.base` file tracking 8 variables would take around 32 MiB. A comprehensive `.review` file for such a dataset would take around 9 MiB.
184184

185185
These file structures are designed so that they start with a short heterogeneous header that reiterates the information that can be gleaned from the file name. The rest of the records are all homogeneous and of known size. That allows to load them into memory without the need for expensive parsing.
186186

@@ -232,36 +232,36 @@ We opt instead for 128 bits, which makes the possibility of a collision extremel
232232
#### Hashing of tracked variables (`hash_tracked()`)
233233
We could apply the same reasoning behind the choice of the `hash_id()` function to the hashing of the variable parts of each row. We instead propose a more complex hashing scheme to provide partial information about *which variables of a row have been altered* when its hash changes.
234234

235-
Each hash value is *m* bytes long, where *m* is the number of variables tracked of a given dataset. Each of those bytes is an independent hash of three of the tracked variables of a dataset row. Each variable, in turn, contributes to three of the *m* byte-sized hashes. This mixing of variables makes it harder for an external adversarial observer of the `.base` and `.delta` files to brute-force the original values of the dataset by looking for collisions with the computed hash values.
235+
Each hash value is 2\*m bytes long, where *m* is the number of tracked variables of a given dataset. Each consecutive pair of bytes is an independent hash of three of the tracked variables of a dataset row. Each variable, in turn, contributes to three of the *m* two-byte hashes. This mixing of variables makes it harder for an external adversarial observer of the `.base` and `.delta` files to brute-force the original values of the dataset by looking for collisions with the computed hash values.
236236

237-
To compute which variables contribute to which hash byte, we use the following scheme:
237+
To compute which variables contribute to which hash byte pair, we use the following scheme:
238238

239-
- Byte *n*: Variables (*n*+0)%*n*, (*n*+2)%*n* and (*n*+3)%*n*
239+
- Byte pair *n*: Variables (*n*+0)%*m*, (*n*+2)%*m* and (*n*+3)%*m*
240240

241241
Where `%` indicates the remainder of the integer division.
242242

243243
So, for an input dataset with seven tracked variables (zero through six), this would mean:
244244

245-
- Byte 0: Variables 0, 2 and 3
246-
- Byte 1: Variables 1, 3 and 4
247-
- Byte 2: Variables 2, 4 and 5
248-
- Byte 3: Variables 3, 5 and 6
249-
- Byte 4: Variables 4, 6 and 0
250-
- Byte 5: Variables 5, 0 and 1
251-
- Byte 6: Variables 6, 1 and 2
245+
- Byte pair 0: Variables 0, 2 and 3
246+
- Byte pair 1: Variables 1, 3 and 4
247+
- Byte pair 2: Variables 2, 4 and 5
248+
- Byte pair 3: Variables 3, 5 and 6
249+
- Byte pair 4: Variables 4, 6 and 0
250+
- Byte pair 5: Variables 5, 0 and 1
251+
- Byte pair 6: Variables 6, 1 and 2
252252

253-
This scheme creates a unique mixtures of variables. Take, for instance, variable 0. It is combined with variables 2 and 3 on the zeroth byte, with variables 4 and 6 on the fourth byte and with 1 and 5 for the fifth byte.
253+
This scheme creates unique mixtures of variables. Take, for instance, variable 0. It is combined with variables 2 and 3 on the zeroth byte pair, with variables 4 and 6 on the fourth byte pair and with variables 1 and 5 on the fifth byte pair.
254254

255-
Each of these bytes is computed by:
255+
Each of these byte pairs is computed by:
256256

257257
- Taking the three values to hash.
258258
- Serializing them to text and concatenating them using the non-ASCII byte separator `1D` (also known as "group separator").
259-
- Computing the `xxh32` hash and returning its most significant byte.
259+
- Computing the `xxh32` hash and returning its two most significant bytes.
260260

261261
Informal testing (refer to `tests/testthat/test-hash_tracked.R` for more details) of this hashing scheme shows the following properties:
262262

263263
- It's capable of identifying up to four modified variables per row (after that, it's preferable to give up and notify the whole row as modified).
264-
- It has a very low **false negative rate** (a variable is modified without it being notified as such) of one for every 8 million row updates.
264+
- It has a very low **false negative rate** (a variable is modified without it being notified as such).
265265
- It has a low **false positive rate** (a variable that retains its value is notified as modified). This only happens when there are actual changes to a row.
266266

267267
False positives are not critical, as they ask reviewers to consider a larger set of variables when re-reviewing a row that has been altered.

0 commit comments

Comments
 (0)