Skip to content

Commit cece2e4

Browse files
committed
Merge branch 'merge_point/review' into 303774-app_creator_vignette
2 parents ebc354e + c595979 commit cece2e4

3 files changed

Lines changed: 150 additions & 10 deletions

File tree

R/review_structures.R

Lines changed: 37 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,14 @@ SH <- local({ # _S_erialization _H_elpers
1515
return(res)
1616
}
1717

18-
hash_id <- function(row) {
18+
..ref_hash_id <- function(row) {
1919
input <- paste(row, collapse = '\1D')
2020
input <- charToRaw(input)
2121
res <- xxhashlite::xxhash_raw(input, as_raw = TRUE)
2222
return(res)
2323
}
2424

25-
hash_tracked_inner <- function(row) {
25+
..ref_hash_tracked_inner <- function(row) {
2626
# FIXME: Ensure that precision of numeric values does not affect serialization
2727
# Maybe by using a string hex representation of their binary contents
2828
input <- paste(row, collapse = '\1D')
@@ -33,19 +33,42 @@ SH <- local({ # _S_erialization _H_elpers
3333

3434
hash_tracked_offsets <- c(0, 2, 3)
3535

36-
hash_tracked <- function(row) {
36+
..ref_hash_tracked <- function(row) {
3737
n_col <- length(row)
3838

3939
res <- raw(n_col)
40-
for(i_col in seq(n_col)){
40+
for(i_col in seq(n_col)){
4141
col_indices <- (((i_col-1) + hash_tracked_offsets) %% n_col) + 1
42-
res[[i_col]] <- hash_tracked_inner(row[col_indices])[[1]] # most significant byte
42+
res[[i_col]] <- ..ref_hash_tracked_inner(row[col_indices])[[1]] # most significant byte
4343
i_col <- i_col + 1
4444
}
4545

4646
return(res)
4747
}
4848

49+
vectorized_hash_row <- function(df, algo = "xxh128") {
50+
vectorized_hash_id <- Vectorize(function(x) xxhashlite::xxhash_raw(charToRaw(x), as_raw = TRUE, algo = algo), USE.NAMES = FALSE, SIMPLIFY = FALSE)
51+
single_col <- do.call(function(...) paste(..., sep = "\1D"), lapply(df, as.character))
52+
hashed_col <- vectorized_hash_id(single_col)
53+
n_col <- length(hashed_col)
54+
n_row <- if (length(hashed_col) > 0) length(hashed_col[[1]]) else 0
55+
res <- matrix(unlist(vectorized_hash_id(single_col)), nrow = n_row, ncol = n_col)
56+
res
57+
}
58+
59+
hash_id <- vectorized_hash_row
60+
61+
hash_tracked <- function(df) {
62+
n_col <- ncol(df)
63+
res <- list()
64+
for (i_col in seq_len(n_col)) {
65+
col_indices <- (((i_col - 1) + hash_tracked_offsets) %% n_col) + 1
66+
res[[i_col]] <- vectorized_hash_row(df[col_indices], algo = "xxh32")[1,] # most significant byte
67+
}
68+
res <- matrix(unlist(res), nrow = ncol(df), ncol = nrow(df), byrow = TRUE)
69+
return(res)
70+
}
71+
4972
read_string_from_con <- function(con){
5073
res <- NULL
5174
n <- readBin(con, integer(), 1L)
@@ -85,6 +108,10 @@ SH <- local({ # _S_erialization _H_elpers
85108
integer_vector_to_raw = integer_vector_to_raw,
86109
hash_id = hash_id,
87110
hash_tracked = hash_tracked,
111+
..ref = list(
112+
hash_id = ..ref_hash_id,
113+
hash_tracked = ..ref_hash_tracked
114+
),
88115
read_string_from_con = read_string_from_con,
89116
read_character_vector_from_con = read_character_vector_from_con,
90117
read_integer_vector_from_con = read_integer_vector_from_con,
@@ -149,7 +176,7 @@ RS_parse_data_frame_variable_types <- function(v){
149176
}
150177

151178
RS_compute_id_hashes <- function(df, id_vars){
152-
return(apply(df[id_vars], 1, SH$hash_id, simplify = TRUE)) # coerces all types to be the same (character?)
179+
return(SH$hash_id(df[id_vars]))
153180
}
154181

155182
RS_compute_base_memory <- function(df_id, df, id_vars, tracked_vars){
@@ -167,7 +194,7 @@ RS_compute_base_memory <- function(df_id, df, id_vars, tracked_vars){
167194
; if(!identical(dim(id_hashes), c(16L, nrow(df)))) return(simpleCondition("Internal error in id_vars hash preparation"))
168195
; if(any(duplicated(id_hashes, MARGIN = 2))) return(simpleCondition("Found duplicated IDs"))
169196

170-
tracked_hashes <- apply(df[tracked_vars], 1, SH$hash_tracked, simplify = TRUE)
197+
tracked_hashes <- SH$hash_tracked(df[tracked_vars])
171198
; if(!identical(dim(tracked_hashes), c(length(tracked_vars), nrow(df))))
172199
return(simpleCondition("Internal error in tracked_vars hash preparation"))
173200

@@ -253,9 +280,10 @@ RS_compute_delta_memory <- function(state, df){
253280
id_hashes <- RS_compute_id_hashes(df, id_vars) |> c() |> array(dim = c(16L, nrow(df)))
254281

255282
tracked_vars <- state$tracked_vars
256-
tracked_hashes <- (apply(df[tracked_vars], 1, SH$hash_tracked, simplify = TRUE) |> c() |>
283+
# FIXME: (LUIS): Ask Miguel about the postlude
284+
tracked_hashes <- (SH$hash_tracked(df[tracked_vars]) |> c() |>
257285
array(dim = c(length(tracked_vars), nrow(df))))
258-
286+
259287
# Assert against removal of rows
260288
local({
261289
merged <- cbind(id_hashes, state$id_hashes, deparse.level = 0)
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
# 303747-replace-apply-for-bug
2+
#
3+
# When using apply to iterate through the rows of a data.frame it automatically casts the type to the a common
4+
# representation. In the case below, the common representation is character therefore before the loop starts all
5+
# df is transformed into character. Curiously, the casting does not work as one would expect and because the widest
6+
# element in b is two characters b will be casted in a character vector of elements of width 2 left-padded with spaces.
7+
#
8+
# # apply(as.data.frame(list(a = c(1L,2L), b = c(2L, 20L), c = "B")), 1, function(row) paste(row, collapse = "%%"))
9+
# # [1] "1%% 2%%B" "2%%20%%B" # Notice the space before the two, when one element is of at least width 2
10+
#
11+
# # apply(as.data.frame(list(a = c(1L,2L), b = c(2L, 0L), c = "B")), 1, function(row) paste0(row, collapse = "%%"))
12+
# # [1] "1%%2%%B" "2%%0%%B" # Notice how the space is not there when all elements are of width 1
13+
#
14+
# # apply(as.data.frame(list(a = c(1L,2L), b = c(2L, 100L), c = "B")), 1, function(row) paste0(row, collapse = "%%"))
15+
# # [1] "1%% 2%%B" "2%%100%%B" # Notice the two spaces when widest element is of length three
16+
#
17+
# Also fails for
18+
# # apply(as.data.frame(list(a = c(1.21,2.2), c = "B")), 1, function(row) paste0(row, collapse = "%%")) # See trailing 0 in 2.2
19+
# # apply(as.data.frame(list(a = c(1.2,2.2), c = "B")), 1, function(row) paste0(row, collapse = "%%"))
20+
#
21+
# When calculating hashes the following call was used
22+
# # apply(df, 1, SH$hash_tracked, simplify = TRUE)
23+
# Therefore returned hashes for each row differ even when the row in df was the same, as the casted contents
24+
# differ.
25+
#
26+
27+
# Save current RNG state
28+
if (!exists(".Random.seed", envir = .GlobalEnv)) runif(1) # ensure seed exists
29+
old_seed <- .Random.seed
30+
on.exit(assign(".Random.seed", old_seed, envir = .GlobalEnv), add = TRUE)
31+
set.seed(Sys.time())
32+
33+
generate_random_df <- function(n_rows, equal_length = FALSE) {
34+
35+
random_strings <- function(n) {
36+
min_len <- 1
37+
max_len <- 20
38+
replicate(
39+
n,
40+
{
41+
l <- if(equal_length) max_len else sample(min_len:max_len, 1)
42+
paste0(sample(c(letters, LETTERS), l, replace = TRUE), collapse = "")
43+
}
44+
)
45+
}
46+
47+
# Random date range
48+
start_date <- as.Date("2000-01-01")
49+
end_date <- as.Date("2020-12-31")
50+
date_range <- as.numeric(end_date - start_date)
51+
52+
num <- if(equal_length) sample(10:99, n_rows, replace = TRUE) + sample(1:9, n_rows, replace = TRUE)/100 else rnorm(n_rows, mean = 100)
53+
int <- as.integer(if(equal_length) sample(10:99, n_rows, replace = TRUE) else sample(1:100, n_rows, replace = TRUE))
54+
55+
df <- data.frame(
56+
num = num,
57+
int = int,
58+
date = start_date + sample(0:date_range, n_rows, TRUE),
59+
log = sample(c(TRUE, FALSE), n_rows, replace = TRUE),
60+
factor = factor(random_strings(n_rows)),
61+
char = random_strings(n_rows)
62+
)
63+
64+
return(df)
65+
}
66+
67+
68+
# Tested by double programming with the previous hash functions
69+
fixed_apply_hash <- function(df, fun) {
70+
char <- lapply(df, as.character)
71+
mat <- matrix(unlist(char), ncol = length(char), nrow = length(char[[1]]))
72+
apply(mat, 1, fun)
73+
}
74+
75+
76+
test_that(
77+
"apply, fixed_lapply and hash_id are identical when rows of each column have the same width", {
78+
79+
df <- generate_random_df(100, equal_length = TRUE)
80+
apply_hash_res <- apply(df, 1, SH$`..ref`$hash_id, simplify = TRUE)
81+
fixed_apply_hash_res <- fixed_apply_hash(df, SH$`..ref`$hash_id)
82+
vectorized_hash <- SH$hash_id(df)
83+
expect_identical(apply_hash_res, fixed_apply_hash_res)
84+
expect_identical(apply_hash_res, vectorized_hash)
85+
})
86+
87+
test_that(
88+
"fixed_lapply and hash_id are identical when rows of each column have different width", {
89+
df <- generate_random_df(100, equal_length = FALSE)
90+
fixed_apply_hash_res <- fixed_apply_hash(df, SH$`..ref`$hash_id)
91+
vectorized_hash <- SH$hash_id(df)
92+
expect_identical(fixed_apply_hash_res, vectorized_hash)
93+
})
94+
95+
test_that(
96+
"apply, fixed_lapply and hash_tracked are identical when rows of each column have the same width", {
97+
98+
df <- generate_random_df(100, equal_length = TRUE)
99+
apply_hash_res <- apply(df, 1, SH$`..ref`$hash_tracked, simplify = TRUE)
100+
fixed_apply_hash_res <- fixed_apply_hash(df, SH$`..ref`$hash_tracked)
101+
vectorized_hash <- SH$hash_tracked(df)
102+
expect_identical(apply_hash_res, fixed_apply_hash_res)
103+
expect_identical(apply_hash_res, vectorized_hash)
104+
})
105+
106+
test_that(
107+
"fixed_lapply and hash_tracked are identical when rows of each column have different width", {
108+
df <- generate_random_df(100, equal_length = FALSE)
109+
fixed_apply_hash_res <- fixed_apply_hash(df, SH$`..ref`$hash_tracked)
110+
vectorized_hash <- SH$hash_tracked(df)
111+
expect_identical(fixed_apply_hash_res, vectorized_hash)
112+
})

tests/testthat/test-hash_tracked.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
test_that("SH$hash_tracked exhibits almost no false negatives and few false positives", {
22
hash_df <- function(df, tracked_vars) {
3-
hashes <- apply(df[tracked_vars], 1, SH$hash_tracked, simplify = TRUE) # coerces all types to be the same (character?)
3+
hashes <- SH$hash_tracked(df[tracked_vars])
44
return(hashes)
55
}
66

0 commit comments

Comments
 (0)