|
| 1 | +# 303747-replace-apply-for-bug |
| 2 | +# |
| 3 | +# When using apply to iterate through the rows of a data.frame it automatically casts the type to the a common |
| 4 | +# representation. In the case below, the common representation is character therefore before the loop starts all |
| 5 | +# df is transformed into character. Curiously, the casting does not work as one would expect and because the widest |
| 6 | +# element in b is two characters b will be casted in a character vector of elements of width 2 left-padded with spaces. |
| 7 | +# |
| 8 | +# # apply(as.data.frame(list(a = c(1L,2L), b = c(2L, 20L), c = "B")), 1, function(row) paste(row, collapse = "%%")) |
| 9 | +# # [1] "1%% 2%%B" "2%%20%%B" # Notice the space before the two, when one element is of at least width 2 |
| 10 | +# |
| 11 | +# # apply(as.data.frame(list(a = c(1L,2L), b = c(2L, 0L), c = "B")), 1, function(row) paste0(row, collapse = "%%")) |
| 12 | +# # [1] "1%%2%%B" "2%%0%%B" # Notice how the space is not there when all elements are of width 1 |
| 13 | +# |
| 14 | +# # apply(as.data.frame(list(a = c(1L,2L), b = c(2L, 100L), c = "B")), 1, function(row) paste0(row, collapse = "%%")) |
| 15 | +# # [1] "1%% 2%%B" "2%%100%%B" # Notice the two spaces when widest element is of length three |
| 16 | +# |
| 17 | +# Also fails for |
| 18 | +# # apply(as.data.frame(list(a = c(1.21,2.2), c = "B")), 1, function(row) paste0(row, collapse = "%%")) # See trailing 0 in 2.2 |
| 19 | +# # apply(as.data.frame(list(a = c(1.2,2.2), c = "B")), 1, function(row) paste0(row, collapse = "%%")) |
| 20 | +# |
| 21 | +# When calculating hashes the following call was used |
| 22 | +# # apply(df, 1, SH$hash_tracked, simplify = TRUE) |
| 23 | +# Therefore returned hashes for each row differ even when the row in df was the same, as the casted contents |
| 24 | +# differ. |
| 25 | +# |
| 26 | + |
| 27 | +# Save current RNG state |
| 28 | +if (!exists(".Random.seed", envir = .GlobalEnv)) runif(1) # ensure seed exists |
| 29 | +old_seed <- .Random.seed |
| 30 | +on.exit(assign(".Random.seed", old_seed, envir = .GlobalEnv), add = TRUE) |
| 31 | +set.seed(Sys.time()) |
| 32 | + |
| 33 | +generate_random_df <- function(n_rows, equal_length = FALSE) { |
| 34 | + |
| 35 | + random_strings <- function(n) { |
| 36 | + min_len <- 1 |
| 37 | + max_len <- 20 |
| 38 | + replicate( |
| 39 | + n, |
| 40 | + { |
| 41 | + l <- if(equal_length) max_len else sample(min_len:max_len, 1) |
| 42 | + paste0(sample(c(letters, LETTERS), l, replace = TRUE), collapse = "") |
| 43 | + } |
| 44 | + ) |
| 45 | + } |
| 46 | + |
| 47 | + # Random date range |
| 48 | + start_date <- as.Date("2000-01-01") |
| 49 | + end_date <- as.Date("2020-12-31") |
| 50 | + date_range <- as.numeric(end_date - start_date) |
| 51 | + |
| 52 | + num <- if(equal_length) sample(10:99, n_rows, replace = TRUE) + sample(1:9, n_rows, replace = TRUE)/100 else rnorm(n_rows, mean = 100) |
| 53 | + int <- as.integer(if(equal_length) sample(10:99, n_rows, replace = TRUE) else sample(1:100, n_rows, replace = TRUE)) |
| 54 | + |
| 55 | + df <- data.frame( |
| 56 | + num = num, |
| 57 | + int = int, |
| 58 | + date = start_date + sample(0:date_range, n_rows, TRUE), |
| 59 | + log = sample(c(TRUE, FALSE), n_rows, replace = TRUE), |
| 60 | + factor = factor(random_strings(n_rows)), |
| 61 | + char = random_strings(n_rows) |
| 62 | + ) |
| 63 | + |
| 64 | + return(df) |
| 65 | +} |
| 66 | + |
| 67 | + |
| 68 | +# Tested by double programming with the previous hash functions |
| 69 | +fixed_apply_hash <- function(df, fun) { |
| 70 | + char <- lapply(df, as.character) |
| 71 | + mat <- matrix(unlist(char), ncol = length(char), nrow = length(char[[1]])) |
| 72 | + apply(mat, 1, fun) |
| 73 | +} |
| 74 | + |
| 75 | + |
| 76 | +test_that( |
| 77 | + "apply, fixed_lapply and hash_id are identical when rows of each column have the same width", { |
| 78 | + |
| 79 | + df <- generate_random_df(100, equal_length = TRUE) |
| 80 | + apply_hash_res <- apply(df, 1, SH$`..ref`$hash_id, simplify = TRUE) |
| 81 | + fixed_apply_hash_res <- fixed_apply_hash(df, SH$`..ref`$hash_id) |
| 82 | + vectorized_hash <- SH$hash_id(df) |
| 83 | + expect_identical(apply_hash_res, fixed_apply_hash_res) |
| 84 | + expect_identical(apply_hash_res, vectorized_hash) |
| 85 | +}) |
| 86 | + |
| 87 | +test_that( |
| 88 | + "fixed_lapply and hash_id are identical when rows of each column have different width", { |
| 89 | + df <- generate_random_df(100, equal_length = FALSE) |
| 90 | + fixed_apply_hash_res <- fixed_apply_hash(df, SH$`..ref`$hash_id) |
| 91 | + vectorized_hash <- SH$hash_id(df) |
| 92 | + expect_identical(fixed_apply_hash_res, vectorized_hash) |
| 93 | +}) |
| 94 | + |
| 95 | +test_that( |
| 96 | + "apply, fixed_lapply and hash_tracked are identical when rows of each column have the same width", { |
| 97 | + |
| 98 | + df <- generate_random_df(100, equal_length = TRUE) |
| 99 | + apply_hash_res <- apply(df, 1, SH$`..ref`$hash_tracked, simplify = TRUE) |
| 100 | + fixed_apply_hash_res <- fixed_apply_hash(df, SH$`..ref`$hash_tracked) |
| 101 | + vectorized_hash <- SH$hash_tracked(df) |
| 102 | + expect_identical(apply_hash_res, fixed_apply_hash_res) |
| 103 | + expect_identical(apply_hash_res, vectorized_hash) |
| 104 | +}) |
| 105 | + |
| 106 | +test_that( |
| 107 | + "fixed_lapply and hash_tracked are identical when rows of each column have different width", { |
| 108 | + df <- generate_random_df(100, equal_length = FALSE) |
| 109 | + fixed_apply_hash_res <- fixed_apply_hash(df, SH$`..ref`$hash_tracked) |
| 110 | + vectorized_hash <- SH$hash_tracked(df) |
| 111 | + expect_identical(fixed_apply_hash_res, vectorized_hash) |
| 112 | +}) |
0 commit comments