Merge pull request #763 from SebKrantz/development

SebKrantz · web-flow · commit c8dca5334651 · 2025-04-16T13:38:21.000+02:00
Development
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: collapse
 Title: Advanced and Fast Data Transformation
-Version: 2.1.1
-Date: 2025-04-14
+Version: 2.1.1.9000
+Date: 2025-04-16
 Authors@R: c(
            person("Sebastian", "Krantz", role = c("aut", "cre"), 
                   email = "sebastian.krantz@graduateinstitute.ch", 
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,7 @@
+# collapse 2.1.1.9000
+
+* `na_insert` has a new argument `set`: with `set = TRUE`, the values are inserted by reference.
+
 # collapse 2.1.1
 
 * `alloc(list(1), 2)` now gives `list(1, 1)` instead of `list(list(1), list(1))`, which can still be generated with `alloc(list(1), 2, simplify = FALSE)`. This change also affects `ftransform()`/`fmutate()`, making, e.g., `fmutate(data, y = list(1))` consistent with `dplyr::mutate(data, y = list(1))`. Thanks @MattAFiedler (#753). 
diff --git a/R/small_helper.R b/R/small_helper.R
@@ -445,17 +445,24 @@ na_omit <- function(X, cols = NULL, na.attr = FALSE, prop = 0, ...) {
   res
 }
 
-na_insert <- function(X, prop = 0.1, value = NA) {
+na_insert <- function(X, prop = 0.1, value = NA, set = FALSE) {
   if(is.list(X)) {
     n <- fnrow(X)
     nmiss <- floor(n * prop)
-    res <- duplAttributes(lapply(unattrib(X), function(y) `[<-`(y, sample.int(n, nmiss), value = value)), X)
+    if(set) {
+      lapply(unattrib(X), function(y) scv(y, sample.int(n, nmiss), value, TRUE))
+      return(invisible(X))
+    }
+    res <- duplAttributes(lapply(unattrib(X), function(y) scv(y, sample.int(n, nmiss), value)), X)
     return(if(inherits(X, "data.table")) alc(res) else res)
   }
   if(!is.atomic(X)) stop("X must be an atomic vector, array or data.frame")
   l <- length(X)
-  X[sample.int(l, floor(l * prop))] <- value
-  X
+  if(set) {
+    scv(X, sample.int(l, floor(l * prop)), value, TRUE)
+    return(invisible(X))
+  }
+  return(scv(X, sample.int(l, floor(l * prop)), value))
 }
 
 fdapply <- function(X, FUN, ...) duplAttributes(lapply(`attributes<-`(X, NULL), FUN, ...), X)
diff --git a/man/efficient-programming.Rd b/man/efficient-programming.Rd
@@ -69,8 +69,8 @@ na_focb(x, set = FALSE)     # (by reference). These also support lists (NULL/emp
 na_omit(X, cols = NULL,     # Faster na.omit for matrices and data frames,
         na.attr = FALSE,    # can use selected columns to check, attach indices,
         prop = 0, ...)      # and remove cases with a proportion of values missing
-na_insert(X, prop = 0.1,    # Insert missing values at random
-          value = NA)
+na_insert(X, prop = 0.1,    # Insert missing values at random (by reference)
+    value = NA, set = FALSE)
 missing_cases(X, cols=NULL, # The opposite of complete.cases(), faster for DF's.
   prop = 0, count = FALSE)  # See also kit::panyNA(), kit::pallNA(), kit::pcountNA()
 vlengths(X, use.names=TRUE) # Faster lengths() and nchar() (in C, no method dispatch)
@@ -88,7 +88,7 @@ cinv(x)                     # Choleski (fast) inverse of symmetric PD matrix, e.
 %- maybe also 'usage' for other objects documented here.
 \arguments{
   \item{X, V, R}{a vector, matrix or data frame.}
-  \item{x, v}{a (atomic) vector or matrix (\code{na_rm} also supports lists).}
+  \item{x, v}{a (atomic) vector or matrix (\code{na_rm}/\code{na_locf}/\code{na_focb} also support lists).}
   \item{value}{a single value of any (atomic) vector type. For \code{whichv} it can also be a \code{length(x)} vector.}
   \item{invert}{logical. \code{TRUE} considers elements \code{x != value}.}
   \item{set}{logical. \code{TRUE} transforms \code{x} by reference.}
diff --git a/src/kit_dup.c b/src/kit_dup.c
@@ -128,20 +128,22 @@ SEXP dupVecIndex(SEXP x) {
   } break;
   case REALSXP: {
     const double *restrict px = REAL(x);
+    // size_t offset;
     union uno tpv;
     for (int i = 0; i != n; ++i) {
       tpv.d = px[i]; // R_IsNA(px[i]) ? NA_REAL : (R_IsNaN(px[i]) ? R_NaN : px[i]);
       id = HASH(tpv.u[0] + tpv.u[1], K);
-      // Double hashing idea: not faster!
+      // // Double hashing idea: not faster!
       // if(h[id]) {
       //   if(REQUAL(px[h[id]-1], px[i])) {
       //     pans_i[i] = pans_i[h[id]-1]; // h[id];
       //     continue;
       //   }
-      //   offset = (id / M) + 1;
+      //   offset = HASH(tpv.u[0] * tpv.u[1], K) / M + 1;
       //   // if(offset == 0) offset = 1;
       //   id += offset;
       //   id %= M;
+      //   // if(id >= M) id = 0;
       //   while(h[id]) {
       //     if(REQUAL(px[h[id]-1], px[i])) {
       //       pans_i[i] = pans_i[h[id]-1]; // h[id];
@@ -152,7 +154,6 @@ SEXP dupVecIndex(SEXP x) {
       //     // if(id >= M) id = 0;
       //   }
       // }
-      //
       while(h[id]) {
         if(REQUAL(px[h[id]-1], px[i])) {
           pans_i[i] = pans_i[h[id]-1]; // h[id];