Avoid 0-length copy with potentially undefined behavior #1968

MichaelChirico · 2025-04-16T06:38:27Z

Updating R 4.4.1 to R 4.5.0 broke {vctrs} suites pretty badly (segfaults).

To be a bit more specific, this test breaks, though there may be others:

vctrs/tests/testthat/test-cast.R

Line 302 in 78d9f2b

expect_error(vec_rbind(gdf, gdf), NA)

Stack trace hidden but available below.

r-devel thread:

https://stat.ethz.ch/pipermail/r-devel/2024-June/083456.html

In sum:

R 4.5.0 make the behavior of previous versions under --enable-strict-barrier the default, i.e., strictly speaking this affects all R versions
Something about our compiler (perhaps -fsanitize-trap) makes this segfault for us where CRAN does not

stack trace...

 *** caught illegal operation ***
address 0x7f405afb3a6a, cause 'illegal operand'

Traceback:
 1: vec_locate_sorted_groups(x, nan_distinct = TRUE)
 2: dplyr_locate_sorted_groups(group_vars)
 3: compute_groups(data, vars, drop = drop)
 4: dplyr::grouped_df(x, vars, drop = drop)
 5: vec_restore.grouped_df(x = x, to = to)
 6: vec_restore_dispatch(x = x, to = to)
 7: (function () vec_restore_dispatch(x = x, to = to))()
 8: vec_rbind(gdf, gdf)
 9: eval_bare(quo_get_expr(.quo), quo_get_env(.quo))
10: withCallingHandlers(expr, condition = function(cnd) {    if (!is.null(matched) || !matches(cnd)) {        return()    }    if (can_entrace(cnd)) {        cnd <- cnd_entrace(cnd)    }    matched <<- cnd    if (inherits(cnd, "message") || inherits(cnd, "warning")) {        cnd_muffle(cnd)    }    else if (inherits(cnd, "error") || inherits(cnd, "skip")) {        return_from(tl, cnd)    }})
11: .capture(act$val <- eval_bare(quo_get_expr(.quo), quo_get_env(.quo)),     ...)
12: quasi_capture(enquo(object), label, capture_matching_condition,     matches = matcher)
13: expect_condition_matching("error", {    {        object    }}, regexp = regexp, class = class, ..., inherit = inherit, info = info,     label = label)
14: expect_error(vec_rbind(gdf, gdf), NA)
15: eval(code, test_env)
16: eval(code, test_env)
17: withCallingHandlers({    eval(code, test_env)    if (!handled && !is.null(test)) {        skip_empty()    }}, expectation = handle_expectation, skip = handle_skip, warning = handle_warning,     message = handle_message, error = handle_error)
18: doTryCatch(return(expr), name, parentenv, handler)
19: tryCatchOne(expr, names, parentenv, handlers[[1L]])
20: tryCatchList(expr, names[-nh], parentenv, handlers[-nh])
21: doTryCatch(return(expr), name, parentenv, handler)
22: tryCatchOne(tryCatchList(expr, names[-nh], parentenv, handlers[-nh]),     names[nh], parentenv, handlers[[nh]])
23: tryCatchList(expr, classes, parentenv, handlers)
24: tryCatch(withCallingHandlers({    eval(code, test_env)    if (!handled && !is.null(test)) {        skip_empty()    }}, expectation = handle_expectation, skip = handle_skip, warning = handle_warning,     message = handle_message, error = handle_error), error = handle_fatal,     skip = function(e) {    })
25: test_code(desc, code, env = parent.frame(), reporter = reporter)
26: test_that("bare-type fallback for df-cast works", {    local_methods(c.vctrs_foobaz = function(...) quux(NextMethod()))    df <- data_frame(x = 1, y = foobaz("foo"))    gdf <- dplyr::new_grouped_df(df, data_frame(x = 1, .rows = list(1L)),         class = "vctrs_foobar")    expect_error(vec_rbind(gdf, gdf), NA)})
27: eval(code, test_env)
28: eval(code, test_env)
29: withCallingHandlers({    eval(code, test_env)    if (!handled && !is.null(test)) {        skip_empty()    }}, expectation = handle_expectation, skip = handle_skip, warning = handle_warning,     message = handle_message, error = handle_error)
30: doTryCatch(return(expr), name, parentenv, handler)
31: tryCatchOne(expr, names, parentenv, handlers[[1L]])
32: tryCatchList(expr, names[-nh], parentenv, handlers[-nh])
33: doTryCatch(return(expr), name, parentenv, handler)
34: tryCatchOne(tryCatchList(expr, names[-nh], parentenv, handlers[-nh]),     names[nh], parentenv, handlers[[nh]])
35: tryCatchList(expr, classes, parentenv, handlers)
36: tryCatch(withCallingHandlers({    eval(code, test_env)    if (!handled && !is.null(test)) {        skip_empty()    }}, expectation = handle_expectation, skip = handle_skip, warning = handle_warning,     message = handle_message, error = handle_error), error = handle_fatal,     skip = function(e) {    })
37: test_code(test = NULL, code = exprs, env = env, reporter = get_reporter() %||%     StopReporter$new())
38: source_file(path, env = env(env), desc = desc, error_call = error_call)
39: FUN(X[[i]], ...)
40: lapply(test_paths, test_one_file, env = env, desc = desc, error_call = error_call)
41: doTryCatch(return(expr), name, parentenv, handler)
42: tryCatchOne(expr, names, parentenv, handlers[[1L]])
43: tryCatchList(expr, classes, parentenv, handlers)
44: tryCatch(code, testthat_abort_reporter = function(cnd) {    cat(conditionMessage(cnd), "\n")    NULL})
45: with_reporter(reporters$multi, lapply(test_paths, test_one_file,     env = env, desc = desc, error_call = error_call))
46: test_files_serial(test_dir = test_dir, test_package = test_package,     test_paths = test_paths, load_helpers = load_helpers, reporter = reporter,     env = env, stop_on_failure = stop_on_failure, stop_on_warning = stop_on_warning,     desc = desc, load_package = load_package, error_call = error_call)
47: test_files(test_dir = path, test_paths = test_paths, test_package = package,     reporter = reporter, load_helpers = load_helpers, env = env,     stop_on_failure = stop_on_failure, stop_on_warning = stop_on_warning,     load_package = load_package, parallel = parallel)
48: test_dir(test_path, package = package, reporter = reporter, ...,     load_package = "installed")
49: test_check("vctrs")
An irrecoverable exception occurred. R is aborting now ...

MichaelChirico · 2025-04-16T13:06:48Z

I added the vec_set_union() test cases as regression tests for:

r-lib/rlang#1793

MichaelChirico · 2025-04-21T17:58:22Z

Added reference to a relevant r-devel thread (h/t @aitap)

DavisVaughan · 2025-04-21T18:39:42Z

It's rather annoying that this change ultimately means you can't call REAL() and friends on a 0-length vector, even if your code ends up not using it when you call things like memcpy() with a size of 0.

That's extremely annoying ergonomics wise. Ideally algorithmic code should be able to work with any size from 0->Max without having to special case any size, and having to special case 0 everywhere you try and deref anything is extremely annoying. Like, IMO there is absolutely nothing wrong with this code:

void fn() {
  R_xlen_t size = Rf_xlength(x);
  const double* p_x = REAL(x);

  for (R_xlen_t i = 0; i < size; ++i) {
    double elt = p_x[i];
  }
}

Relevant commits:
r-devel/r-svn@92c1d5d
wch/r-source@d4dff56

Which ultimately turn on here:
https://github.com/r-devel/r-svn/blob/9976c3d7f08c754593d01ba8380afb6be803dde2/src/main/memory.c#L4137-L4161

We call memcpy() in many places, and deref with REAL() and friends all over the place. Guarding every one of these instances seems pretty unreasonable.

DavisVaughan · 2025-04-21T18:48:33Z

Actually I guess calling REAL() and friends on 0-length vectors is still "fine", you just get (void*) 1 back?

So it's a matter of ensuring you don't use that invalid pointer anywhere. So, for example, the above loop is still fine, I guess.

But memcpy() technically says:

Although paragraph 7.21.1/2 from the c99 standard says it is legal to pass a size of 0, it should be noted that memcpy(src, dest, 0) results in undefined behavior when src or dest points to invalid memory

https://www.reddit.com/r/Cprog/comments/33p5ho/is_it_legal_to_call_memcpy_with_zero_length_on_a/

And dest in this case is (void*) 1, therefore invalid memory I guess, and your checks seem to be picking that up.

I would have thought most implementations of memcpy() in practice still just "see" 0 as the size and do nothing, but I guess not?

So maybe a more holistic and ergonomic solution for rlang and vctrs is to introduce r_memcpy() which is just:

void* r_memcpy(void* dest, const void* src, size_t count) {
  if (count) {
    memcpy(dest, src, count);
  }
}

And just switch to that everywhere SEXP memory is involved.

MichaelChirico · 2025-04-21T20:04:51Z

That works for me. Agree about the awkward ergonomics.

I am still trying to track down exactly what it is about our setup that leads to the divergent behavior vs. CRAN. I'll report back if that ever bears fruit.

Ivan mentioned -fsanitize-trap here, but I don't think that's it:

mhahsler/arules#85

Avoid 0-length copy with potentially undefined behavior

ad50279

This was referenced Apr 16, 2025

growVector, copyAsPlain may formally cause undefined behaviour with 0-length vectors Rdatatable/data.table#6819

Closed

Don't memcpy a 0-length vector r-lib/rlang#1793

Closed

Add a new regression test for 2nd case

ca58a9d

This was referenced Apr 16, 2025

Avoid [0] access of 0-length objects r-lib/bit#44

Merged

Avoid potential 0-length access in memcpy paleolimbot/wk#227

Merged

Avoid passing invalid pointers to memcpy mhahsler/arules#85

Closed

DavisVaughan mentioned this pull request Apr 21, 2025

Use safer r_memcpy() and r_memset() everywhere r-lib/rlang#1797

Merged

This was referenced Apr 22, 2025

Avoid memcpy of invalid pointer jeroen/curl#403

Merged

Avoid memcpy of invalid pointer r-dbi/bigrquery#634

Open

Avoid memcpy of invalid pointer vegandevs/vegan#742

Merged

shikokuchuo mentioned this pull request Apr 25, 2025

Additional safety for memcpy on zero-length R vectors r-lib/nanonext#121

Merged

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Avoid 0-length copy with potentially undefined behavior #1968

Avoid 0-length copy with potentially undefined behavior #1968

Uh oh!

MichaelChirico commented Apr 16, 2025 •

edited

Loading

Uh oh!

MichaelChirico commented Apr 16, 2025

Uh oh!

MichaelChirico commented Apr 21, 2025

Uh oh!

DavisVaughan commented Apr 21, 2025 •

edited

Loading

Uh oh!

DavisVaughan commented Apr 21, 2025 •

edited

Loading

Uh oh!

MichaelChirico commented Apr 21, 2025 •

edited

Loading

Uh oh!

Uh oh!

Avoid 0-length copy with potentially undefined behavior #1968

Are you sure you want to change the base?

Avoid 0-length copy with potentially undefined behavior #1968

Uh oh!

Conversation

MichaelChirico commented Apr 16, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

MichaelChirico commented Apr 16, 2025

Uh oh!

MichaelChirico commented Apr 21, 2025

Uh oh!

DavisVaughan commented Apr 21, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

DavisVaughan commented Apr 21, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

MichaelChirico commented Apr 21, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Uh oh!

MichaelChirico commented Apr 16, 2025 •

edited

Loading

DavisVaughan commented Apr 21, 2025 •

edited

Loading

DavisVaughan commented Apr 21, 2025 •

edited

Loading

MichaelChirico commented Apr 21, 2025 •

edited

Loading