From 42cf51f99ec56309581a7cc639344461a9a9e710 Mon Sep 17 00:00:00 2001 From: Joachim Gassen Date: Sun, 7 Jun 2020 18:00:26 +0200 Subject: [PATCH 1/2] Added naStrings option to 'null_to_na()' and predecessors. Addresses #98 and #314 --- DESCRIPTION | 2 +- R/fromJSON.R | 14 +++++--- R/list_to_vec.R | 4 +-- R/null_to_na.R | 4 +-- R/simplify.R | 10 +++--- R/simplifyDataFrame.R | 5 +-- jsonlite.Rproj | 1 + man/fromJSON.Rd | 3 ++ src/null_to_na.c | 35 ++++++++++++++----- .../test-fromJSON-custom-na-strings.R | 33 +++++++++++++++++ 10 files changed, 86 insertions(+), 25 deletions(-) create mode 100644 tests/testthat/test-fromJSON-custom-na-strings.R diff --git a/DESCRIPTION b/DESCRIPTION index 0d100a9b..858a4429 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -33,4 +33,4 @@ Suggests: rmarkdown, R.rsp, sp -RoxygenNote: 7.0.2 +RoxygenNote: 7.1.0 diff --git a/R/fromJSON.R b/R/fromJSON.R index 60ab812d..2f261b9b 100644 --- a/R/fromJSON.R +++ b/R/fromJSON.R @@ -32,6 +32,7 @@ #' @param factor how to encode factor objects: must be one of 'string' or 'integer' #' @param complex how to encode complex numbers: must be one of 'string' or 'list' #' @param raw how to encode raw objects: must be one of 'base64', 'hex' or 'mongo' +#' @param naStrings which strings to treat as NA when 'simplifyVector' is 'TRUE'. Defaults to c("NA", "NaN", "Inf", "-Inf") #' @param null how to encode NULL values within a list: must be one of 'null' or 'list' #' @param na how to print NA values: must be one of 'null' or 'string'. Defaults are class specific #' @param auto_unbox automatically \code{\link{unbox}} all atomic vectors of length 1. It is usually safer to avoid this and instead use the \code{\link{unbox}} function to unbox individual elements. @@ -75,7 +76,8 @@ #' identical(data3, flatten(data2)) #' } fromJSON <- function(txt, simplifyVector = TRUE, simplifyDataFrame = simplifyVector, - simplifyMatrix = simplifyVector, flatten = FALSE, ...) 
{ + simplifyMatrix = simplifyVector, flatten = FALSE, + naStrings = c("NA", "NaN", "Inf", "-Inf"), ...) { # check type if (!is.character(txt) && !inherits(txt, "connection")) { @@ -98,11 +100,13 @@ fromJSON <- function(txt, simplifyVector = TRUE, simplifyDataFrame = simplifyVec # call the actual function (with deprecated arguments) parse_and_simplify(txt = txt, simplifyVector = simplifyVector, simplifyDataFrame = simplifyDataFrame, - simplifyMatrix = simplifyMatrix, flatten = flatten, ...) + simplifyMatrix = simplifyMatrix, flatten = flatten, naStrings = naStrings, ...) } -parse_and_simplify <- function(txt, simplifyVector = TRUE, simplifyDataFrame = simplifyVector, - simplifyMatrix = simplifyVector, flatten = FALSE, unicode = TRUE, validate = TRUE, bigint_as_char = FALSE, ...){ +parse_and_simplify <- function(txt, simplifyVector = TRUE, + simplifyDataFrame = simplifyVector, simplifyMatrix = simplifyVector, + flatten = FALSE, unicode = TRUE, validate = TRUE, bigint_as_char = FALSE, + naStrings = c("NA", "NaN", "Inf", "-Inf"), ...){ if(!missing(unicode)){ message("Argument unicode has been deprecated. 
YAJL always parses unicode.") @@ -118,7 +122,7 @@ parse_and_simplify <- function(txt, simplifyVector = TRUE, simplifyDataFrame = s # post processing if (any(isTRUE(simplifyVector), isTRUE(simplifyDataFrame), isTRUE(simplifyMatrix))) { return(simplify(obj, simplifyVector = simplifyVector, simplifyDataFrame = simplifyDataFrame, - simplifyMatrix = simplifyMatrix, flatten = flatten, ...)) + simplifyMatrix = simplifyMatrix, flatten = flatten, naStrings = naStrings, ...)) } else { return(obj) } diff --git a/R/list_to_vec.R b/R/list_to_vec.R index cdcd81cb..f2ab5854 100644 --- a/R/list_to_vec.R +++ b/R/list_to_vec.R @@ -1,6 +1,6 @@ -list_to_vec <- function(x) { +list_to_vec <- function(x, naStrings = c("NA", "NaN", "Inf", "-Inf")) { isdates <- is_datelist(x) - out <- unlist(null_to_na(x), recursive = FALSE, use.names = FALSE) + out <- unlist(null_to_na(x, naStrings), recursive = FALSE, use.names = FALSE) if(isdates && is.numeric(out)){ structure(out, class = c("POSIXct", "POSIXt")) } else{ diff --git a/R/null_to_na.R b/R/null_to_na.R index a881e4fe..e20e6440 100644 --- a/R/null_to_na.R +++ b/R/null_to_na.R @@ -1,6 +1,6 @@ #' @useDynLib jsonlite C_null_to_na -null_to_na <- function(x) { - .Call(C_null_to_na, x) +null_to_na <- function(x, naStrings) { + .Call(C_null_to_na, x, naStrings) } #' @useDynLib jsonlite C_is_datelist diff --git a/R/simplify.R b/R/simplify.R index e59206eb..79428557 100644 --- a/R/simplify.R +++ b/R/simplify.R @@ -1,6 +1,6 @@ simplify <- function(x, simplifyVector = TRUE, simplifyDataFrame = TRUE, simplifyMatrix = TRUE, simplifyDate = simplifyVector, homoList = TRUE, flatten = FALSE, columnmajor = FALSE, - simplifySubMatrix = simplifyMatrix) { + simplifySubMatrix = simplifyMatrix, naStrings = c("NA", "NaN", "Inf", "-Inf")) { #This includes '[]' and '{}') if (!is.list(x) || !length(x)) { @@ -9,7 +9,8 @@ simplify <- function(x, simplifyVector = TRUE, simplifyDataFrame = TRUE, simplif # list can be a dataframe recordlist if (isTRUE(simplifyDataFrame) && 
is.recordlist(x)) { - mydf <- simplifyDataFrame(x, flatten = flatten, simplifyMatrix = simplifySubMatrix) + mydf <- simplifyDataFrame(x, flatten = flatten, + simplifyMatrix = simplifySubMatrix, naStrings = naStrings) if(isTRUE(simplifyDate) && is.data.frame(mydf) && is.datelist(mydf)){ return(parse_date(mydf[["$date"]])) } @@ -18,12 +19,13 @@ simplify <- function(x, simplifyVector = TRUE, simplifyDataFrame = TRUE, simplif # or a scalar list (atomic vector) if (isTRUE(simplifyVector) && is.null(names(x)) && is.scalarlist(x)) { - return(list_to_vec(x)) + return(list_to_vec(x, naStrings = naStrings)) } # apply recursively out <- lapply(x, simplify, simplifyVector = simplifyVector, simplifyDataFrame = simplifyDataFrame, - simplifyMatrix = simplifySubMatrix, columnmajor = columnmajor, flatten = flatten) + simplifyMatrix = simplifySubMatrix, columnmajor = columnmajor, flatten = flatten, + naStrings = naStrings) # fix for mongo style dates turning into scalars *after* simplifying # only happens when simplifyDataframe=FALSE diff --git a/R/simplifyDataFrame.R b/R/simplifyDataFrame.R index bd653cf5..36b8ae90 100644 --- a/R/simplifyDataFrame.R +++ b/R/simplifyDataFrame.R @@ -1,4 +1,4 @@ -simplifyDataFrame <- function(recordlist, columns, flatten, simplifyMatrix) { +simplifyDataFrame <- function(recordlist, columns, flatten, simplifyMatrix, naStrings) { # no records at all if (!length(recordlist)) { @@ -27,7 +27,8 @@ simplifyDataFrame <- function(recordlist, columns, flatten, simplifyMatrix) { # simplify vectors and nested data frames columnlist <- lapply(columnlist, simplify, simplifyVector = TRUE, simplifyDataFrame = TRUE, - simplifyMatrix = FALSE, simplifySubMatrix = simplifyMatrix, flatten = flatten) + simplifyMatrix = FALSE, simplifySubMatrix = simplifyMatrix, flatten = flatten, + naStrings = naStrings) # check that all elements have equal length columnlengths <- unlist(vapply(columnlist, function(z) { diff --git a/jsonlite.Rproj b/jsonlite.Rproj index 6a04a73f..8d5f5fc9 
100644 --- a/jsonlite.Rproj +++ b/jsonlite.Rproj @@ -17,3 +17,4 @@ StripTrailingWhitespace: Yes BuildType: Package PackageInstallArgs: --no-multiarch --with-keep.source --install-tests +PackageRoxygenize: rd,collate,namespace diff --git a/man/fromJSON.Rd b/man/fromJSON.Rd index d26c7010..ab16976c 100644 --- a/man/fromJSON.Rd +++ b/man/fromJSON.Rd @@ -13,6 +13,7 @@ fromJSON( simplifyDataFrame = simplifyVector, simplifyMatrix = simplifyVector, flatten = FALSE, + naStrings = c("NA", "NaN", "Inf", "-Inf"), ... ) @@ -45,6 +46,8 @@ toJSON( \item{flatten}{automatically \code{\link{flatten}} nested data frames into a single non-nested data frame} +\item{naStrings}{which strings to treat as NA when 'simplifyVector' is 'TRUE'. Defaults to c("NA", "NaN", "Inf", "-Inf")} + \item{...}{arguments passed on to class specific \code{print} methods} \item{x}{the object to be encoded} diff --git a/src/null_to_na.c b/src/null_to_na.c index d3eb307d..7cba8f91 100644 --- a/src/null_to_na.c +++ b/src/null_to_na.c @@ -5,27 +5,37 @@ /* This function takes a list and replaces all NULL values by NA. -In addition, it will parse strings "NA" "NaN" "Inf" and "-Inf", -unless there is at least one non-na string element in the list. +In addition, it will replace strings matched by 'naStrings' +(defaults to "NA" "NaN" "Inf" and "-Inf") with NA, unless there is +at least one non-na string element in the list. In that case converting to real values has no point because unlist() will coerse them back into a string anyway. 
*/ -SEXP C_null_to_na(SEXP x) { +SEXP C_null_to_na(SEXP x, SEXP naStrings) { int len = length(x); if(len == 0) return x; - //null always turns into NA + int len_naStrings = length(naStrings); + bool looks_like_na_string = false; bool looks_like_character_vector = false; + for (int i=0; i Date: Sun, 7 Jun 2020 18:34:59 +0200 Subject: [PATCH 2/2] Registered the new parameter in 'C_null_to_na' --- src/register.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/register.c b/src/register.c index 8cd16fec..8f91cf34 100644 --- a/src/register.c +++ b/src/register.c @@ -14,7 +14,7 @@ extern SEXP C_escape_chars(SEXP); extern SEXP C_is_datelist(SEXP); extern SEXP C_is_recordlist(SEXP); extern SEXP C_is_scalarlist(SEXP); -extern SEXP C_null_to_na(SEXP); +extern SEXP C_null_to_na(SEXP, SEXP); extern SEXP C_row_collapse_array(SEXP, SEXP); extern SEXP C_row_collapse_object(SEXP, SEXP, SEXP); extern SEXP C_transpose_list(SEXP, SEXP); @@ -37,7 +37,7 @@ static const R_CallMethodDef CallEntries[] = { {"C_is_datelist", (DL_FUNC) &C_is_datelist, 1}, {"C_is_recordlist", (DL_FUNC) &C_is_recordlist, 1}, {"C_is_scalarlist", (DL_FUNC) &C_is_scalarlist, 1}, - {"C_null_to_na", (DL_FUNC) &C_null_to_na, 1}, + {"C_null_to_na", (DL_FUNC) &C_null_to_na, 2}, {"C_row_collapse_array", (DL_FUNC) &C_row_collapse_array, 2}, {"C_row_collapse_object", (DL_FUNC) &C_row_collapse_object, 3}, {"C_transpose_list", (DL_FUNC) &C_transpose_list, 2},