From 1b3b82541538ecbf0a57039875f189ccfca88bc0 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 7 Sep 2023 20:20:29 +0200 Subject: [PATCH] fix issues with NA values in recodes (#455) * fix issues with NA values in recodes * add reserve_na attr, add tests * add comments * version bump * Update test-recode_into.R * scoping issue * rename objects in tests, maybe fixes random test order --- DESCRIPTION | 2 +- NEWS.md | 6 +++ R/recode_into.r | 26 ++++++++++++- man/recode_into.Rd | 14 ++++++- tests/testthat/test-recode_into.R | 64 ++++++++++++++++++++++++++++++- 5 files changed, 107 insertions(+), 5 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index e9c46fef5..76d6967bc 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.8.0.9 +Version: 0.8.0.10 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")), diff --git a/NEWS.md b/NEWS.md index 7e91322e7..295570ec4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -15,6 +15,9 @@ CHANGES * `recode_into()` gains an `overwrite` argument to skip overwriting already recoded cases when multiple recode patterns apply to the same case. +* `recode_into()` gains an `preserve_na` argument to preserve `NA` values + when recoding. + * `data_read()` now passes the `encoding` argument to `data.table::fread()`. This allows to read files with non-ASCII characters. @@ -28,6 +31,9 @@ BUG FIXES * Fixed issue in `recode_into()` with probably wrong case number printed in the warning when several recode patterns match to one case. +* Fixed issue in `recode_into()` when original data contained `NA` values and + `NA` was not included in the recode pattern. + * Fixed issue in `data_filter()` where functions containing a `=` (e.g. when naming arguments, like `grepl(pattern, x = a)`) were mistakenly seen as faulty syntax. diff --git a/R/recode_into.r b/R/recode_into.r index 8a382fbe1..b93dbe5d9 100644 --- a/R/recode_into.r +++ b/R/recode_into.r @@ -20,6 +20,10 @@ #' recode patterns. If `FALSE`, former recoded cases will not be altered by later #' recode patterns that would apply to those cases again. A warning message is #' printed to alert such situations and to avoid unintentional recodings. +#' @param preserve_na Logical, if `TRUE` (default) and `default` is not `NA`, +#' missing values in the original variable will be set back to `NA` in the +#' recoded variable (unless overwritten by other recode patterns). If `FALSE`, +#' missing values in the original variable will be recoded to `default`. #' @param verbose Toggle warnings. #' #' @return A vector with recoded values. @@ -73,7 +77,12 @@ #' default = 0 #' ) #' @export -recode_into <- function(..., data = NULL, default = NA, overwrite = TRUE, verbose = TRUE) { +recode_into <- function(..., + data = NULL, + default = NA, + overwrite = TRUE, + preserve_na = TRUE, + verbose = TRUE) { dots <- list(...) # get length of vector, so we know the length of the output vector @@ -135,6 +144,12 @@ recode_into <- function(..., data = NULL, default = NA, overwrite = TRUE, verbos index <- with(data, eval(dots[[i]][[2]])) value <- with(data, eval(dots[[i]][[3]])) } + # remember missing values, so we can add back later + missing_index <- is.na(index) + # make sure index has no missing values. when we have missing values in + # original expression, these are considered as "no match" and set to FALSE + # we handle NA value later and thus want to remove them from "index" now + index[is.na(index)] <- FALSE # overwriting values? do more recode-patterns match the same case? if (is.na(default)) { already_exists <- !is.na(out[index]) @@ -144,7 +159,7 @@ recode_into <- function(..., data = NULL, default = NA, overwrite = TRUE, verbos # save indices of overwritten cases overwritten_cases <- which(index)[already_exists] # tell user... - if (any(already_exists) && verbose) { + if (any(already_exists, na.rm = TRUE) && verbose) { if (overwrite) { msg <- paste( "Several recode patterns apply to the same cases.", @@ -164,7 +179,14 @@ recode_into <- function(..., data = NULL, default = NA, overwrite = TRUE, verbos if (!overwrite) { index[overwritten_cases] <- FALSE } + # write new values into output vector out[index] <- value + # set back missing values + if (any(missing_index) && !is.na(default) && preserve_na) { + # but only where we still have default values + # we don't want to overwrite already recoded values with NA + out[missing_index & out == default] <- NA + } } out diff --git a/man/recode_into.Rd b/man/recode_into.Rd index 064d72f6c..b3e164131 100644 --- a/man/recode_into.Rd +++ b/man/recode_into.Rd @@ -4,7 +4,14 @@ \alias{recode_into} \title{Recode values from one or more variables into a new variable} \usage{ -recode_into(..., data = NULL, default = NA, overwrite = TRUE, verbose = TRUE) +recode_into( + ..., + data = NULL, + default = NA, + overwrite = TRUE, + preserve_na = TRUE, + verbose = TRUE +) } \arguments{ \item{...}{A sequence of two-sided formulas, where the left hand side (LHS) @@ -25,6 +32,11 @@ recode patterns. If \code{FALSE}, former recoded cases will not be altered by la recode patterns that would apply to those cases again. A warning message is printed to alert such situations and to avoid unintentional recodings.} +\item{preserve_na}{Logical, if \code{TRUE} (default) and \code{default} is not \code{NA}, +missing values in the original variable will be set back to \code{NA} in the +recoded variable (unless overwritten by other recode patterns). If \code{FALSE}, +missing values in the original variable will be recoded to \code{default}.} + \item{verbose}{Toggle warnings.} } \value{ diff --git a/tests/testthat/test-recode_into.R b/tests/testthat/test-recode_into.R index ab5c7908b..df7cf60b2 100644 --- a/tests/testthat/test-recode_into.R +++ b/tests/testthat/test-recode_into.R @@ -170,7 +170,7 @@ test_that("recode_into, check differen input length", { ) }) -test_that("recode_into, check differen input length", { +test_that("recode_into, check different input length", { x <- 1:5 y <- c(5, 2, 3, 1, 4) expect_warning( @@ -184,3 +184,65 @@ test_that("recode_into, check differen input length", { regexp = "Several recode patterns" ) }) + +test_that("recode_into, make sure recode works with missing in original variable", { + data(mtcars) + mtcars$mpg[c(3, 10, 12, 15, 16)] <- NA + mtcars$cyl[c(2, 15, 16)] <- NA + d_recode_na <<- as.data.frame(mtcars) + out1_recoded_na <- recode_into( + d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1, + d_recode_na$mpg <= 20 ~ 2, + d_recode_na$cyl == 4 ~ 3, + default = 0 + ) + out2_recoded_na <- recode_into( + d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1, + d_recode_na$mpg <= 20 ~ 2, + default = 0 + ) + out3_recoded_na <- recode_into( + d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1, + d_recode_na$mpg <= 20 ~ 2, + d_recode_na$cyl == 4 ~ 3, + default = 0, + preserve_na = FALSE + ) + out4_recoded_na <- recode_into( + d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1, + d_recode_na$mpg <= 20 ~ 2, + default = 0, + preserve_na = FALSE + ) + # one NA in mpg is overwritten by valid value from cyl, total 5 NA + expect_identical( + out1_recoded_na, + c( + 1, NA, 3, 1, 2, 2, 2, 3, 3, NA, 2, NA, 2, 2, NA, NA, 2, 3, + 3, 3, 3, 2, 2, 2, 2, 3, 3, 3, 2, 2, 2, 3 + ) + ) + # total 6 NA + expect_identical( + out2_recoded_na, + c( + 1, NA, NA, 1, 2, 2, 2, 0, 0, NA, 2, NA, 2, 2, NA, NA, 2, 0, + 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 0 + ) + ) + # NA is preserved, set to default if not overwritten by other recodes + expect_identical( + out3_recoded_na, + c( + 1, 0, 3, 1, 2, 2, 2, 3, 3, 0, 2, 0, 2, 2, 0, 0, 2, 3, 3, 3, + 3, 2, 2, 2, 2, 3, 3, 3, 2, 2, 2, 3 + ) + ) + expect_identical( + out4_recoded_na, + c( + 1, 0, 0, 1, 2, 2, 2, 0, 0, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 0, + 0, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 0 + ) + ) +})