diff --git a/R/recode_into.r b/R/recode_into.r index b93dbe5d9..a33f03eb7 100644 --- a/R/recode_into.r +++ b/R/recode_into.r @@ -20,10 +20,13 @@ #' recode patterns. If `FALSE`, former recoded cases will not be altered by later #' recode patterns that would apply to those cases again. A warning message is #' printed to alert such situations and to avoid unintentional recodings. -#' @param preserve_na Logical, if `TRUE` (default) and `default` is not `NA`, -#' missing values in the original variable will be set back to `NA` in the -#' recoded variable (unless overwritten by other recode patterns). If `FALSE`, -#' missing values in the original variable will be recoded to `default`. +#' @param preserve_na Logical, if `TRUE` and `default` is not `NA`, missing +#' values in the original variable will be set back to `NA` in the recoded +#' variable (unless overwritten by other recode patterns). If `FALSE`, missing +#' values in the original variable will be recoded to `default`. The former +#' behaviour prevents unintentional overwriting of missing values with `default`, +#' which means that you won't find valid values where the original data only +#' had missing values. See 'Examples'. #' @param verbose Toggle warnings. #' #' @return A vector with recoded values. 
@@ -76,12 +79,37 @@ #' data = d, #' default = 0 #' ) +#' +#' # handling of missing values +#' d <- data.frame( +#' x = c(1, NA, 2, NA, 3, 4), +#' y = c(1, 11, 3, NA, 5, 6) +#' ) +#' # first NA in x is overwritten by valid value from y +#' # we have no known value for second NA in x and y, +#' # thus we get one NA in the result +#' recode_into( +#' x <= 3 ~ 1, +#' y > 5 ~ 2, +#' data = d, +#' default = 0, +#' preserve_na = TRUE +#' ) +#' # first NA in x is overwritten by valid value from y +#' # default value is used for second NA +#' recode_into( +#' x <= 3 ~ 1, +#' y > 5 ~ 2, +#' data = d, +#' default = 0, +#' preserve_na = FALSE +#' ) #' @export recode_into <- function(..., data = NULL, default = NA, overwrite = TRUE, - preserve_na = TRUE, + preserve_na = FALSE, verbose = TRUE) { dots <- list(...) @@ -133,6 +161,9 @@ recode_into <- function(..., ) } + # indicator to show message when replacing NA by default + # needed to show message only once + overwrite_NA_msg <- TRUE # iterate all expressions for (i in seq_len(n_params)) { @@ -182,10 +213,18 @@ recode_into <- function(..., # write new values into output vector out[index] <- value # set back missing values - if (any(missing_index) && !is.na(default) && preserve_na) { - # but only where we still have default values - # we don't want to overwrite already recoded values with NA - out[missing_index & out == default] <- NA + if (any(missing_index) && !is.na(default)) { + if (preserve_na) { + # but only where we still have default values + # we don't want to overwrite already recoded values with NA + out[missing_index & out == default] <- NA + } else if (overwrite_NA_msg && verbose) { + # don't show msg again + overwrite_NA_msg <- FALSE + insight::format_alert( + "Missing values in original variable are overwritten by default value. If you want to preserve missing values, set `preserve_na = TRUE`." 
+ ) + } } } diff --git a/man/recode_into.Rd b/man/recode_into.Rd index b3e164131..d8d0a337d 100644 --- a/man/recode_into.Rd +++ b/man/recode_into.Rd @@ -9,7 +9,7 @@ recode_into( data = NULL, default = NA, overwrite = TRUE, - preserve_na = TRUE, + preserve_na = FALSE, verbose = TRUE ) } @@ -32,10 +32,13 @@ recode patterns. If \code{FALSE}, former recoded cases will not be altered by la recode patterns that would apply to those cases again. A warning message is printed to alert such situations and to avoid unintentional recodings.} -\item{preserve_na}{Logical, if \code{TRUE} (default) and \code{default} is not \code{NA}, -missing values in the original variable will be set back to \code{NA} in the -recoded variable (unless overwritten by other recode patterns). If \code{FALSE}, -missing values in the original variable will be recoded to \code{default}.} +\item{preserve_na}{Logical, if \code{TRUE} and \code{default} is not \code{NA}, missing +values in the original variable will be set back to \code{NA} in the recoded +variable (unless overwritten by other recode patterns). If \code{FALSE}, missing +values in the original variable will be recoded to \code{default}. The former +behaviour prevents unintentional overwriting of missing values with \code{default}, +which means that you won't find valid values where the original data only +had missing values. 
See 'Examples'.} \item{verbose}{Toggle warnings.} } @@ -95,4 +98,29 @@ recode_into( data = d, default = 0 ) + +# handling of missing values +d <- data.frame( + x = c(1, NA, 2, NA, 3, 4), + y = c(1, 11, 3, NA, 5, 6) +) +# first NA in x is overwritten by valid value from y +# we have no known value for second NA in x and y, +# thus we get one NA in the result +recode_into( + x <= 3 ~ 1, + y > 5 ~ 2, + data = d, + default = 0, + preserve_na = TRUE +) +# first NA in x is overwritten by valid value from y +# default value is used for second NA +recode_into( + x <= 3 ~ 1, + y > 5 ~ 2, + data = d, + default = 0, + preserve_na = FALSE +) } diff --git a/tests/testthat/test-recode_into.R b/tests/testthat/test-recode_into.R index df7cf60b2..90fabcd2f 100644 --- a/tests/testthat/test-recode_into.R +++ b/tests/testthat/test-recode_into.R @@ -194,25 +194,37 @@ test_that("recode_into, make sure recode works with missing in original variable d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1, d_recode_na$mpg <= 20 ~ 2, d_recode_na$cyl == 4 ~ 3, - default = 0 + default = 0, + preserve_na = TRUE ) out2_recoded_na <- recode_into( d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1, d_recode_na$mpg <= 20 ~ 2, - default = 0 - ) - out3_recoded_na <- recode_into( - d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1, - d_recode_na$mpg <= 20 ~ 2, - d_recode_na$cyl == 4 ~ 3, default = 0, - preserve_na = FALSE + preserve_na = TRUE ) - out4_recoded_na <- recode_into( - d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1, - d_recode_na$mpg <= 20 ~ 2, - default = 0, - preserve_na = FALSE + expect_message( + { + out3_recoded_na <- recode_into( + d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1, + d_recode_na$mpg <= 20 ~ 2, + d_recode_na$cyl == 4 ~ 3, + default = 0, + preserve_na = FALSE + ) + }, + regex = "Missing values in original variable" + ) + expect_message( + { + out4_recoded_na <- recode_into( + d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1, + d_recode_na$mpg <= 20 ~ 2, + default = 0, + preserve_na = 
FALSE + ) + }, + regex = "Missing values in original variable" ) # one NA in mpg is overwritten by valid value from cyl, total 5 NA expect_identical(