Skip to content

Commit

Permalink
fix issues with NA values in recodes (#455)
Browse files Browse the repository at this point in the history
* fix issues with NA values in recodes

* add reserve_na attr, add tests

* add comments

* version bump

* Update test-recode_into.R

* scoping issue

* rename objects in tests, maybe fixes random test order
  • Loading branch information
strengejacke committed Sep 7, 2023
1 parent 877c587 commit 1b3b825
Show file tree
Hide file tree
Showing 5 changed files with 107 additions and 5 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: datawizard
Title: Easy Data Wrangling and Statistical Transformations
Version: 0.8.0.9
Version: 0.8.0.10
Authors@R: c(
person("Indrajeet", "Patil", , "[email protected]", role = "aut",
comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),
Expand Down
6 changes: 6 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ CHANGES
* `recode_into()` gains an `overwrite` argument to skip overwriting already
recoded cases when multiple recode patterns apply to the same case.

* `recode_into()` gains a `preserve_na` argument to preserve `NA` values
when recoding.

* `data_read()` now passes the `encoding` argument to `data.table::fread()`.
This allows reading files with non-ASCII characters.

Expand All @@ -28,6 +31,9 @@ BUG FIXES
* Fixed issue in `recode_into()` where a possibly wrong case number was printed
in the warning when several recode patterns matched the same case.

* Fixed issue in `recode_into()` when original data contained `NA` values and
`NA` was not included in the recode pattern.

* Fixed issue in `data_filter()` where functions containing a `=` (e.g. when
naming arguments, like `grepl(pattern, x = a)`) were mistakenly seen as
faulty syntax.
Expand Down
26 changes: 24 additions & 2 deletions R/recode_into.r
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@
#' recode patterns. If `FALSE`, former recoded cases will not be altered by later
#' recode patterns that would apply to those cases again. A warning message is
#' printed to alert such situations and to avoid unintentional recodings.
#' @param preserve_na Logical, if `TRUE` (default) and `default` is not `NA`,
#' missing values in the original variable will be set back to `NA` in the
#' recoded variable (unless overwritten by other recode patterns). If `FALSE`,
#' missing values in the original variable will be recoded to `default`.
#' @param verbose Toggle warnings.
#'
#' @return A vector with recoded values.
Expand Down Expand Up @@ -73,7 +77,12 @@
#' default = 0
#' )
#' @export
recode_into <- function(..., data = NULL, default = NA, overwrite = TRUE, verbose = TRUE) {
recode_into <- function(...,
data = NULL,
default = NA,
overwrite = TRUE,
preserve_na = TRUE,
verbose = TRUE) {
dots <- list(...)

# get length of vector, so we know the length of the output vector
Expand Down Expand Up @@ -135,6 +144,12 @@ recode_into <- function(..., data = NULL, default = NA, overwrite = TRUE, verbos
index <- with(data, eval(dots[[i]][[2]]))
value <- with(data, eval(dots[[i]][[3]]))
}
# remember missing values, so we can add back later
missing_index <- is.na(index)
# make sure index has no missing values. when we have missing values in
# original expression, these are considered as "no match" and set to FALSE
# we handle NA value later and thus want to remove them from "index" now
index[is.na(index)] <- FALSE
# overwriting values? do more recode-patterns match the same case?
if (is.na(default)) {
already_exists <- !is.na(out[index])
Expand All @@ -144,7 +159,7 @@ recode_into <- function(..., data = NULL, default = NA, overwrite = TRUE, verbos
# save indices of overwritten cases
overwritten_cases <- which(index)[already_exists]
# tell user...
if (any(already_exists) && verbose) {
if (any(already_exists, na.rm = TRUE) && verbose) {
if (overwrite) {
msg <- paste(
"Several recode patterns apply to the same cases.",
Expand All @@ -164,7 +179,14 @@ recode_into <- function(..., data = NULL, default = NA, overwrite = TRUE, verbos
if (!overwrite) {
index[overwritten_cases] <- FALSE
}
# write new values into output vector
out[index] <- value
# set back missing values
if (any(missing_index) && !is.na(default) && preserve_na) {
# but only where we still have default values
# we don't want to overwrite already recoded values with NA
out[missing_index & out == default] <- NA
}
}

out
Expand Down
14 changes: 13 additions & 1 deletion man/recode_into.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

64 changes: 63 additions & 1 deletion tests/testthat/test-recode_into.R
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ test_that("recode_into, check differen input length", {
)
})

test_that("recode_into, check differen input length", {
test_that("recode_into, check different input length", {
x <- 1:5
y <- c(5, 2, 3, 1, 4)
expect_warning(
Expand All @@ -184,3 +184,65 @@ test_that("recode_into, check differen input length", {
regexp = "Several recode patterns"
)
})

# Checks how `recode_into()` treats missing values in the variables used in
# the recode patterns, for both `preserve_na = TRUE` (the default) and
# `preserve_na = FALSE`, always with a non-NA `default` of 0.
test_that("recode_into, make sure recode works with missing in original variable", {
data(mtcars)
# introduce missing values: rows 3, 10, 12, 15, 16 in `mpg` and
# rows 2, 15, 16 in `cyl` (rows 15 and 16 are missing in both variables)
mtcars$mpg[c(3, 10, 12, 15, 16)] <- NA
mtcars$cyl[c(2, 15, 16)] <- NA
# NOTE(review): `<<-` assigns outside the local scope of this test.
# Presumably required so that `recode_into()` can find `d_recode_na` when it
# evaluates the recode patterns - confirm before changing to `<-`.
d_recode_na <<- as.data.frame(mtcars)
# default `preserve_na = TRUE`, three recode patterns (including `cyl == 4`)
out1_recoded_na <- recode_into(
d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1,
d_recode_na$mpg <= 20 ~ 2,
d_recode_na$cyl == 4 ~ 3,
default = 0
)
# default `preserve_na = TRUE`, without the `cyl == 4` pattern
out2_recoded_na <- recode_into(
d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1,
d_recode_na$mpg <= 20 ~ 2,
default = 0
)
# same patterns as `out1_recoded_na`, but with `preserve_na = FALSE`
out3_recoded_na <- recode_into(
d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1,
d_recode_na$mpg <= 20 ~ 2,
d_recode_na$cyl == 4 ~ 3,
default = 0,
preserve_na = FALSE
)
# same patterns as `out2_recoded_na`, but with `preserve_na = FALSE`
out4_recoded_na <- recode_into(
d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1,
d_recode_na$mpg <= 20 ~ 2,
default = 0,
preserve_na = FALSE
)
# one NA in mpg (row 3) is overwritten by the valid value from the `cyl == 4`
# pattern, leaving a total of 5 NA (rows 2, 10, 12, 15, 16)
expect_identical(
out1_recoded_na,
c(
1, NA, 3, 1, 2, 2, 2, 3, 3, NA, 2, NA, 2, 2, NA, NA, 2, 3,
3, 3, 3, 2, 2, 2, 2, 3, 3, 3, 2, 2, 2, 3
)
)
# without the `cyl == 4` pattern, row 3 stays NA as well: total 6 NA
# (rows 2, 3, 10, 12, 15, 16)
expect_identical(
out2_recoded_na,
c(
1, NA, NA, 1, 2, 2, 2, 0, 0, NA, 2, NA, 2, 2, NA, NA, 2, 0,
0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 0
)
)
# `preserve_na = FALSE`: NA is *not* preserved, cases with missing values are
# set to `default` (0) unless recoded by another pattern (row 3 becomes 3)
expect_identical(
out3_recoded_na,
c(
1, 0, 3, 1, 2, 2, 2, 3, 3, 0, 2, 0, 2, 2, 0, 0, 2, 3, 3, 3,
3, 2, 2, 2, 2, 3, 3, 3, 2, 2, 2, 3
)
)
# `preserve_na = FALSE` without the `cyl == 4` pattern: the output contains
# no NA, all unmatched cases (including row 3) are set to `default` (0)
expect_identical(
out4_recoded_na,
c(
1, 0, 0, 1, 2, 2, 2, 0, 0, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 0,
0, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 0
)
)
})

0 comments on commit 1b3b825

Please sign in to comment.