From 54635c854e5871ee54fd8a14881f1f0e4c4e541d Mon Sep 17 00:00:00 2001 From: Daniel Date: Tue, 7 May 2024 12:19:31 +0200 Subject: [PATCH] `data_to_long()` did not work with labelled data frames (#498) --- DESCRIPTION | 2 +- NEWS.md | 5 +++ R/data_to_long.R | 14 +++++++ R/slide.R | 6 +-- R/standardize.R | 62 +++++++++++++++--------------- R/utils_labels.R | 18 ++++----- tests/testthat/test-data_to_long.R | 19 +++++++-- 7 files changed, 77 insertions(+), 49 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index e5f85a0eb..f5fb6c61c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.10.0.1 +Version: 0.10.0.2 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")), diff --git a/NEWS.md b/NEWS.md index 0689046d2..f5e1a3543 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,11 @@ CHANGES If you recode into a numeric variable, and one of the recode values is `NA`, you no longer need to use `NA_real_` for numeric `NA` values. +BUG FIXES + +* `data_to_long()` did not work for data frame where columns had attributes + (like labelled data). + # datawizard 0.10.0 BREAKING CHANGES diff --git a/R/data_to_long.R b/R/data_to_long.R index 711793d14..06fe91efd 100644 --- a/R/data_to_long.R +++ b/R/data_to_long.R @@ -78,6 +78,7 @@ data_to_long <- function(data, regex = FALSE, ..., cols) { # nolint + original_data <- data # Prefer "cols" over "select" for compat with tidyr::pivot_longer # nolint start @@ -219,6 +220,13 @@ data_to_long <- function(data, stacked_data <- data_relocate(stacked_data, select = values_to, after = -1) + # if columns in data frame have attributes (e.g. labelled data), `cbind()` + # won't work, so we need to remove them. We'll set them back later + not_stacked[] <- lapply(not_stacked, function(i) { + attributes(i) <- NULL + i + }) + # reunite unselected data with stacked data out <- cbind( not_stacked, stats::setNames(stacked_data, c(names_to, values_to)), @@ -264,6 +272,12 @@ data_to_long <- function(data, row.names(out) <- NULL } + # set back labels + shared_columns <- intersect(colnames(out), colnames(original_data)) + for (i in shared_columns) { + out[[i]] <- .set_back_labels(out[[i]], original_data[[i]], include_values = TRUE) + } + out } diff --git a/R/slide.R b/R/slide.R index 5a4dfeff6..eb7b1d6f4 100644 --- a/R/slide.R +++ b/R/slide.R @@ -78,7 +78,7 @@ slide.data.frame <- function(x, # create the new variables and updates "select", so new variables are processed if (!isFALSE(append)) { # process arguments - args <- .process_append( + my_args <- .process_append( x, select, append, @@ -86,8 +86,8 @@ slide.data.frame <- function(x, keep_factors = FALSE ) # update processed arguments - x <- args$x - select <- args$select + x <- my_args$x + select <- my_args$select } x[select] <- lapply( diff --git a/R/standardize.R b/R/standardize.R index 0c88722b6..d5082803d 100644 --- a/R/standardize.R +++ b/R/standardize.R @@ -160,27 +160,25 @@ standardize.numeric <- function(x, center <- TRUE } - args <- .process_std_center(x, weights, robust, verbose, reference, center, scale) + my_args <- .process_std_center(x, weights, robust, verbose, reference, center, scale) dot_args <- list(...) # Perform standardization - if (is.null(args)) { + if (is.null(my_args)) { # all NA? return(x) - } else if (is.null(args$check)) { - vals <- rep(0, length(args$vals)) # If only unique value + } else if (is.null(my_args$check)) { + vals <- rep(0, length(my_args$vals)) # If only unique value + } else if (two_sd) { + vals <- as.vector((my_args$vals - my_args$center) / (2 * my_args$scale)) } else { - if (two_sd) { - vals <- as.vector((args$vals - args$center) / (2 * args$scale)) - } else { - vals <- as.vector((args$vals - args$center) / args$scale) - } + vals <- as.vector((my_args$vals - my_args$center) / my_args$scale) } - scaled_x <- rep(NA, length(args$valid_x)) - scaled_x[args$valid_x] <- vals - attr(scaled_x, "center") <- args$center - attr(scaled_x, "scale") <- args$scale + scaled_x <- rep(NA, length(my_args$valid_x)) + scaled_x[my_args$valid_x] <- vals + attr(scaled_x, "center") <- my_args$center + attr(scaled_x, "scale") <- my_args$scale attr(scaled_x, "robust") <- robust # labels z <- .set_back_labels(scaled_x, x, include_values = FALSE) @@ -277,33 +275,33 @@ standardize.data.frame <- function(x, ) # process arguments - args <- .process_std_args(x, select, exclude, weights, append, + my_args <- .process_std_args(x, select, exclude, weights, append, append_suffix = "_z", keep_factors = force, remove_na, reference, .center = center, .scale = scale ) # set new values - x <- args$x + x <- my_args$x # Loop through variables and standardize it - for (var in args$select) { + for (var in my_args$select) { x[[var]] <- standardize(x[[var]], robust = robust, two_sd = two_sd, - weights = args$weights, + weights = my_args$weights, reference = reference[[var]], - center = args$center[var], - scale = args$scale[var], + center = my_args$center[var], + scale = my_args$scale[var], verbose = FALSE, force = force, add_transform_class = FALSE ) } - attr(x, "center") <- unlist(lapply(x[args$select], function(z) { + attr(x, "center") <- unlist(lapply(x[my_args$select], function(z) { attributes(z)$center })) - attr(x, "scale") <- unlist(lapply(x[args$select], function(z) { + attr(x, "scale") <- unlist(lapply(x[my_args$select], function(z) { attributes(z)$scale })) attr(x, "robust") <- robust @@ -338,7 +336,7 @@ standardize.grouped_df <- function(x, verbose = verbose ) - args <- .process_grouped_df( + my_args <- .process_grouped_df( x, select, exclude, append, append_suffix = "_z", reference, weights, keep_factors = force @@ -346,17 +344,17 @@ standardize.grouped_df <- function(x, # create column(s) to store dw_transformer attributes for (i in select) { - args$info$groups[[paste0("attr_", i)]] <- rep(NA, length(args$grps)) + my_args$info$groups[[paste0("attr_", i)]] <- rep(NA, length(my_args$grps)) } - for (rows in seq_along(args$grps)) { + for (rows in seq_along(my_args$grps)) { tmp <- standardize( - args$x[args$grps[[rows]], , drop = FALSE], - select = args$select, + my_args$x[my_args$grps[[rows]], , drop = FALSE], + select = my_args$select, exclude = NULL, robust = robust, two_sd = two_sd, - weights = args$weights, + weights = my_args$weights, remove_na = remove_na, verbose = verbose, force = force, @@ -369,18 +367,18 @@ standardize.grouped_df <- function(x, # store dw_transformer_attributes for (i in select) { - args$info$groups[rows, paste0("attr_", i)][[1]] <- list(unlist(attributes(tmp[[i]]))) + my_args$info$groups[rows, paste0("attr_", i)][[1]] <- list(unlist(attributes(tmp[[i]]))) } - args$x[args$grps[[rows]], ] <- tmp + my_args$x[my_args$grps[[rows]], ] <- tmp } # last column of "groups" attributes must be called ".rows" - args$info$groups <- data_relocate(args$info$groups, ".rows", after = -1) + my_args$info$groups <- data_relocate(my_args$info$groups, ".rows", after = -1) # set back class, so data frame still works with dplyr - attributes(args$x) <- args$info - args$x + attributes(my_args$x) <- my_args$info + my_args$x } diff --git a/R/utils_labels.R b/R/utils_labels.R index 64b517086..c8ece5fe7 100644 --- a/R/utils_labels.R +++ b/R/utils_labels.R @@ -12,19 +12,17 @@ if (reverse_values) { # reverse values? Used for "reverse_scale()" attr(new, "labels") <- stats::setNames(rev(value_labels), names(value_labels)) - } else { + } else if (is.numeric(new)) { # keep value oder? Used for "to_numeric()" - if (is.numeric(new)) { - if (any(grepl("[^0-9]", value_labels))) { - # if we have any non-numeric characters, convert to numeric - attr(new, "labels") <- stats::setNames(as.numeric(as.factor(value_labels)), names(value_labels)) - } else { - # if we have numeric, or "numeric character" (like "1", "2", "3" etc.) - attr(new, "labels") <- stats::setNames(as.numeric(value_labels), names(value_labels)) - } + if (any(grepl("[^0-9]", value_labels))) { + # if we have any non-numeric characters, convert to numeric + attr(new, "labels") <- stats::setNames(as.numeric(as.factor(value_labels)), names(value_labels)) } else { - attr(new, "labels") <- stats::setNames(value_labels, names(value_labels)) + # if we have numeric, or "numeric character" (like "1", "2", "3" etc.) + attr(new, "labels") <- stats::setNames(as.numeric(value_labels), names(value_labels)) } + } else { + attr(new, "labels") <- stats::setNames(value_labels, names(value_labels)) } } else if (isFALSE(include_values)) { attr(new, "labels") <- NULL diff --git a/tests/testthat/test-data_to_long.R b/tests/testthat/test-data_to_long.R index 397dbcb2e..37d926b11 100644 --- a/tests/testthat/test-data_to_long.R +++ b/tests/testthat/test-data_to_long.R @@ -1,5 +1,5 @@ set.seed(123) -wide_data <- data.frame(replicate(3, sample(1:5))) +wide_data <- data.frame(replicate(3, sample.int(5))) test_that("data_to_long works", { expect_equal( @@ -267,10 +267,10 @@ test_that("data_to_long: error if no columns to reshape", { test_that("data_to_long equivalent to pivot_longer: ex 1", { skip_if_not_installed("tidyr") - x <- tidyr::relig_income %>% + x <- tidyr::relig_income %>% # nolint tidyr::pivot_longer(!religion, names_to = "income", values_to = "count") - y <- tidyr::relig_income %>% + y <- tidyr::relig_income %>% # nolint data_to_long(cols = -religion, names_to = "income", values_to = "count") expect_equal(x, y, ignore_attr = TRUE) @@ -475,3 +475,16 @@ test_that("preserve date format", { expect_identical(tidyr, datawiz) }) + + +test_that("works with labelled data", { + data(efc, package = "datawizard") + out <- data_to_long( + efc, + select = c("e16sex", "c172code"), + names_to = "Dummy", + values_to = "Score" + ) + expect_identical(nrow(out), 200L) + expect_identical(attributes(out$e42dep)$label, "elder's dependency") +})