data_to_long() did not work with labelled data frames (#498)

easystats · May 7, 2024 · 54635c8 · 54635c8
1 parent e5b0ec4
commit 54635c8
Show file tree

Hide file tree

Showing 7 changed files with 77 additions and 49 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.10.0.1
+Version: 0.10.0.2
 Authors@R: c(
     person("Indrajeet", "Patil", , "[email protected]", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),

diff --git a/NEWS.md b/NEWS.md
@@ -6,6 +6,11 @@ CHANGES
   If you recode into a numeric variable, and one of the recode values is `NA`,
   you no longer need to use `NA_real_` for numeric `NA` values.
 
+BUG FIXES
+
+* `data_to_long()` did not work for data frame where columns had attributes
+  (like labelled data).
+
 # datawizard 0.10.0
 
 BREAKING CHANGES

diff --git a/R/data_to_long.R b/R/data_to_long.R
@@ -78,6 +78,7 @@ data_to_long <- function(data,
                          regex = FALSE,
                          ...,
                          cols) { # nolint
+  original_data <- data
 
   # Prefer "cols" over "select" for compat with tidyr::pivot_longer
   # nolint start
@@ -219,6 +220,13 @@ data_to_long <- function(data,
 
   stacked_data <- data_relocate(stacked_data, select = values_to, after = -1)
 
+  # if columns in data frame have attributes (e.g. labelled data), `cbind()`
+  # won't work, so we need to remove them. We'll set them back later
+  not_stacked[] <- lapply(not_stacked, function(i) {
+    attributes(i) <- NULL
+    i
+  })
+
   # reunite unselected data with stacked data
   out <- cbind(
     not_stacked, stats::setNames(stacked_data, c(names_to, values_to)),
@@ -264,6 +272,12 @@ data_to_long <- function(data,
     row.names(out) <- NULL
   }
 
+  # set back labels
+  shared_columns <- intersect(colnames(out), colnames(original_data))
+  for (i in shared_columns) {
+    out[[i]] <- .set_back_labels(out[[i]], original_data[[i]], include_values = TRUE)
+  }
+
   out
 }
 

diff --git a/R/slide.R b/R/slide.R
@@ -78,16 +78,16 @@ slide.data.frame <- function(x,
   # create the new variables and updates "select", so new variables are processed
   if (!isFALSE(append)) {
     # process arguments
-    args <- .process_append(
+    my_args <- .process_append(
       x,
       select,
       append,
       append_suffix = "_s",
       keep_factors = FALSE
     )
     # update processed arguments
-    x <- args$x
-    select <- args$select
+    x <- my_args$x
+    select <- my_args$select
   }
 
   x[select] <- lapply(

diff --git a/R/standardize.R b/R/standardize.R
@@ -160,27 +160,25 @@ standardize.numeric <- function(x,
     center <- TRUE
   }
 
-  args <- .process_std_center(x, weights, robust, verbose, reference, center, scale)
+  my_args <- .process_std_center(x, weights, robust, verbose, reference, center, scale)
   dot_args <- list(...)
 
   # Perform standardization
-  if (is.null(args)) {
+  if (is.null(my_args)) {
     # all NA?
     return(x)
-  } else if (is.null(args$check)) {
-    vals <- rep(0, length(args$vals)) # If only unique value
+  } else if (is.null(my_args$check)) {
+    vals <- rep(0, length(my_args$vals)) # If only unique value
+  } else if (two_sd) {
+    vals <- as.vector((my_args$vals - my_args$center) / (2 * my_args$scale))
   } else {
-    if (two_sd) {
-      vals <- as.vector((args$vals - args$center) / (2 * args$scale))
-    } else {
-      vals <- as.vector((args$vals - args$center) / args$scale)
-    }
+    vals <- as.vector((my_args$vals - my_args$center) / my_args$scale)
   }
 
-  scaled_x <- rep(NA, length(args$valid_x))
-  scaled_x[args$valid_x] <- vals
-  attr(scaled_x, "center") <- args$center
-  attr(scaled_x, "scale") <- args$scale
+  scaled_x <- rep(NA, length(my_args$valid_x))
+  scaled_x[my_args$valid_x] <- vals
+  attr(scaled_x, "center") <- my_args$center
+  attr(scaled_x, "scale") <- my_args$scale
   attr(scaled_x, "robust") <- robust
   # labels
   z <- .set_back_labels(scaled_x, x, include_values = FALSE)
@@ -277,33 +275,33 @@ standardize.data.frame <- function(x,
   )
 
   # process arguments
-  args <- .process_std_args(x, select, exclude, weights, append,
+  my_args <- .process_std_args(x, select, exclude, weights, append,
     append_suffix = "_z", keep_factors = force, remove_na, reference,
     .center = center, .scale = scale
   )
 
   # set new values
-  x <- args$x
+  x <- my_args$x
 
   # Loop through variables and standardize it
-  for (var in args$select) {
+  for (var in my_args$select) {
     x[[var]] <- standardize(x[[var]],
       robust = robust,
       two_sd = two_sd,
-      weights = args$weights,
+      weights = my_args$weights,
       reference = reference[[var]],
-      center = args$center[var],
-      scale = args$scale[var],
+      center = my_args$center[var],
+      scale = my_args$scale[var],
       verbose = FALSE,
       force = force,
       add_transform_class = FALSE
     )
   }
 
-  attr(x, "center") <- unlist(lapply(x[args$select], function(z) {
+  attr(x, "center") <- unlist(lapply(x[my_args$select], function(z) {
     attributes(z)$center
   }))
-  attr(x, "scale") <- unlist(lapply(x[args$select], function(z) {
+  attr(x, "scale") <- unlist(lapply(x[my_args$select], function(z) {
     attributes(z)$scale
   }))
   attr(x, "robust") <- robust
@@ -338,25 +336,25 @@ standardize.grouped_df <- function(x,
     verbose = verbose
   )
 
-  args <- .process_grouped_df(
+  my_args <- .process_grouped_df(
     x, select, exclude, append,
     append_suffix = "_z",
     reference, weights, keep_factors = force
   )
 
   # create column(s) to store dw_transformer attributes
   for (i in select) {
-    args$info$groups[[paste0("attr_", i)]] <- rep(NA, length(args$grps))
+    my_args$info$groups[[paste0("attr_", i)]] <- rep(NA, length(my_args$grps))
   }
 
-  for (rows in seq_along(args$grps)) {
+  for (rows in seq_along(my_args$grps)) {
     tmp <- standardize(
-      args$x[args$grps[[rows]], , drop = FALSE],
-      select = args$select,
+      my_args$x[my_args$grps[[rows]], , drop = FALSE],
+      select = my_args$select,
       exclude = NULL,
       robust = robust,
       two_sd = two_sd,
-      weights = args$weights,
+      weights = my_args$weights,
       remove_na = remove_na,
       verbose = verbose,
       force = force,
@@ -369,18 +367,18 @@ standardize.grouped_df <- function(x,
 
     # store dw_transformer_attributes
     for (i in select) {
-      args$info$groups[rows, paste0("attr_", i)][[1]] <- list(unlist(attributes(tmp[[i]])))
+      my_args$info$groups[rows, paste0("attr_", i)][[1]] <- list(unlist(attributes(tmp[[i]])))
     }
 
-    args$x[args$grps[[rows]], ] <- tmp
+    my_args$x[my_args$grps[[rows]], ] <- tmp
   }
 
   # last column of "groups" attributes must be called ".rows"
-  args$info$groups <- data_relocate(args$info$groups, ".rows", after = -1)
+  my_args$info$groups <- data_relocate(my_args$info$groups, ".rows", after = -1)
 
   # set back class, so data frame still works with dplyr
-  attributes(args$x) <- args$info
-  args$x
+  attributes(my_args$x) <- my_args$info
+  my_args$x
 }
 
 

diff --git a/R/utils_labels.R b/R/utils_labels.R
@@ -12,19 +12,17 @@
     if (reverse_values) {
       # reverse values? Used for "reverse_scale()"
       attr(new, "labels") <- stats::setNames(rev(value_labels), names(value_labels))
-    } else {
+    } else if (is.numeric(new)) {
       # keep value oder? Used for "to_numeric()"
-      if (is.numeric(new)) {
-        if (any(grepl("[^0-9]", value_labels))) {
-          # if we have any non-numeric characters, convert to numeric
-          attr(new, "labels") <- stats::setNames(as.numeric(as.factor(value_labels)), names(value_labels))
-        } else {
-          # if we have numeric, or "numeric character" (like "1", "2", "3" etc.)
-          attr(new, "labels") <- stats::setNames(as.numeric(value_labels), names(value_labels))
-        }
+      if (any(grepl("[^0-9]", value_labels))) {
+        # if we have any non-numeric characters, convert to numeric
+        attr(new, "labels") <- stats::setNames(as.numeric(as.factor(value_labels)), names(value_labels))
       } else {
-        attr(new, "labels") <- stats::setNames(value_labels, names(value_labels))
+        # if we have numeric, or "numeric character" (like "1", "2", "3" etc.)
+        attr(new, "labels") <- stats::setNames(as.numeric(value_labels), names(value_labels))
       }
+    } else {
+      attr(new, "labels") <- stats::setNames(value_labels, names(value_labels))
     }
   } else if (isFALSE(include_values)) {
     attr(new, "labels") <- NULL

diff --git a/tests/testthat/test-data_to_long.R b/tests/testthat/test-data_to_long.R
@@ -1,5 +1,5 @@
 set.seed(123)
-wide_data <- data.frame(replicate(3, sample(1:5)))
+wide_data <- data.frame(replicate(3, sample.int(5)))
 
 test_that("data_to_long works", {
   expect_equal(
@@ -267,10 +267,10 @@ test_that("data_to_long: error if no columns to reshape", {
 test_that("data_to_long equivalent to pivot_longer: ex 1", {
   skip_if_not_installed("tidyr")
 
-  x <- tidyr::relig_income %>%
+  x <- tidyr::relig_income %>% # nolint
     tidyr::pivot_longer(!religion, names_to = "income", values_to = "count")
 
-  y <- tidyr::relig_income %>%
+  y <- tidyr::relig_income %>% # nolint
     data_to_long(cols = -religion, names_to = "income", values_to = "count")
 
   expect_equal(x, y, ignore_attr = TRUE)
@@ -475,3 +475,16 @@ test_that("preserve date format", {
 
   expect_identical(tidyr, datawiz)
 })
+
+
+test_that("works with labelled data", {
+  data(efc, package = "datawizard")
+  out <- data_to_long(
+    efc,
+    select = c("e16sex", "c172code"),
+    names_to = "Dummy",
+    values_to = "Score"
+  )
+  expect_identical(nrow(out), 200L)
+  expect_identical(attributes(out$e42dep)$label, "elder's dependency")
+})