Skip to content

Commit

Permalink
data_to_long() did not work with labelled data frames (#498)
Browse files Browse the repository at this point in the history
  • Loading branch information
strengejacke authored May 7, 2024
1 parent e5b0ec4 commit 54635c8
Show file tree
Hide file tree
Showing 7 changed files with 77 additions and 49 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: datawizard
Title: Easy Data Wrangling and Statistical Transformations
Version: 0.10.0.1
Version: 0.10.0.2
Authors@R: c(
person("Indrajeet", "Patil", , "[email protected]", role = "aut",
comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),
Expand Down
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@ CHANGES
If you recode into a numeric variable, and one of the recode values is `NA`,
you no longer need to use `NA_real_` for numeric `NA` values.

BUG FIXES

* `data_to_long()` did not work for data frame where columns had attributes
(like labelled data).

# datawizard 0.10.0

BREAKING CHANGES
Expand Down
14 changes: 14 additions & 0 deletions R/data_to_long.R
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ data_to_long <- function(data,
regex = FALSE,
...,
cols) { # nolint
original_data <- data

# Prefer "cols" over "select" for compat with tidyr::pivot_longer
# nolint start
Expand Down Expand Up @@ -219,6 +220,13 @@ data_to_long <- function(data,

stacked_data <- data_relocate(stacked_data, select = values_to, after = -1)

# if columns in data frame have attributes (e.g. labelled data), `cbind()`
# won't work, so we need to remove them. We'll set them back later
not_stacked[] <- lapply(not_stacked, function(i) {
attributes(i) <- NULL
i
})

# reunite unselected data with stacked data
out <- cbind(
not_stacked, stats::setNames(stacked_data, c(names_to, values_to)),
Expand Down Expand Up @@ -264,6 +272,12 @@ data_to_long <- function(data,
row.names(out) <- NULL
}

# set back labels
shared_columns <- intersect(colnames(out), colnames(original_data))
for (i in shared_columns) {
out[[i]] <- .set_back_labels(out[[i]], original_data[[i]], include_values = TRUE)
}

out
}

Expand Down
6 changes: 3 additions & 3 deletions R/slide.R
Original file line number Diff line number Diff line change
Expand Up @@ -78,16 +78,16 @@ slide.data.frame <- function(x,
# create the new variables and updates "select", so new variables are processed
if (!isFALSE(append)) {
# process arguments
args <- .process_append(
my_args <- .process_append(
x,
select,
append,
append_suffix = "_s",
keep_factors = FALSE
)
# update processed arguments
x <- args$x
select <- args$select
x <- my_args$x
select <- my_args$select
}

x[select] <- lapply(
Expand Down
62 changes: 30 additions & 32 deletions R/standardize.R
Original file line number Diff line number Diff line change
Expand Up @@ -160,27 +160,25 @@ standardize.numeric <- function(x,
center <- TRUE
}

args <- .process_std_center(x, weights, robust, verbose, reference, center, scale)
my_args <- .process_std_center(x, weights, robust, verbose, reference, center, scale)
dot_args <- list(...)

# Perform standardization
if (is.null(args)) {
if (is.null(my_args)) {
# all NA?
return(x)
} else if (is.null(args$check)) {
vals <- rep(0, length(args$vals)) # If only unique value
} else if (is.null(my_args$check)) {
vals <- rep(0, length(my_args$vals)) # If only unique value
} else if (two_sd) {
vals <- as.vector((my_args$vals - my_args$center) / (2 * my_args$scale))
} else {
if (two_sd) {
vals <- as.vector((args$vals - args$center) / (2 * args$scale))
} else {
vals <- as.vector((args$vals - args$center) / args$scale)
}
vals <- as.vector((my_args$vals - my_args$center) / my_args$scale)
}

scaled_x <- rep(NA, length(args$valid_x))
scaled_x[args$valid_x] <- vals
attr(scaled_x, "center") <- args$center
attr(scaled_x, "scale") <- args$scale
scaled_x <- rep(NA, length(my_args$valid_x))
scaled_x[my_args$valid_x] <- vals
attr(scaled_x, "center") <- my_args$center
attr(scaled_x, "scale") <- my_args$scale
attr(scaled_x, "robust") <- robust
# labels
z <- .set_back_labels(scaled_x, x, include_values = FALSE)
Expand Down Expand Up @@ -277,33 +275,33 @@ standardize.data.frame <- function(x,
)

# process arguments
args <- .process_std_args(x, select, exclude, weights, append,
my_args <- .process_std_args(x, select, exclude, weights, append,
append_suffix = "_z", keep_factors = force, remove_na, reference,
.center = center, .scale = scale
)

# set new values
x <- args$x
x <- my_args$x

# Loop through variables and standardize it
for (var in args$select) {
for (var in my_args$select) {
x[[var]] <- standardize(x[[var]],
robust = robust,
two_sd = two_sd,
weights = args$weights,
weights = my_args$weights,
reference = reference[[var]],
center = args$center[var],
scale = args$scale[var],
center = my_args$center[var],
scale = my_args$scale[var],
verbose = FALSE,
force = force,
add_transform_class = FALSE
)
}

attr(x, "center") <- unlist(lapply(x[args$select], function(z) {
attr(x, "center") <- unlist(lapply(x[my_args$select], function(z) {
attributes(z)$center
}))
attr(x, "scale") <- unlist(lapply(x[args$select], function(z) {
attr(x, "scale") <- unlist(lapply(x[my_args$select], function(z) {
attributes(z)$scale
}))
attr(x, "robust") <- robust
Expand Down Expand Up @@ -338,25 +336,25 @@ standardize.grouped_df <- function(x,
verbose = verbose
)

args <- .process_grouped_df(
my_args <- .process_grouped_df(
x, select, exclude, append,
append_suffix = "_z",
reference, weights, keep_factors = force
)

# create column(s) to store dw_transformer attributes
for (i in select) {
args$info$groups[[paste0("attr_", i)]] <- rep(NA, length(args$grps))
my_args$info$groups[[paste0("attr_", i)]] <- rep(NA, length(my_args$grps))
}

for (rows in seq_along(args$grps)) {
for (rows in seq_along(my_args$grps)) {
tmp <- standardize(
args$x[args$grps[[rows]], , drop = FALSE],
select = args$select,
my_args$x[my_args$grps[[rows]], , drop = FALSE],
select = my_args$select,
exclude = NULL,
robust = robust,
two_sd = two_sd,
weights = args$weights,
weights = my_args$weights,
remove_na = remove_na,
verbose = verbose,
force = force,
Expand All @@ -369,18 +367,18 @@ standardize.grouped_df <- function(x,

# store dw_transformer_attributes
for (i in select) {
args$info$groups[rows, paste0("attr_", i)][[1]] <- list(unlist(attributes(tmp[[i]])))
my_args$info$groups[rows, paste0("attr_", i)][[1]] <- list(unlist(attributes(tmp[[i]])))
}

args$x[args$grps[[rows]], ] <- tmp
my_args$x[my_args$grps[[rows]], ] <- tmp
}

# last column of "groups" attributes must be called ".rows"
args$info$groups <- data_relocate(args$info$groups, ".rows", after = -1)
my_args$info$groups <- data_relocate(my_args$info$groups, ".rows", after = -1)

# set back class, so data frame still works with dplyr
attributes(args$x) <- args$info
args$x
attributes(my_args$x) <- my_args$info
my_args$x
}


Expand Down
18 changes: 8 additions & 10 deletions R/utils_labels.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,17 @@
if (reverse_values) {
# reverse values? Used for "reverse_scale()"
attr(new, "labels") <- stats::setNames(rev(value_labels), names(value_labels))
} else {
} else if (is.numeric(new)) {
# keep value oder? Used for "to_numeric()"
if (is.numeric(new)) {
if (any(grepl("[^0-9]", value_labels))) {
# if we have any non-numeric characters, convert to numeric
attr(new, "labels") <- stats::setNames(as.numeric(as.factor(value_labels)), names(value_labels))
} else {
# if we have numeric, or "numeric character" (like "1", "2", "3" etc.)
attr(new, "labels") <- stats::setNames(as.numeric(value_labels), names(value_labels))
}
if (any(grepl("[^0-9]", value_labels))) {
# if we have any non-numeric characters, convert to numeric
attr(new, "labels") <- stats::setNames(as.numeric(as.factor(value_labels)), names(value_labels))
} else {
attr(new, "labels") <- stats::setNames(value_labels, names(value_labels))
# if we have numeric, or "numeric character" (like "1", "2", "3" etc.)
attr(new, "labels") <- stats::setNames(as.numeric(value_labels), names(value_labels))
}
} else {
attr(new, "labels") <- stats::setNames(value_labels, names(value_labels))
}
} else if (isFALSE(include_values)) {
attr(new, "labels") <- NULL
Expand Down
19 changes: 16 additions & 3 deletions tests/testthat/test-data_to_long.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
set.seed(123)
wide_data <- data.frame(replicate(3, sample(1:5)))
wide_data <- data.frame(replicate(3, sample.int(5)))

test_that("data_to_long works", {
expect_equal(
Expand Down Expand Up @@ -267,10 +267,10 @@ test_that("data_to_long: error if no columns to reshape", {
test_that("data_to_long equivalent to pivot_longer: ex 1", {
skip_if_not_installed("tidyr")

x <- tidyr::relig_income %>%
x <- tidyr::relig_income %>% # nolint
tidyr::pivot_longer(!religion, names_to = "income", values_to = "count")

y <- tidyr::relig_income %>%
y <- tidyr::relig_income %>% # nolint
data_to_long(cols = -religion, names_to = "income", values_to = "count")

expect_equal(x, y, ignore_attr = TRUE)
Expand Down Expand Up @@ -475,3 +475,16 @@ test_that("preserve date format", {

expect_identical(tidyr, datawiz)
})


test_that("works with labelled data", {
data(efc, package = "datawizard")
out <- data_to_long(
efc,
select = c("e16sex", "c172code"),
names_to = "Dummy",
values_to = "Score"
)
expect_identical(nrow(out), 200L)
expect_identical(attributes(out$e42dep)$label, "elder's dependency")
})

0 comments on commit 54635c8

Please sign in to comment.