Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

data_to_long() did not work with labelled data frames #498

Merged
merged 1 commit into from
May 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: datawizard
Title: Easy Data Wrangling and Statistical Transformations
Version: 0.10.0.1
Version: 0.10.0.2
Authors@R: c(
person("Indrajeet", "Patil", , "[email protected]", role = "aut",
comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),
Expand Down
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@ CHANGES
If you recode into a numeric variable, and one of the recode values is `NA`,
you no longer need to use `NA_real_` for numeric `NA` values.

BUG FIXES

* `data_to_long()` did not work for data frame where columns had attributes
(like labelled data).

# datawizard 0.10.0

BREAKING CHANGES
Expand Down
14 changes: 14 additions & 0 deletions R/data_to_long.R
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ data_to_long <- function(data,
regex = FALSE,
...,
cols) { # nolint
original_data <- data

# Prefer "cols" over "select" for compat with tidyr::pivot_longer
# nolint start
Expand Down Expand Up @@ -219,6 +220,13 @@ data_to_long <- function(data,

stacked_data <- data_relocate(stacked_data, select = values_to, after = -1)

# if columns in data frame have attributes (e.g. labelled data), `cbind()`
# won't work, so we need to remove them. We'll set them back later
not_stacked[] <- lapply(not_stacked, function(i) {
attributes(i) <- NULL
i
})

# reunite unselected data with stacked data
out <- cbind(
not_stacked, stats::setNames(stacked_data, c(names_to, values_to)),
Expand Down Expand Up @@ -264,6 +272,12 @@ data_to_long <- function(data,
row.names(out) <- NULL
}

# set back labels
shared_columns <- intersect(colnames(out), colnames(original_data))
for (i in shared_columns) {
out[[i]] <- .set_back_labels(out[[i]], original_data[[i]], include_values = TRUE)
}

out
}

Expand Down
6 changes: 3 additions & 3 deletions R/slide.R
Original file line number Diff line number Diff line change
Expand Up @@ -78,16 +78,16 @@ slide.data.frame <- function(x,
# create the new variables and updates "select", so new variables are processed
if (!isFALSE(append)) {
# process arguments
args <- .process_append(
my_args <- .process_append(
x,
select,
append,
append_suffix = "_s",
keep_factors = FALSE
)
# update processed arguments
x <- args$x
select <- args$select
x <- my_args$x
select <- my_args$select
}

x[select] <- lapply(
Expand Down
62 changes: 30 additions & 32 deletions R/standardize.R
Original file line number Diff line number Diff line change
Expand Up @@ -160,27 +160,25 @@ standardize.numeric <- function(x,
center <- TRUE
}

args <- .process_std_center(x, weights, robust, verbose, reference, center, scale)
my_args <- .process_std_center(x, weights, robust, verbose, reference, center, scale)
dot_args <- list(...)

# Perform standardization
if (is.null(args)) {
if (is.null(my_args)) {
# all NA?
return(x)
} else if (is.null(args$check)) {
vals <- rep(0, length(args$vals)) # If only unique value
} else if (is.null(my_args$check)) {
vals <- rep(0, length(my_args$vals)) # If only unique value
} else if (two_sd) {
vals <- as.vector((my_args$vals - my_args$center) / (2 * my_args$scale))
} else {
if (two_sd) {
vals <- as.vector((args$vals - args$center) / (2 * args$scale))
} else {
vals <- as.vector((args$vals - args$center) / args$scale)
}
vals <- as.vector((my_args$vals - my_args$center) / my_args$scale)
}

scaled_x <- rep(NA, length(args$valid_x))
scaled_x[args$valid_x] <- vals
attr(scaled_x, "center") <- args$center
attr(scaled_x, "scale") <- args$scale
scaled_x <- rep(NA, length(my_args$valid_x))
scaled_x[my_args$valid_x] <- vals
attr(scaled_x, "center") <- my_args$center
attr(scaled_x, "scale") <- my_args$scale
attr(scaled_x, "robust") <- robust
# labels
z <- .set_back_labels(scaled_x, x, include_values = FALSE)
Expand Down Expand Up @@ -277,33 +275,33 @@ standardize.data.frame <- function(x,
)

# process arguments
args <- .process_std_args(x, select, exclude, weights, append,
my_args <- .process_std_args(x, select, exclude, weights, append,
append_suffix = "_z", keep_factors = force, remove_na, reference,
.center = center, .scale = scale
)

# set new values
x <- args$x
x <- my_args$x

# Loop through variables and standardize it
for (var in args$select) {
for (var in my_args$select) {
x[[var]] <- standardize(x[[var]],
robust = robust,
two_sd = two_sd,
weights = args$weights,
weights = my_args$weights,
reference = reference[[var]],
center = args$center[var],
scale = args$scale[var],
center = my_args$center[var],
scale = my_args$scale[var],
verbose = FALSE,
force = force,
add_transform_class = FALSE
)
}

attr(x, "center") <- unlist(lapply(x[args$select], function(z) {
attr(x, "center") <- unlist(lapply(x[my_args$select], function(z) {
attributes(z)$center
}))
attr(x, "scale") <- unlist(lapply(x[args$select], function(z) {
attr(x, "scale") <- unlist(lapply(x[my_args$select], function(z) {
attributes(z)$scale
}))
attr(x, "robust") <- robust
Expand Down Expand Up @@ -338,25 +336,25 @@ standardize.grouped_df <- function(x,
verbose = verbose
)

args <- .process_grouped_df(
my_args <- .process_grouped_df(
x, select, exclude, append,
append_suffix = "_z",
reference, weights, keep_factors = force
)

# create column(s) to store dw_transformer attributes
for (i in select) {
args$info$groups[[paste0("attr_", i)]] <- rep(NA, length(args$grps))
my_args$info$groups[[paste0("attr_", i)]] <- rep(NA, length(my_args$grps))
}

for (rows in seq_along(args$grps)) {
for (rows in seq_along(my_args$grps)) {
tmp <- standardize(
args$x[args$grps[[rows]], , drop = FALSE],
select = args$select,
my_args$x[my_args$grps[[rows]], , drop = FALSE],
select = my_args$select,
exclude = NULL,
robust = robust,
two_sd = two_sd,
weights = args$weights,
weights = my_args$weights,
remove_na = remove_na,
verbose = verbose,
force = force,
Expand All @@ -369,18 +367,18 @@ standardize.grouped_df <- function(x,

# store dw_transformer_attributes
for (i in select) {
args$info$groups[rows, paste0("attr_", i)][[1]] <- list(unlist(attributes(tmp[[i]])))
my_args$info$groups[rows, paste0("attr_", i)][[1]] <- list(unlist(attributes(tmp[[i]])))
}

args$x[args$grps[[rows]], ] <- tmp
my_args$x[my_args$grps[[rows]], ] <- tmp
}

# last column of "groups" attributes must be called ".rows"
args$info$groups <- data_relocate(args$info$groups, ".rows", after = -1)
my_args$info$groups <- data_relocate(my_args$info$groups, ".rows", after = -1)

# set back class, so data frame still works with dplyr
attributes(args$x) <- args$info
args$x
attributes(my_args$x) <- my_args$info
my_args$x
}


Expand Down
18 changes: 8 additions & 10 deletions R/utils_labels.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,17 @@
if (reverse_values) {
# reverse values? Used for "reverse_scale()"
attr(new, "labels") <- stats::setNames(rev(value_labels), names(value_labels))
} else {
} else if (is.numeric(new)) {
# keep value oder? Used for "to_numeric()"
if (is.numeric(new)) {
if (any(grepl("[^0-9]", value_labels))) {
# if we have any non-numeric characters, convert to numeric
attr(new, "labels") <- stats::setNames(as.numeric(as.factor(value_labels)), names(value_labels))
} else {
# if we have numeric, or "numeric character" (like "1", "2", "3" etc.)
attr(new, "labels") <- stats::setNames(as.numeric(value_labels), names(value_labels))
}
if (any(grepl("[^0-9]", value_labels))) {
# if we have any non-numeric characters, convert to numeric
attr(new, "labels") <- stats::setNames(as.numeric(as.factor(value_labels)), names(value_labels))
} else {
attr(new, "labels") <- stats::setNames(value_labels, names(value_labels))
# if we have numeric, or "numeric character" (like "1", "2", "3" etc.)
attr(new, "labels") <- stats::setNames(as.numeric(value_labels), names(value_labels))
}
} else {
attr(new, "labels") <- stats::setNames(value_labels, names(value_labels))
}
} else if (isFALSE(include_values)) {
attr(new, "labels") <- NULL
Expand Down
19 changes: 16 additions & 3 deletions tests/testthat/test-data_to_long.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
set.seed(123)
wide_data <- data.frame(replicate(3, sample(1:5)))
wide_data <- data.frame(replicate(3, sample.int(5)))

test_that("data_to_long works", {
expect_equal(
Expand Down Expand Up @@ -267,10 +267,10 @@ test_that("data_to_long: error if no columns to reshape", {
test_that("data_to_long equivalent to pivot_longer: ex 1", {
skip_if_not_installed("tidyr")

x <- tidyr::relig_income %>%
x <- tidyr::relig_income %>% # nolint
tidyr::pivot_longer(!religion, names_to = "income", values_to = "count")

y <- tidyr::relig_income %>%
y <- tidyr::relig_income %>% # nolint
data_to_long(cols = -religion, names_to = "income", values_to = "count")

expect_equal(x, y, ignore_attr = TRUE)
Expand Down Expand Up @@ -475,3 +475,16 @@ test_that("preserve date format", {

expect_identical(tidyr, datawiz)
})


test_that("works with labelled data", {
data(efc, package = "datawizard")
out <- data_to_long(
efc,
select = c("e16sex", "c172code"),
names_to = "Dummy",
values_to = "Score"
)
expect_identical(nrow(out), 200L)
expect_identical(attributes(out$e42dep)$label, "elder's dependency")
})
Loading