Skip to content

Commit

Permalink
Merge branch 'main' into unnormalize-grouped-data
Browse files Browse the repository at this point in the history
  • Loading branch information
etiennebacher authored Sep 10, 2023
2 parents 957dd35 + ae7df24 commit 04d08b3
Show file tree
Hide file tree
Showing 17 changed files with 608 additions and 212 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: datawizard
Title: Easy Data Wrangling and Statistical Transformations
Version: 0.8.0.8
Version: 0.8.0.12
Authors@R: c(
person("Indrajeet", "Patil", , "[email protected]", role = "aut",
comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),
Expand Down
2 changes: 1 addition & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -279,9 +279,9 @@ export(reshape_longer)
export(reshape_wider)
export(reverse)
export(reverse_scale)
export(row_means)
export(row_to_colnames)
export(rowid_as_column)
export(rowmean_n)
export(rownames_as_column)
export(skewness)
export(slide)
Expand Down
15 changes: 12 additions & 3 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@

NEW FUNCTIONS

* `contr.deviation()` for sum-deviation contrast coding of factors.
* `row_means()`, to compute row means, optionally only for the rows with at
least `min_valid` non-missing values.

* `rowmean_n()`, to compute row means if row contains at least `n` non-missing
values.
* `contr.deviation()` for sum-deviation contrast coding of factors.

* `means_by_group()`, to compute mean values of variables, grouped by levels
of specified factors.
Expand All @@ -15,6 +15,9 @@ CHANGES
* `recode_into()` gains an `overwrite` argument to skip overwriting already
recoded cases when multiple recode patterns apply to the same case.

* `recode_into()` gains an `preserve_na` argument to preserve `NA` values
when recoding.

* `data_read()` now passes the `encoding` argument to `data.table::fread()`.
This allows to read files with non-ASCII characters.

Expand All @@ -24,12 +27,18 @@ CHANGES

BUG FIXES

* Fixed issue in `labels_to_levels()` when values of labels were not in sorted
order and values were not sequentially numbered.

* Fixed issues in `data_write()` when writing labelled data into SPSS format
and vectors were of different type as value labels.

* Fixed issue in `recode_into()` with probably wrong case number printed in the
warning when several recode patterns match to one case.

* Fixed issue in `recode_into()` when original data contained `NA` values and
`NA` was not included in the recode pattern.

* Fixed issue in `data_filter()` where functions containing a `=` (e.g. when
naming arguments, like `grepl(pattern, x = a)`) were mistakenly seen as
faulty syntax.
Expand Down
File renamed without changes.
File renamed without changes.
65 changes: 63 additions & 2 deletions R/recode_into.r → R/recode_into.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,13 @@
#' recode patterns. If `FALSE`, former recoded cases will not be altered by later
#' recode patterns that would apply to those cases again. A warning message is
#' printed to alert such situations and to avoid unintentional recodings.
#' @param preserve_na Logical, if `TRUE` and `default` is not `NA`, missing
#' values in the original variable will be set back to `NA` in the recoded
#' variable (unless overwritten by other recode patterns). If `FALSE`, missing
#' values in the original variable will be recoded to `default`. Setting
#' `preserve_na = TRUE` prevents unintentional overwriting of missing values
#' with `default`, which means that you won't find valid values where the
#' original data only had missing values. See 'Examples'.
#' @param verbose Toggle warnings.
#'
#' @return A vector with recoded values.
Expand Down Expand Up @@ -72,8 +79,38 @@
#' data = d,
#' default = 0
#' )
#'
#' # handling of missing values
#' d <- data.frame(
#' x = c(1, NA, 2, NA, 3, 4),
#' y = c(1, 11, 3, NA, 5, 6)
#' )
#' # first NA in x is overwritten by valid value from y
#' # we have no known value for second NA in x and y,
#' # thus we get one NA in the result
#' recode_into(
#' x <= 3 ~ 1,
#' y > 5 ~ 2,
#' data = d,
#' default = 0,
#' preserve_na = TRUE
#' )
#' # first NA in x is overwritten by valid value from y
#' # default value is used for second NA
#' recode_into(
#' x <= 3 ~ 1,
#' y > 5 ~ 2,
#' data = d,
#' default = 0,
#' preserve_na = FALSE
#' )
#' @export
recode_into <- function(..., data = NULL, default = NA, overwrite = TRUE, verbose = TRUE) {
recode_into <- function(...,
data = NULL,
default = NA,
overwrite = TRUE,
preserve_na = FALSE,
verbose = TRUE) {
dots <- list(...)

# get length of vector, so we know the length of the output vector
Expand Down Expand Up @@ -124,6 +161,9 @@ recode_into <- function(..., data = NULL, default = NA, overwrite = TRUE, verbos
)
}

# indicator to show message when replacing NA by default
# needed to show message only once
overwrite_NA_msg <- TRUE

# iterate all expressions
for (i in seq_len(n_params)) {
Expand All @@ -135,6 +175,12 @@ recode_into <- function(..., data = NULL, default = NA, overwrite = TRUE, verbos
index <- with(data, eval(dots[[i]][[2]]))
value <- with(data, eval(dots[[i]][[3]]))
}
# remember missing values, so we can add back later
missing_index <- is.na(index)
# make sure index has no missing values. when we have missing values in
# original expression, these are considered as "no match" and set to FALSE
# we handle NA value later and thus want to remove them from "index" now
index[is.na(index)] <- FALSE
# overwriting values? do more recode-patterns match the same case?
if (is.na(default)) {
already_exists <- !is.na(out[index])
Expand All @@ -144,7 +190,7 @@ recode_into <- function(..., data = NULL, default = NA, overwrite = TRUE, verbos
# save indices of overwritten cases
overwritten_cases <- which(index)[already_exists]
# tell user...
if (any(already_exists) && verbose) {
if (any(already_exists, na.rm = TRUE) && verbose) {
if (overwrite) {
msg <- paste(
"Several recode patterns apply to the same cases.",
Expand All @@ -164,7 +210,22 @@ recode_into <- function(..., data = NULL, default = NA, overwrite = TRUE, verbos
if (!overwrite) {
index[overwritten_cases] <- FALSE
}
# write new values into output vector
out[index] <- value
# set back missing values
if (any(missing_index) && !is.na(default)) {
if (preserve_na) {
# but only where we still have default values
# we don't want to overwrite already recoded values with NA
out[missing_index & out == default] <- NA
} else if (overwrite_NA_msg && verbose) {
# don't show msg again
overwrite_NA_msg <- FALSE
insight::format_alert(
"Missing values in original variable are overwritten by default value. If you want to preserve missing values, set `preserve_na = TRUE`."
)
}
}
}

out
Expand Down
139 changes: 139 additions & 0 deletions R/row_means.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
#' @title Row means (optionally with minimum amount of valid values)
#' @name row_means
#' @description This function is similar to the SPSS `MEAN.n` function and computes
#' row means from a data frame or matrix if at least `min_valid` values of a row are
#' valid (and not `NA`).
#'
#' @param data A data frame with at least two columns, where row means are applied.
#' @param min_valid Optional, a numeric value of length 1. May either be
#' - a numeric value that indicates the amount of valid values per row to
#' calculate the row mean;
#' - or a value between 0 and 1, indicating a proportion of valid values per
#' row to calculate the row mean (see 'Details').
#' - `NULL` (default), in which all cases are considered.
#'
#' If a row's sum of valid values is less than `min_valid`, `NA` will be returned.
#' @param digits Numeric value indicating the number of decimal places to be
#' used for rounding mean values. Negative values are allowed (see 'Details').
#' By default, `digits = NULL` and no rounding is used.
#' @param remove_na Logical, if `TRUE` (default), removes missing (`NA`) values
#' before calculating row means. Only applies if `min_valuid` is not specified.
#' @param verbose Toggle warnings.
#' @inheritParams find_columns
#'
#' @return A vector with row means for those rows with at least `n` valid values.
#'
#' @details Rounding to a negative number of `digits` means rounding to a power of
#' ten, for example `row_means(df, 3, digits = -2)` rounds to the nearest hundred.
#' For `min_valid`, if not `NULL`, `min_valid` must be a numeric value from `0`
#' to `ncol(data)`. If a row in the data frame has at least `min_valid`
#' non-missing values, the row mean is returned. If `min_valid` is a non-integer
#' value from 0 to 1, `min_valid` is considered to indicate the proportion of
#' required non-missing values per row. E.g., if `min_valid = 0.75`, a row must
#' have at least `ncol(data) * min_valid` non-missing values for the row mean
#' to be calculated. See 'Examples'.
#'
#' @examples
#' dat <- data.frame(
#' c1 = c(1, 2, NA, 4),
#' c2 = c(NA, 2, NA, 5),
#' c3 = c(NA, 4, NA, NA),
#' c4 = c(2, 3, 7, 8)
#' )
#'
#' # default, all means are shown, if no NA values are present
#' row_means(dat)
#'
#' # remove all NA before computing row means
#' row_means(dat, remove_na = TRUE)
#'
#' # needs at least 4 non-missing values per row
#' row_means(dat, min_valid = 4) # 1 valid return value
#'
#' # needs at least 3 non-missing values per row
#' row_means(dat, min_valid = 3) # 2 valid return values
#'
#' # needs at least 2 non-missing values per row
#' row_means(dat, min_valid = 2)
#'
#' # needs at least 1 non-missing value per row, for two selected variables
#' row_means(dat, select = c("c1", "c3"), min_valid = 1)
#'
#' # needs at least 50% of non-missing values per row
#' row_means(dat, min_valid = 0.5) # 3 valid return values
#'
#' # needs at least 75% of non-missing values per row
#' row_means(dat, min_valid = 0.75) # 2 valid return values
#'
#' @export
row_means <- function(data,
select = NULL,
exclude = NULL,
min_valid = NULL,
digits = NULL,
ignore_case = FALSE,
regex = FALSE,
remove_na = FALSE,
verbose = TRUE) {
# evaluate arguments
select <- .select_nse(select,
data,
exclude,
ignore_case = ignore_case,
regex = regex,
verbose = verbose
)

if (is.null(select) || length(select) == 0) {
insight::format_error("No columns selected.")
}

data <- .coerce_to_dataframe(data[select])

# n must be a numeric, non-missing value
if (!is.null(min_valid) && (all(is.na(min_valid)) || !is.numeric(min_valid) || length(min_valid) > 1)) {
insight::format_error("`min_valid` must be a numeric value of length 1.")
}

# make sure we only have numeric values
numeric_columns <- vapply(data, is.numeric, TRUE)
if (!all(numeric_columns)) {
if (verbose) {
insight::format_alert("Only numeric columns are considered for calculation.")
}
data <- data[numeric_columns]
}

# check if we have a data framme with at least two columns
if (ncol(data) < 2) {
insight::format_error("`data` must be a data frame with at least two numeric columns.")
}

# proceed here if min_valid is not NULL
if (!is.null(min_valid)) {
# is 'min_valid' indicating a proportion?
decimals <- min_valid %% 1
if (decimals != 0) {
min_valid <- round(ncol(data) * decimals)
}

# min_valid may not be larger as df's amount of columns
if (ncol(data) < min_valid) {
insight::format_error("`min_valid` must be smaller or equal to number of columns in data frame.")
}

# row means
to_na <- rowSums(is.na(data)) > ncol(data) - min_valid
out <- rowMeans(data, na.rm = TRUE)
out[to_na] <- NA
} else {
out <- rowMeans(data, na.rm = remove_na)
}

# round, if requested
if (!is.null(digits) && !all(is.na(digits))) {
out <- round(out, digits = digits)
}

out
}
Loading

0 comments on commit 04d08b3

Please sign in to comment.