Merge branch 'main' into unnormalize-grouped-data

easystats · Sep 10, 2023 · 04d08b3 · 04d08b3
2 parents 957dd35 + ae7df24
commit 04d08b3
Show file tree

Hide file tree

Showing 17 changed files with 608 additions and 212 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.8.0.8
+Version: 0.8.0.12
 Authors@R: c(
     person("Indrajeet", "Patil", , "[email protected]", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),

diff --git a/NAMESPACE b/NAMESPACE
@@ -279,9 +279,9 @@ export(reshape_longer)
 export(reshape_wider)
 export(reverse)
 export(reverse_scale)
+export(row_means)
 export(row_to_colnames)
 export(rowid_as_column)
-export(rowmean_n)
 export(rownames_as_column)
 export(skewness)
 export(slide)

diff --git a/NEWS.md b/NEWS.md
@@ -2,10 +2,10 @@
 
 NEW FUNCTIONS
 
-* `contr.deviation()` for sum-deviation contrast coding of factors.
+* `row_means()`, to compute row means, optionally only for the rows with at
+  least `min_valid` non-missing values.
 
-* `rowmean_n()`, to compute row means if row contains at least `n` non-missing
-  values.
+* `contr.deviation()` for sum-deviation contrast coding of factors.
 
 * `means_by_group()`, to compute mean values of variables, grouped by levels
   of specified factors.
@@ -15,6 +15,9 @@ CHANGES
 * `recode_into()` gains an `overwrite` argument to skip overwriting already
   recoded cases when multiple recode patterns apply to the same case.
 
+* `recode_into()` gains an `preserve_na` argument to preserve `NA` values
+  when recoding.
+
 * `data_read()` now passes the `encoding` argument to `data.table::fread()`.
   This allows to read files with non-ASCII characters.
 
@@ -24,12 +27,18 @@ CHANGES
 
 BUG FIXES
 
+* Fixed issue in `labels_to_levels()` when values of labels were not in sorted
+  order and values were not sequentially numbered.
+
 * Fixed issues in `data_write()` when writing labelled data into SPSS format
   and vectors were of different type as value labels.
 
 * Fixed issue in `recode_into()` with probably wrong case number printed in the
   warning when several recode patterns match to one case.
 
+* Fixed issue in `recode_into()` when original data contained `NA` values and
+  `NA` was not included in the recode pattern.
+
 * Fixed issue in `data_filter()` where functions containing a `=` (e.g. when
   naming arguments, like `grepl(pattern, x = a)`) were mistakenly seen as
   faulty syntax. 

diff --git a/R/data_peek.r → R/data_peek.R b/R/data_peek.r → R/data_peek.R
diff --git a/R/data_write.r → R/data_write.R b/R/data_write.r → R/data_write.R
diff --git a/R/recode_into.r → R/recode_into.R b/R/recode_into.r → R/recode_into.R
@@ -20,6 +20,13 @@
 #' recode patterns. If `FALSE`, former recoded cases will not be altered by later
 #' recode patterns that would apply to those cases again. A warning message is
 #' printed to alert such situations and to avoid unintentional recodings.
+#' @param preserve_na Logical, if `TRUE` and `default` is not `NA`, missing
+#' values in the original variable will be set back to `NA` in the recoded
+#' variable (unless overwritten by other recode patterns). If `FALSE`, missing
+#' values in the original variable will be recoded to `default`. Setting
+#' `preserve_na = TRUE` prevents unintentional overwriting of missing values
+#' with `default`, which means that you won't find valid values where the
+#' original data only had missing values. See 'Examples'.
 #' @param verbose Toggle warnings.
 #'
 #' @return A vector with recoded values.
@@ -72,8 +79,38 @@
 #'   data = d,
 #'   default = 0
 #' )
+#'
+#' # handling of missing values
+#' d <- data.frame(
+#'   x = c(1, NA, 2, NA, 3, 4),
+#'   y = c(1, 11, 3, NA, 5, 6)
+#' )
+#' # first NA in x is overwritten by valid value from y
+#' # we have no known value for second NA in x and y,
+#' # thus we get one NA in the result
+#' recode_into(
+#'   x <= 3 ~ 1,
+#'   y > 5 ~ 2,
+#'   data = d,
+#'   default = 0,
+#'   preserve_na = TRUE
+#' )
+#' # first NA in x is overwritten by valid value from y
+#' # default value is used for second NA
+#' recode_into(
+#'   x <= 3 ~ 1,
+#'   y > 5 ~ 2,
+#'   data = d,
+#'   default = 0,
+#'   preserve_na = FALSE
+#' )
 #' @export
-recode_into <- function(..., data = NULL, default = NA, overwrite = TRUE, verbose = TRUE) {
+recode_into <- function(...,
+                        data = NULL,
+                        default = NA,
+                        overwrite = TRUE,
+                        preserve_na = FALSE,
+                        verbose = TRUE) {
   dots <- list(...)
 
   # get length of vector, so we know the length of the output vector
@@ -124,6 +161,9 @@ recode_into <- function(..., data = NULL, default = NA, overwrite = TRUE, verbos
     )
   }
 
+  # indicator to show message when replacing NA by default
+  # needed to show message only once
+  overwrite_NA_msg <- TRUE
 
   # iterate all expressions
   for (i in seq_len(n_params)) {
@@ -135,6 +175,12 @@ recode_into <- function(..., data = NULL, default = NA, overwrite = TRUE, verbos
       index <- with(data, eval(dots[[i]][[2]]))
       value <- with(data, eval(dots[[i]][[3]]))
     }
+    # remember missing values, so we can add back later
+    missing_index <- is.na(index)
+    # make sure index has no missing values. when we have missing values in
+    # original expression, these are considered as "no match" and set to FALSE
+    # we handle NA value later and thus want to remove them from "index" now
+    index[is.na(index)] <- FALSE
     # overwriting values? do more recode-patterns match the same case?
     if (is.na(default)) {
       already_exists <- !is.na(out[index])
@@ -144,7 +190,7 @@ recode_into <- function(..., data = NULL, default = NA, overwrite = TRUE, verbos
     # save indices of overwritten cases
     overwritten_cases <- which(index)[already_exists]
     # tell user...
-    if (any(already_exists) && verbose) {
+    if (any(already_exists, na.rm = TRUE) && verbose) {
       if (overwrite) {
         msg <- paste(
           "Several recode patterns apply to the same cases.",
@@ -164,7 +210,22 @@ recode_into <- function(..., data = NULL, default = NA, overwrite = TRUE, verbos
     if (!overwrite) {
       index[overwritten_cases] <- FALSE
     }
+    # write new values into output vector
     out[index] <- value
+    # set back missing values
+    if (any(missing_index) && !is.na(default)) {
+      if (preserve_na) {
+        # but only where we still have default values
+        # we don't want to overwrite already recoded values with NA
+        out[missing_index & out == default] <- NA
+      } else if (overwrite_NA_msg && verbose) {
+        # don't show msg again
+        overwrite_NA_msg <- FALSE
+        insight::format_alert(
+          "Missing values in original variable are overwritten by default value. If you want to preserve missing values, set `preserve_na = TRUE`."
+        )
+      }
+    }
   }
 
   out

diff --git a/R/row_means.R b/R/row_means.R
@@ -0,0 +1,139 @@
+#' @title Row means (optionally with minimum amount of valid values)
+#' @name row_means
+#' @description This function is similar to the SPSS `MEAN.n` function and computes
+#' row means from a data frame or matrix if at least `min_valid` values of a row are
+#' valid (and not `NA`).
+#'
+#' @param data A data frame with at least two columns, where row means are applied.
+#' @param min_valid Optional, a numeric value of length 1. May either be
+#' - a numeric value that indicates the amount of valid values per row to
+#'   calculate the row mean;
+#' - or a value between 0 and 1, indicating a proportion of valid values per
+#'   row to calculate the row mean (see 'Details').
+#' - `NULL` (default), in which all cases are considered.
+#'
+#' If a row's sum of valid values is less than `min_valid`, `NA` will be returned.
+#' @param digits Numeric value indicating the number of decimal places to be
+#' used for rounding mean values. Negative values are allowed (see 'Details').
+#' By default, `digits = NULL` and no rounding is used.
+#' @param remove_na Logical, if `TRUE` (default), removes missing (`NA`) values
+#' before calculating row means. Only applies if `min_valuid` is not specified.
+#' @param verbose Toggle warnings.
+#' @inheritParams find_columns
+#'
+#' @return A vector with row means for those rows with at least `n` valid values.
+#'
+#' @details Rounding to a negative number of `digits` means rounding to a power of
+#' ten, for example `row_means(df, 3, digits = -2)` rounds to the nearest hundred.
+#' For `min_valid`, if not `NULL`, `min_valid` must be a numeric value from `0`
+#' to `ncol(data)`. If a row in the data frame has at least `min_valid`
+#' non-missing values, the row mean is returned. If `min_valid` is a non-integer
+#' value from 0 to 1, `min_valid` is considered to indicate the proportion of
+#' required non-missing values per row. E.g., if `min_valid = 0.75`, a row must
+#' have at least `ncol(data) * min_valid` non-missing values for the row mean
+#' to be calculated. See 'Examples'.
+#'
+#' @examples
+#' dat <- data.frame(
+#'   c1 = c(1, 2, NA, 4),
+#'   c2 = c(NA, 2, NA, 5),
+#'   c3 = c(NA, 4, NA, NA),
+#'   c4 = c(2, 3, 7, 8)
+#' )
+#'
+#' # default, all means are shown, if no NA values are present
+#' row_means(dat)
+#'
+#' # remove all NA before computing row means
+#' row_means(dat, remove_na = TRUE)
+#'
+#' # needs at least 4 non-missing values per row
+#' row_means(dat, min_valid = 4) # 1 valid return value
+#'
+#' # needs at least 3 non-missing values per row
+#' row_means(dat, min_valid = 3) # 2 valid return values
+#'
+#' # needs at least 2 non-missing values per row
+#' row_means(dat, min_valid = 2)
+#'
+#' # needs at least 1 non-missing value per row, for two selected variables
+#' row_means(dat, select = c("c1", "c3"), min_valid = 1)
+#'
+#' # needs at least 50% of non-missing values per row
+#' row_means(dat, min_valid = 0.5) # 3 valid return values
+#'
+#' # needs at least 75% of non-missing values per row
+#' row_means(dat, min_valid = 0.75) # 2 valid return values
+#'
+#' @export
+row_means <- function(data,
+                      select = NULL,
+                      exclude = NULL,
+                      min_valid = NULL,
+                      digits = NULL,
+                      ignore_case = FALSE,
+                      regex = FALSE,
+                      remove_na = FALSE,
+                      verbose = TRUE) {
+  # evaluate arguments
+  select <- .select_nse(select,
+    data,
+    exclude,
+    ignore_case = ignore_case,
+    regex = regex,
+    verbose = verbose
+  )
+
+  if (is.null(select) || length(select) == 0) {
+    insight::format_error("No columns selected.")
+  }
+
+  data <- .coerce_to_dataframe(data[select])
+
+  # n must be a numeric, non-missing value
+  if (!is.null(min_valid) && (all(is.na(min_valid)) || !is.numeric(min_valid) || length(min_valid) > 1)) {
+    insight::format_error("`min_valid` must be a numeric value of length 1.")
+  }
+
+  # make sure we only have numeric values
+  numeric_columns <- vapply(data, is.numeric, TRUE)
+  if (!all(numeric_columns)) {
+    if (verbose) {
+      insight::format_alert("Only numeric columns are considered for calculation.")
+    }
+    data <- data[numeric_columns]
+  }
+
+  # check if we have a data framme with at least two columns
+  if (ncol(data) < 2) {
+    insight::format_error("`data` must be a data frame with at least two numeric columns.")
+  }
+
+  # proceed here if min_valid is not NULL
+  if (!is.null(min_valid)) {
+    # is 'min_valid' indicating a proportion?
+    decimals <- min_valid %% 1
+    if (decimals != 0) {
+      min_valid <- round(ncol(data) * decimals)
+    }
+
+    # min_valid may not be larger as df's amount of columns
+    if (ncol(data) < min_valid) {
+      insight::format_error("`min_valid` must be smaller or equal to number of columns in data frame.")
+    }
+
+    # row means
+    to_na <- rowSums(is.na(data)) > ncol(data) - min_valid
+    out <- rowMeans(data, na.rm = TRUE)
+    out[to_na] <- NA
+  } else {
+    out <- rowMeans(data, na.rm = remove_na)
+  }
+
+  # round, if requested
+  if (!is.null(digits) && !all(is.na(digits))) {
+    out <- round(out, digits = digits)
+  }
+
+  out
+}