Implement rowmean_n() (#445)

* draft mean_n() * desc, news * pkgdown * more performant than apply * apply more performant * finalize, add tests * no rounding by default * mean_n -> rowmean_n * address comments * fix test * fix * use rowMeans() * use .coerce_to_dataframe() --------- Co-authored-by: Etienne Bacher <[email protected]>
easystats · Aug 10, 2023 · 6e3182c · 6e3182c · IndrajeetPatil · Aug 12, 2023
1 parent 3ff2209
commit 6e3182c
Show file tree

Hide file tree

Showing 8 changed files with 217 additions and 4 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.8.0.4
+Version: 0.8.0.5
 Authors@R: c(
     person("Indrajeet", "Patil", , "[email protected]", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),

diff --git a/NAMESPACE b/NAMESPACE
@@ -272,6 +272,7 @@ export(reverse)
 export(reverse_scale)
 export(row_to_colnames)
 export(rowid_as_column)
+export(rowmean_n)
 export(rownames_as_column)
 export(skewness)
 export(slide)

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,10 @@
 # datawizard (devel)
 
+NEW FUNCTIONS
+
+* `rowmean_n()`, to compute row means if row contains at least `n` non-missing
+  values.
+
 CHANGES
 
 * `recode_into()` gains an `overwrite` argument to skip overwriting already

diff --git a/R/rowmean_n.R b/R/rowmean_n.R
@@ -0,0 +1,101 @@
+#' @title Row means with minimum amount of valid values
+#' @name rowmean_n
+#' @description This function is similar to the SPSS `MEAN.n` function and computes
+#' row means from a data frame or matrix if at least `n` values of a row are
+#' valid (and not `NA`).
+#'
+#' @param data A data frame with at least two columns, where row means are applied.
+#' @param n A numeric value of length 1. May either be
+#' - a whole number from `0` to `ncol(data)` that indicates the amount of valid
+#'   values per row required to calculate the row mean;
+#' - or a value between 0 and 1 (both exclusive), indicating a proportion of
+#'   valid values per row required to calculate the row mean (see 'Details').
+#'
+#' If a row's number of valid values is less than `n`, `NA` will be returned.
+#' Negative, infinite or missing values, as well as non-integer values larger
+#' than 1, are rejected with an error.
+#' @param digits Numeric value indicating the number of decimal places to be
+#' used for rounding mean values. Negative values are allowed (see 'Details').
+#' By default, `digits = NULL` and no rounding is used.
+#' @param verbose Toggle warnings.
+#'
+#' @return A vector with row means for those rows with at least `n` valid values.
+#'
+#' @details Rounding to a negative number of `digits` means rounding to a power of
+#' ten, for example `rowmean_n(df, 3, digits = -2)` rounds to the nearest hundred.
+#'
+#' `n` must be a numeric value from `0` to `ncol(data)`. If a row in the
+#' data frame has at least `n` non-missing values, the row mean is returned. If
+#' `n` is a non-integer value between 0 and 1, `n` is considered to indicate the
+#' proportion of required non-missing values per row. The proportion is
+#' converted into a number of values with `round(ncol(data) * n)`. E.g., if
+#' `n = 0.75` and `data` has four columns, a row must have at least three
+#' non-missing values for the row mean to be calculated. See 'Examples'.
+#'
+#' @examples
+#' dat <- data.frame(
+#'   c1 = c(1, 2, NA, 4),
+#'   c2 = c(NA, 2, NA, 5),
+#'   c3 = c(NA, 4, NA, NA),
+#'   c4 = c(2, 3, 7, 8)
+#' )
+#'
+#' # needs at least 4 non-missing values per row
+#' rowmean_n(dat, 4) # 1 valid return value
+#'
+#' # needs at least 3 non-missing values per row
+#' rowmean_n(dat, 3) # 2 valid return values
+#'
+#' # needs at least 2 non-missing values per row
+#' rowmean_n(dat, 2)
+#'
+#' # needs at least 1 non-missing value per row
+#' rowmean_n(dat, 1) # all means are shown
+#'
+#' # needs at least 50% of non-missing values per row
+#' rowmean_n(dat, 0.5) # 3 valid return values
+#'
+#' # needs at least 75% of non-missing values per row
+#' rowmean_n(dat, 0.75) # 2 valid return values
+#'
+#' @export
+rowmean_n <- function(data, n, digits = NULL, verbose = TRUE) {
+  data <- .coerce_to_dataframe(data)
+
+  # n must be a single, finite numeric value. `is.finite()` also rejects
+  # `NA`/`NaN`/`Inf`, which would otherwise break the comparisons below.
+  if (!is.numeric(n) || length(n) != 1L || !is.finite(n)) {
+    insight::format_error("`n` must be a numeric value of length 1.")
+  }
+
+  # negative values are meaningless; note that e.g. `-0.5 %% 1` is `0.5`, so
+  # without this check a negative value could be mistaken for a proportion
+  if (n < 0) {
+    insight::format_error("`n` must not be negative.")
+  }
+
+  # only values between 0 and 1 are proportions - a non-integer value above 1
+  # (e.g. 2.5) is neither a valid count nor a valid proportion
+  is_fraction <- n %% 1 != 0
+  if (is_fraction && n > 1) {
+    insight::format_error("`n` must be either a whole number or a proportion between 0 and 1.")
+  }
+
+  # make sure we only have numeric values
+  numeric_columns <- vapply(data, is.numeric, TRUE)
+  if (!all(numeric_columns)) {
+    if (verbose) {
+      insight::format_alert("Only numeric columns are considered for calculation.")
+    }
+    data <- data[numeric_columns]
+  }
+
+  # check if we have a data frame with at least two columns
+  if (ncol(data) < 2) {
+    insight::format_error("`data` must be a data frame with at least two numeric columns.")
+  }
+
+  # 'n' indicates a proportion: convert it into a number of required values,
+  # based on the number of (numeric) columns that are actually used
+  if (is_fraction) {
+    n <- round(ncol(data) * n)
+  }
+
+  # n may not be larger than the data frame's number of columns
+  if (ncol(data) < n) {
+    insight::format_error("`n` must be smaller or equal to number of columns in data frame.")
+  }
+
+  # row means; rows with fewer than `n` non-missing values are set to NA
+  to_na <- rowSums(!is.na(data)) < n
+  out <- rowMeans(data, na.rm = TRUE)
+  out[to_na] <- NA
+
+  # round, if requested
+  if (!is.null(digits) && !all(is.na(digits))) {
+    out <- round(out, digits = digits)
+  }
+  out
+}
diff --git a/_pkgdown.yaml b/_pkgdown.yaml
@@ -64,6 +64,7 @@ reference:
       - smoothness
       - skewness
       - weighted_mean
+      - rowmean_n
       - mean_sd
 
   - title: Convert and Replace Data

diff --git a/man/describe_distribution.Rd b/man/describe_distribution.Rd
diff --git a/man/rowmean_n.Rd b/man/rowmean_n.Rd
diff --git a/tests/testthat/test-rowmean_n.R b/tests/testthat/test-rowmean_n.R
@@ -0,0 +1,26 @@
+test_that("rowmean_n", {
+  # the four rows hold 2, 4, 1 and 3 non-missing values, respectively
+  scores <- data.frame(
+    c1 = c(1, 2, NA, 4),
+    c2 = c(NA, 2, NA, 5),
+    c3 = c(NA, 4, NA, NA),
+    c4 = c(2, 3, 7, 8)
+  )
+  row_avgs <- c(1.5, 2.75, 7, 5.66667)
+  n_valid <- c(2, 4, 1, 3)
+
+  # expected result: the row mean where a row has at least `min_n` valid
+  # values, `NA` otherwise
+  want <- function(min_n) {
+    replace(row_avgs, n_valid < min_n, NA)
+  }
+
+  # `n` given as number of required valid values
+  expect_equal(rowmean_n(scores, 4), want(4), tolerance = 1e-3)
+  expect_equal(rowmean_n(scores, 3), want(3), tolerance = 1e-3)
+  expect_equal(rowmean_n(scores, 2), want(2), tolerance = 1e-3)
+  expect_equal(rowmean_n(scores, 1), want(1), tolerance = 1e-3)
+
+  # `n` given as proportion: 50% of four columns = 2, 75% = 3 valid values
+  expect_equal(rowmean_n(scores, 0.5), want(2), tolerance = 1e-3)
+  expect_equal(rowmean_n(scores, 0.75), want(3), tolerance = 1e-3)
+
+  # rounding of the returned means
+  expect_equal(
+    rowmean_n(scores, 2, digits = 1),
+    c(1.5, 2.8, NA, 5.7),
+    tolerance = 1e-1
+  )
+})
+
+test_that("rowmean_n, errors or messages", {
+  data(iris)
+
+  # invalid `data`: not a data frame / fewer than two numeric columns
+  expect_error(rowmean_n(5, n = 1), regexp = "`data` must be")
+  expect_error(rowmean_n(iris[1], n = 1), regexp = "two numeric")
+
+  # invalid `n`: missing, wrong length, wrong type
+  expect_error(rowmean_n(iris, n = NULL), regexp = "numeric value")
+  expect_error(rowmean_n(iris, n = 1:4), regexp = "numeric value")
+  expect_error(rowmean_n(iris, n = "a"), regexp = "numeric value")
+
+  # the non-numeric `Species` column is dropped, with a message unless
+  # `verbose = FALSE`
+  expect_message(rowmean_n(iris[1:3, ], n = 3), regexp = "Only numeric")
+  expect_silent(rowmean_n(iris[1:3, ], n = 3, verbose = FALSE))
+})