From 6e3182c7eb9602badffb5db94cf3cea8149c193d Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Thu, 10 Aug 2023 18:46:34 +0200
Subject: [PATCH] Implement `rowmean_n()` (#445)

* draft mean_n()

* desc, news

* pkgdown

* more performant than apply

* apply more performant

* finalize, add tests

* no rounding by default

* mean_n -> rowmean_n

* address comments

* fix test

* fix

* use rowMeans()

* use .coerce_to_dataframe()

---------

Co-authored-by: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com>
---
 DESCRIPTION                     |   2 +-
 NAMESPACE                       |   1 +
 NEWS.md                         |   5 ++
 R/rowmean_n.R                   | 101 ++++++++++++++++++++++++++++++++
 _pkgdown.yaml                   |   1 +
 man/describe_distribution.Rd    |  13 +++-
 man/rowmean_n.Rd                |  72 +++++++++++++++++++++++
 tests/testthat/test-rowmean_n.R |  26 ++++++++
 8 files changed, 217 insertions(+), 4 deletions(-)
 create mode 100644 R/rowmean_n.R
 create mode 100644 man/rowmean_n.Rd
 create mode 100644 tests/testthat/test-rowmean_n.R

diff --git a/DESCRIPTION b/DESCRIPTION
index 6a0264ee0..3c71a6343 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.8.0.4
+Version: 0.8.0.5
 Authors@R: c(
     person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),
diff --git a/NAMESPACE b/NAMESPACE
index 08ec43927..bb2d43766 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -272,6 +272,7 @@ export(reverse)
 export(reverse_scale)
 export(row_to_colnames)
 export(rowid_as_column)
+export(rowmean_n)
 export(rownames_as_column)
 export(skewness)
 export(slide)
diff --git a/NEWS.md b/NEWS.md
index c3aee3b24..529ec1398 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,10 @@
 # datawizard (devel)
 
+NEW FUNCTIONS
+
+* `rowmean_n()`, to compute row means if row contains at least `n` non-missing
+  values.
+
 CHANGES
 
 * `recode_into()` gains an `overwrite` argument to skip overwriting already
diff --git a/R/rowmean_n.R b/R/rowmean_n.R
new file mode 100644
index 000000000..ab47cf511
--- /dev/null
+++ b/R/rowmean_n.R
@@ -0,0 +1,101 @@
+#' @title Row means with minimum amount of valid values
+#' @name rowmean_n
+#' @description This function is similar to the SPSS `MEAN.n` function and computes
+#' row means from a data frame or matrix if at least `n` values of a row are
+#' valid (and not `NA`).
+#'
+#' @param data A data frame with at least two columns, where row means are applied.
+#' @param n A numeric value of length 1. May either be
+#' - a numeric value that indicates the amount of valid values per row to
+#'   calculate the row mean;
+#' - or a value between 0 and 1, indicating a proportion of valid values per
+#'   row to calculate the row mean (see 'Details').
+#'
+#' If a row's number of valid values is less than `n`, `NA` will be returned.
+#' @param digits Numeric value indicating the number of decimal places to be
+#' used for rounding mean values. Negative values are allowed (see 'Details').
+#' By default, `digits = NULL` and no rounding is used.
+#' @param verbose Toggle warnings.
+#'
+#' @return A vector with row means for those rows with at least `n` valid values.
+#'
+#' @details Rounding to a negative number of `digits` means rounding to a power of
+#' ten, for example `rowmean_n(df, 3, digits = -2)` rounds to the nearest hundred.
+#' `n` must be a numeric value from `0` to `ncol(data)`. If a row in the
+#' data frame has at least `n` non-missing values, the row mean is returned. If
+#' `n` is a non-integer value from 0 to 1, `n` is considered to indicate the
+#' proportion of required non-missing values per row. E.g., if `n = 0.75`, a
+#' row must have at least `ncol(data) * n` non-missing values for the row mean
+#' to be calculated. See 'Examples'.
+#'
+#' @examples
+#' dat <- data.frame(
+#'   c1 = c(1, 2, NA, 4),
+#'   c2 = c(NA, 2, NA, 5),
+#'   c3 = c(NA, 4, NA, NA),
+#'   c4 = c(2, 3, 7, 8)
+#' )
+#'
+#' # needs at least 4 non-missing values per row
+#' rowmean_n(dat, 4) # 1 valid return value
+#'
+#' # needs at least 3 non-missing values per row
+#' rowmean_n(dat, 3) # 2 valid return values
+#'
+#' # needs at least 2 non-missing values per row
+#' rowmean_n(dat, 2)
+#'
+#' # needs at least 1 non-missing value per row
+#' rowmean_n(dat, 1) # all means are shown
+#'
+#' # needs at least 50% of non-missing values per row
+#' rowmean_n(dat, 0.5) # 3 valid return values
+#'
+#' # needs at least 75% of non-missing values per row
+#' rowmean_n(dat, 0.75) # 2 valid return values
+#'
+#' @export
+rowmean_n <- function(data, n, digits = NULL, verbose = TRUE) {
+  data <- .coerce_to_dataframe(data)
+
+  # `n` must be a single, non-missing numeric value
+  if (is.null(n) || all(is.na(n)) || !is.numeric(n) || length(n) > 1) {
+    insight::format_error("`n` must be a numeric value of length 1.")
+  }
+
+  # keep numeric columns only; tell the user if anything was dropped
+  numeric_columns <- vapply(data, is.numeric, TRUE)
+  if (!all(numeric_columns)) {
+    if (verbose) {
+      insight::format_alert("Only numeric columns are considered for calculation.")
+    }
+    data <- data[numeric_columns]
+  }
+
+  # a row mean needs a data frame with at least two numeric columns
+  if (ncol(data) < 2) {
+    insight::format_error("`data` must be a data frame with at least two numeric columns.")
+  }
+
+  # treat `n` as a proportion only if it lies strictly between 0 and 1; other
+  # non-integer values (e.g. 2.5) are counts, not `n %% 1` of the columns
+  if (n > 0 && n < 1) {
+    n <- round(ncol(data) * n)
+  }
+
+  # `n` may not be larger than the number of columns
+  if (ncol(data) < n) {
+    insight::format_error("`n` must be smaller or equal to number of columns in data frame.")
+  }
+
+  # rows with fewer than `n` valid values (i.e. too many NAs) are set to NA
+  to_na <- rowSums(is.na(data)) > ncol(data) - n
+  out <- rowMeans(data, na.rm = TRUE)
+  out[to_na] <- NA
+
+  # round, if requested
+  if (!is.null(digits) && !all(is.na(digits))) {
+    out <- round(out, digits = digits)
+  }
+  out
+}
diff --git a/_pkgdown.yaml b/_pkgdown.yaml
index 038a405a3..9d321aa78 100644
--- a/_pkgdown.yaml
+++ b/_pkgdown.yaml
@@ -64,6 +64,7 @@ reference:
       - smoothness
       - skewness
       - weighted_mean
+      - rowmean_n
       - mean_sd
 
   - title: Convert and Replace Data
diff --git a/man/describe_distribution.Rd b/man/describe_distribution.Rd
index a23069eea..fd229567d 100644
--- a/man/describe_distribution.Rd
+++ b/man/describe_distribution.Rd
@@ -50,9 +50,14 @@ describe_distribution(x, ...)
 
 \item{...}{Additional arguments to be passed to or from methods.}
 
-\item{centrality}{The point-estimates (centrality indices) to compute.  Character (vector) or list with one or more of these options: \code{"median"}, \code{"mean"}, \code{"MAP"} or \code{"all"}.}
+\item{centrality}{The point-estimates (centrality indices) to compute. Character
+(vector) or list with one or more of these options: \code{"median"}, \code{"mean"}, \code{"MAP"}
+(see \code{\link[bayestestR:map_estimate]{map_estimate()}}), \code{"trimmed"} (which is just \code{mean(x, trim = threshold)}),
+\code{"mode"} or \code{"all"}.}
 
-\item{dispersion}{Logical, if \code{TRUE}, computes indices of dispersion related to the estimate(s) (\code{SD} and \code{MAD} for \code{mean} and \code{median}, respectively).}
+\item{dispersion}{Logical, if \code{TRUE}, computes indices of dispersion related
+to the estimate(s) (\code{SD} and \code{MAD} for \code{mean} and \code{median}, respectively).
+Dispersion is not available for \code{"MAP"} or \code{"mode"} centrality indices.}
 
 \item{iqr}{Logical, if \code{TRUE}, the interquartile range is calculated
 (based on \code{\link[stats:IQR]{stats::IQR()}}, using \code{type = 6}).}
@@ -71,7 +76,9 @@ the first centrality index (which is typically the median).}
 \item{iterations}{The number of bootstrap replicates for computing confidence
 intervals. Only applies when \code{ci} is not \code{NULL}.}
 
-\item{threshold}{For \code{centrality = "trimmed"} (i.e. trimmed mean), indicates the fraction (0 to 0.5) of observations to be trimmed from each end of the vector before the mean is computed.}
+\item{threshold}{For \code{centrality = "trimmed"} (i.e. trimmed mean), indicates
+the fraction (0 to 0.5) of observations to be trimmed from each end of the
+vector before the mean is computed.}
 
 \item{verbose}{Toggle warnings and messages.}
 
diff --git a/man/rowmean_n.Rd b/man/rowmean_n.Rd
new file mode 100644
index 000000000..df340eed3
--- /dev/null
+++ b/man/rowmean_n.Rd
@@ -0,0 +1,72 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/rowmean_n.R
+\name{rowmean_n}
+\alias{rowmean_n}
+\title{Row means with minimum amount of valid values}
+\usage{
+rowmean_n(data, n, digits = NULL, verbose = TRUE)
+}
+\arguments{
+\item{data}{A data frame with at least two columns, where row means are applied.}
+
+\item{n}{A numeric value of length 1. May either be
+\itemize{
+\item a numeric value that indicates the amount of valid values per row to
+calculate the row mean;
+\item or a value between 0 and 1, indicating a proportion of valid values per
+row to calculate the row mean (see 'Details').
+}
+
+If a row's number of valid values is less than \code{n}, \code{NA} will be returned.}
+
+\item{digits}{Numeric value indicating the number of decimal places to be
+used for rounding mean values. Negative values are allowed (see 'Details').
+By default, \code{digits = NULL} and no rounding is used.}
+
+\item{verbose}{Toggle warnings.}
+}
+\value{
+A vector with row means for those rows with at least \code{n} valid values.
+}
+\description{
+This function is similar to the SPSS \code{MEAN.n} function and computes
+row means from a data frame or matrix if at least \code{n} values of a row are
+valid (and not \code{NA}).
+}
+\details{
+Rounding to a negative number of \code{digits} means rounding to a power of
+ten, for example \code{rowmean_n(df, 3, digits = -2)} rounds to the nearest hundred.
+\code{n} must be a numeric value from \code{0} to \code{ncol(data)}. If a row in the
+data frame has at least \code{n} non-missing values, the row mean is returned. If
+\code{n} is a non-integer value from 0 to 1, \code{n} is considered to indicate the
+proportion of required non-missing values per row. E.g., if \code{n = 0.75}, a
+row must have at least \code{ncol(data) * n} non-missing values for the row mean
+to be calculated. See 'Examples'.
+}
+\examples{
+dat <- data.frame(
+  c1 = c(1, 2, NA, 4),
+  c2 = c(NA, 2, NA, 5),
+  c3 = c(NA, 4, NA, NA),
+  c4 = c(2, 3, 7, 8)
+)
+
+# needs at least 4 non-missing values per row
+rowmean_n(dat, 4) # 1 valid return value
+
+# needs at least 3 non-missing values per row
+rowmean_n(dat, 3) # 2 valid return values
+
+# needs at least 2 non-missing values per row
+rowmean_n(dat, 2)
+
+# needs at least 1 non-missing value per row
+rowmean_n(dat, 1) # all means are shown
+
+# needs at least 50\% of non-missing values per row
+rowmean_n(dat, 0.5) # 3 valid return values
+
+# needs at least 75\% of non-missing values per row
+rowmean_n(dat, 0.75) # 2 valid return values
+
+}
diff --git a/tests/testthat/test-rowmean_n.R b/tests/testthat/test-rowmean_n.R
new file mode 100644
index 000000000..a17996ff6
--- /dev/null
+++ b/tests/testthat/test-rowmean_n.R
@@ -0,0 +1,26 @@
+test_that("rowmean_n", {
+  # rows hold 2, 4, 1 and 3 non-missing values, respectively
+  test_data <- data.frame(
+    c1 = c(1, 2, NA, 4), c2 = c(NA, 2, NA, 5),
+    c3 = c(NA, 4, NA, NA), c4 = c(2, 3, 7, 8)
+  )
+  expect_equal(rowmean_n(test_data, n = 4), c(NA, 2.75, NA, NA), tolerance = 1e-3)
+  expect_equal(rowmean_n(test_data, n = 3), c(NA, 2.75, NA, 5.66667), tolerance = 1e-3)
+  expect_equal(rowmean_n(test_data, n = 2), c(1.5, 2.75, NA, 5.66667), tolerance = 1e-3)
+  expect_equal(rowmean_n(test_data, n = 1), c(1.5, 2.75, 7, 5.66667), tolerance = 1e-3)
+  # `n` given as proportion of columns instead of a count
+  expect_equal(rowmean_n(test_data, n = 0.5), c(1.5, 2.75, NA, 5.66667), tolerance = 1e-3)
+  expect_equal(rowmean_n(test_data, n = 0.75), c(NA, 2.75, NA, 5.66667), tolerance = 1e-3)
+  expect_equal(rowmean_n(test_data, n = 2, digits = 1), c(1.5, 2.8, NA, 5.7), tolerance = 1e-1)
+})
+
+test_that("rowmean_n, errors or messages", {
+  data(iris)
+  expect_error(rowmean_n(5, n = 1), regexp = "`data` must be")
+  expect_error(rowmean_n(iris[1], n = 1), regexp = "two numeric") # one column only
+  expect_error(rowmean_n(iris, n = NULL), regexp = "numeric value")
+  expect_error(rowmean_n(iris, n = 1:4), regexp = "numeric value") # length > 1
+  expect_error(rowmean_n(iris, n = "a"), regexp = "numeric value") # not numeric
+  expect_message(rowmean_n(iris[1:3, ], n = 3), regexp = "Only numeric")
+  expect_silent(rowmean_n(iris[1:3, ], n = 3, verbose = FALSE))
+})