From 6e3182c7eb9602badffb5db94cf3cea8149c193d Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 10 Aug 2023 18:46:34 +0200 Subject: [PATCH] Implement `rowmean_n()` (#445) * draft mean_n() * desc, news * pkgdown * more performant than apply * apply more performant * finalize, add tests * no rounding by default * mean_n -> rowmean_n * address comments * fix test * fix * use rowMeans() * use .coerce_to_dataframe() --------- Co-authored-by: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> --- DESCRIPTION | 2 +- NAMESPACE | 1 + NEWS.md | 5 ++ R/rowmean_n.R | 101 ++++++++++++++++++++++++++++++++ _pkgdown.yaml | 1 + man/describe_distribution.Rd | 13 +++- man/rowmean_n.Rd | 72 +++++++++++++++++++++++ tests/testthat/test-rowmean_n.R | 26 ++++++++ 8 files changed, 217 insertions(+), 4 deletions(-) create mode 100644 R/rowmean_n.R create mode 100644 man/rowmean_n.Rd create mode 100644 tests/testthat/test-rowmean_n.R diff --git a/DESCRIPTION b/DESCRIPTION index 6a0264ee0..3c71a6343 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.8.0.4 +Version: 0.8.0.5 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")), diff --git a/NAMESPACE b/NAMESPACE index 08ec43927..bb2d43766 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -272,6 +272,7 @@ export(reverse) export(reverse_scale) export(row_to_colnames) export(rowid_as_column) +export(rowmean_n) export(rownames_as_column) export(skewness) export(slide) diff --git a/NEWS.md b/NEWS.md index c3aee3b24..529ec1398 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,10 @@ # datawizard (devel) +NEW FUNCTIONS + +* `rowmean_n()`, to compute row means if row contains at least `n` non-missing + values. 
+ CHANGES * `recode_into()` gains an `overwrite` argument to skip overwriting already diff --git a/R/rowmean_n.R b/R/rowmean_n.R new file mode 100644 index 000000000..ab47cf511 --- /dev/null +++ b/R/rowmean_n.R @@ -0,0 +1,101 @@ +#' @title Row means with minimum amount of valid values +#' @name rowmean_n +#' @description This function is similar to the SPSS `MEAN.n` function and computes +#' row means from a data frame or matrix if at least `n` values of a row are +#' valid (and not `NA`). +#' +#' @param data A data frame with at least two columns, where row means are applied. +#' @param n A numeric value of length 1. May either be +#' - a numeric value that indicates the amount of valid values per row to +#' calculate the row mean; +#' - or a value between 0 and 1, indicating a proportion of valid values per +#' row to calculate the row mean (see 'Details'). +#' +#' If a row's number of valid values is less than `n`, `NA` will be returned. +#' @param digits Numeric value indicating the number of decimal places to be +#' used for rounding mean values. Negative values are allowed (see 'Details'). +#' By default, `digits = NULL` and no rounding is used. +#' @param verbose Toggle warnings. +#' +#' @return A vector with row means for those rows with at least `n` valid values. +#' +#' @details Rounding to a negative number of `digits` means rounding to a power of +#' ten, for example `rowmean_n(df, 3, digits = -2)` rounds to the nearest hundred. +#' `n` must be a numeric value from `0` to `ncol(data)`. If a row in the +#' data frame has at least `n` non-missing values, the row mean is returned. If +#' `n` is a non-integer value from 0 to 1, `n` is considered to indicate the +#' proportion of required non-missing values per row. E.g., if `n = 0.75`, a +#' row must have at least `round(ncol(data) * n)` non-missing values for the row mean +#' to be calculated. See 'Examples'. 
+#' +#' @examples +#' dat <- data.frame( +#' c1 = c(1, 2, NA, 4), +#' c2 = c(NA, 2, NA, 5), +#' c3 = c(NA, 4, NA, NA), +#' c4 = c(2, 3, 7, 8) +#' ) +#' +#' # needs at least 4 non-missing values per row +#' rowmean_n(dat, 4) # 1 valid return value +#' +#' # needs at least 3 non-missing values per row +#' rowmean_n(dat, 3) # 2 valid return values +#' +#' # needs at least 2 non-missing values per row +#' rowmean_n(dat, 2) +#' +#' # needs at least 1 non-missing value per row +#' rowmean_n(dat, 1) # all means are shown +#' +#' # needs at least 50% of non-missing values per row +#' rowmean_n(dat, 0.5) # 3 valid return values +#' +#' # needs at least 75% of non-missing values per row +#' rowmean_n(dat, 0.75) # 2 valid return values +#' +#' @export +rowmean_n <- function(data, n, digits = NULL, verbose = TRUE) { + data <- .coerce_to_dataframe(data) + + # n must be a numeric, non-missing value + if (is.null(n) || all(is.na(n)) || !is.numeric(n) || length(n) > 1) { + insight::format_error("`n` must be a numeric value of length 1.") + } + + # make sure we only have numeric values + numeric_columns <- vapply(data, is.numeric, TRUE) + if (!all(numeric_columns)) { + if (verbose) { + insight::format_alert("Only numeric columns are considered for calculation.") + } + data <- data[numeric_columns] + } + + # check if we have a data frame with at least two columns + if (ncol(data) < 2) { + insight::format_error("`data` must be a data frame with at least two numeric columns.") + } + + # is 'n' indicating a proportion? (any non-integer `n` is reduced to its fractional part) 
+ decimals <- n %% 1 + if (decimals != 0) { + n <- round(ncol(data) * decimals) + } + + # n may not be larger than the number of columns in the data frame + if (ncol(data) < n) { + insight::format_error("`n` must be smaller or equal to number of columns in data frame.") + } + + # row means; rows with fewer than `n` non-missing values are set to NA + to_na <- rowSums(is.na(data)) > ncol(data) - n + out <- rowMeans(data, na.rm = TRUE) + out[to_na] <- NA + + # round, if requested + if (!is.null(digits) && !all(is.na(digits))) { + out <- round(out, digits = digits) + } + out +} diff --git a/_pkgdown.yaml b/_pkgdown.yaml index 038a405a3..9d321aa78 100644 --- a/_pkgdown.yaml +++ b/_pkgdown.yaml @@ -64,6 +64,7 @@ reference: - smoothness - skewness - weighted_mean + - rowmean_n - mean_sd - title: Convert and Replace Data diff --git a/man/describe_distribution.Rd b/man/describe_distribution.Rd index a23069eea..fd229567d 100644 --- a/man/describe_distribution.Rd +++ b/man/describe_distribution.Rd @@ -50,9 +50,14 @@ describe_distribution(x, ...) \item{...}{Additional arguments to be passed to or from methods.} -\item{centrality}{The point-estimates (centrality indices) to compute. Character (vector) or list with one or more of these options: \code{"median"}, \code{"mean"}, \code{"MAP"} or \code{"all"}.} +\item{centrality}{The point-estimates (centrality indices) to compute. Character +(vector) or list with one or more of these options: \code{"median"}, \code{"mean"}, \code{"MAP"} +(see \code{\link[bayestestR:map_estimate]{map_estimate()}}), \code{"trimmed"} (which is just \code{mean(x, trim = threshold)}), +\code{"mode"} or \code{"all"}.} -\item{dispersion}{Logical, if \code{TRUE}, computes indices of dispersion related to the estimate(s) (\code{SD} and \code{MAD} for \code{mean} and \code{median}, respectively).} +\item{dispersion}{Logical, if \code{TRUE}, computes indices of dispersion related +to the estimate(s) (\code{SD} and \code{MAD} for \code{mean} and \code{median}, respectively). 
+Dispersion is not available for \code{"MAP"} or \code{"mode"} centrality indices.} \item{iqr}{Logical, if \code{TRUE}, the interquartile range is calculated (based on \code{\link[stats:IQR]{stats::IQR()}}, using \code{type = 6}).} @@ -71,7 +76,9 @@ the first centrality index (which is typically the median).} \item{iterations}{The number of bootstrap replicates for computing confidence intervals. Only applies when \code{ci} is not \code{NULL}.} -\item{threshold}{For \code{centrality = "trimmed"} (i.e. trimmed mean), indicates the fraction (0 to 0.5) of observations to be trimmed from each end of the vector before the mean is computed.} +\item{threshold}{For \code{centrality = "trimmed"} (i.e. trimmed mean), indicates +the fraction (0 to 0.5) of observations to be trimmed from each end of the +vector before the mean is computed.} \item{verbose}{Toggle warnings and messages.} diff --git a/man/rowmean_n.Rd b/man/rowmean_n.Rd new file mode 100644 index 000000000..df340eed3 --- /dev/null +++ b/man/rowmean_n.Rd @@ -0,0 +1,72 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rowmean_n.R +\name{rowmean_n} +\alias{rowmean_n} +\title{Row means with minimum amount of valid values} +\usage{ +rowmean_n(data, n, digits = NULL, verbose = TRUE) +} +\arguments{ +\item{data}{A data frame with at least two columns, where row means are applied.} + +\item{n}{A numeric value of length 1. May either be +\itemize{ +\item a numeric value that indicates the amount of valid values per row to +calculate the row mean; +\item or a value between 0 and 1, indicating a proportion of valid values per +row to calculate the row mean (see 'Details'). +} + +If a row's sum of valid values is less than \code{n}, \code{NA} will be returned.} + +\item{digits}{Numeric value indicating the number of decimal places to be +used for rounding mean values. Negative values are allowed (see 'Details'). 
+By default, \code{digits = NULL} and no rounding is used.} + +\item{verbose}{Toggle warnings.} +} +\value{ +A vector with row means for those rows with at least \code{n} valid values. +} +\description{ +This function is similar to the SPSS \code{MEAN.n} function and computes +row means from a data frame or matrix if at least \code{n} values of a row are +valid (and not \code{NA}). +} +\details{ +Rounding to a negative number of \code{digits} means rounding to a power of +ten, for example \code{rowmean_n(df, 3, digits = -2)} rounds to the nearest hundred. +For \code{n}, must be a numeric value from \code{0} to \code{ncol(data)}. If a row in the +data frame has at least \code{n} non-missing values, the row mean is returned. If +\code{n} is a non-integer value from 0 to 1, \code{n} is considered to indicate the +proportion of required non-missing values per row. E.g., if \code{n = 0.75}, a +row must have at least \code{ncol(data) * n} non-missing values for the row mean +to be calculated. See 'Examples'. 
+} +\examples{ +dat <- data.frame( + c1 = c(1, 2, NA, 4), + c2 = c(NA, 2, NA, 5), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, 8) +) + +# needs at least 4 non-missing values per row +rowmean_n(dat, 4) # 1 valid return value + +# needs at least 3 non-missing values per row +rowmean_n(dat, 3) # 2 valid return values + +# needs at least 2 non-missing values per row +rowmean_n(dat, 2) + +# needs at least 1 non-missing value per row +rowmean_n(dat, 1) # all means are shown + +# needs at least 50\% of non-missing values per row +rowmean_n(dat, 0.5) # 3 valid return values + +# needs at least 75\% of non-missing values per row +rowmean_n(dat, 0.75) # 2 valid return values + +} diff --git a/tests/testthat/test-rowmean_n.R b/tests/testthat/test-rowmean_n.R new file mode 100644 index 000000000..a17996ff6 --- /dev/null +++ b/tests/testthat/test-rowmean_n.R @@ -0,0 +1,26 @@ +test_that("rowmean_n", { + d_mn <- data.frame( + c1 = c(1, 2, NA, 4), + c2 = c(NA, 2, NA, 5), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, 8) + ) + expect_equal(rowmean_n(d_mn, 4), c(NA, 2.75, NA, NA), tolerance = 1e-3) + expect_equal(rowmean_n(d_mn, 3), c(NA, 2.75, NA, 5.66667), tolerance = 1e-3) + expect_equal(rowmean_n(d_mn, 2), c(1.5, 2.75, NA, 5.66667), tolerance = 1e-3) + expect_equal(rowmean_n(d_mn, 1), c(1.5, 2.75, 7, 5.66667), tolerance = 1e-3) + expect_equal(rowmean_n(d_mn, 0.5), c(1.5, 2.75, NA, 5.66667), tolerance = 1e-3) + expect_equal(rowmean_n(d_mn, 0.75), c(NA, 2.75, NA, 5.66667), tolerance = 1e-3) + expect_equal(rowmean_n(d_mn, 2, digits = 1), c(1.5, 2.8, NA, 5.7), tolerance = 1e-1) +}) + +test_that("rowmean_n, errors or messages", { + data(iris) + expect_error(rowmean_n(5, n = 1), regex = "`data` must be") + expect_error(rowmean_n(iris[1], n = 1), regex = "two numeric") + expect_error(rowmean_n(iris, n = NULL), regex = "numeric value") + expect_error(rowmean_n(iris, n = 1:4), regex = "numeric value") + expect_error(rowmean_n(iris, n = "a"), regex = "numeric value") + 
expect_message(rowmean_n(iris[1:3, ], n = 3), regex = "Only numeric") + expect_silent(rowmean_n(iris[1:3, ], n = 3, verbose = FALSE)) +})