Skip to content

Commit

Permalink
Implement means_by_group() (#446)
Browse files Browse the repository at this point in the history
* draft grouped mean

* add methods

* fix

* fix

* docs

* fix printing issues

* fix check issues

* fix

* news, desc

* fix issues

* add p-value

* emmeans to suggests

* add ci

* fix, tests

* snapshots

* docs

* add tests

* docs

* round weighted N

* fix for weights

* skip test if emmeans not installed

* typo in news

* fix typos in docs

---------

Co-authored-by: Etienne Bacher <[email protected]>
  • Loading branch information
strengejacke and etiennebacher authored Aug 12, 2023
1 parent 6e3182c commit 57b193d
Show file tree
Hide file tree
Showing 10 changed files with 678 additions and 18 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: datawizard
Title: Easy Data Wrangling and Statistical Transformations
Version: 0.8.0.5
Version: 0.8.0.6
Authors@R: c(
person("Indrajeet", "Patil", , "[email protected]", role = "aut",
comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),
Expand Down Expand Up @@ -43,6 +43,7 @@ Suggests:
data.table,
dplyr (>= 1.0),
effectsize,
emmeans,
gamm4,
ggplot2,
gt,
Expand Down
7 changes: 7 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ S3method(describe_distribution,numeric)
S3method(format,data_codebook)
S3method(format,dw_data_peek)
S3method(format,dw_data_tabulate)
S3method(format,dw_groupmeans)
S3method(format,parameters_distribution)
S3method(kurtosis,data.frame)
S3method(kurtosis,default)
Expand All @@ -76,6 +77,9 @@ S3method(labels_to_levels,data.frame)
S3method(labels_to_levels,default)
S3method(labels_to_levels,factor)
S3method(makepredictcall,dw_transformer)
S3method(means_by_group,data.frame)
S3method(means_by_group,default)
S3method(means_by_group,numeric)
S3method(normalize,data.frame)
S3method(normalize,factor)
S3method(normalize,grouped_df)
Expand All @@ -86,6 +90,8 @@ S3method(print,data_codebook)
S3method(print,dw_data_peek)
S3method(print,dw_data_tabulate)
S3method(print,dw_data_tabulates)
S3method(print,dw_groupmeans)
S3method(print,dw_groupmeans_list)
S3method(print,dw_transformer)
S3method(print,parameters_distribution)
S3method(print,parameters_kurtosis)
Expand Down Expand Up @@ -252,6 +258,7 @@ export(get_columns)
export(kurtosis)
export(labels_to_levels)
export(mean_sd)
export(means_by_group)
export(median_mad)
export(normalize)
export(print_html)
Expand Down
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ NEW FUNCTIONS
* `rowmean_n()`, to compute row means if row contains at least `n` non-missing
values.

* `means_by_group()`, to compute mean values of variables, grouped by levels
of specified factors.

CHANGES

* `recode_into()` gains an `overwrite` argument to skip overwriting already
Expand Down
294 changes: 294 additions & 0 deletions R/means_by_group.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,294 @@
#' @title Summary of mean values by group
#' @name means_by_group
#'
#' @description Computes summary table of means by groups.
#'
#' @param x A vector or a data frame.
#' @param group If `x` is a numeric vector, `group` should be a factor that
#' indicates the group-classifying categories. If `x` is a data frame, `group`
#' should be a character string, naming the variable in `x` that is used for
#' grouping. Numeric vectors are coerced to factors. Not that `group` should
#' only refer to a single variable.
#' @param ci Level of confidence interval for mean estimates. Default is `0.95`.
#' Use `ci = NA` to suppress confidence intervals.
#' @param weights If `x` is a numeric vector, `weights` should be a vector of
#' weights that will be applied to weight all observations. If `x` is a data
#' frame, `weights` can also be a character string indicating the name of the
#' variable in `x` that should be used for weighting. Default is `NULL`, so no
#' weights are used.
#' @param digits Optional scalar, indicating the amount of digits after decimal
#' point when rounding estimates and values.
#' @param ... Currently not used
#' @inheritParams find_columns
#'
#' @return A data frame with information on mean and further summary statistics
#' for each sub-group.
#'
#' @details This function is comparable to `aggregate(x, group, mean)`, but provides
#' some further information, including summary statistics from a One-Way-ANOVA
#' using `x` as dependent and `group` as independent variable. [`emmeans::contrast()`]
#' is used to get p-values for each sub-group. P-values indicate whether each
#' group-mean is significantly different from the total mean.
#'
#' @examples
#' data(efc)
#' means_by_group(efc, "c12hour", "e42dep")
#'
#' data(iris)
#' means_by_group(iris, "Sepal.Width", "Species")
#'
#' # weighting
#' efc$weight <- abs(rnorm(n = nrow(efc), mean = 1, sd = .5))
#' means_by_group(efc, "c12hour", "e42dep", weights = "weight")
#' @export
means_by_group <- function(x, ...) {
UseMethod("means_by_group")
}


#' @export
means_by_group.default <- function(x, ...) {
insight::format_error("`means_by_group()` does not work for objects of class `", class(x)[1], "`.")
}


#' @rdname means_by_group
#' @export
means_by_group.numeric <- function(x,
group = NULL,
ci = 0.95,
weights = NULL,
digits = NULL,
...) {
# sanity check for arguments

# "group" must be provided
if (is.null(group)) {
insight::format_error("Argument `group` is missing.")
}

# group must be of same length as x
if (length(group) != length(x)) {
insight::format_error("Argument `group` must be of same length as `x`.")
}

# if weights are provided, must be of same length as x
if (!is.null(weights) && length(weights) != length(x)) {
insight::format_error("Argument `weights` must be of same length as `x`.")
}

# if weights are NULL, set weights to 1
if (is.null(weights)) weights <- rep(1, length(x))

# retrieve labels
var_mean_label <- attr(x, "label", exact = TRUE)
var_grp_label <- attr(group, "label", exact = TRUE)

# if no labels present, use variable names directly
if (is.null(var_mean_label)) {
var_mean_label <- deparse(substitute(x))
}
if (is.null(var_grp_label)) {
var_grp_label <- deparse(substitute(group))
}

# coerce group to factor if numeric, or convert labels to levels, if factor
if (is.factor(group)) {
group <- tryCatch(labels_to_levels(group, verbose = FALSE), error = function(e) group)
} else {
group <- to_factor(group)
}

data <- stats::na.omit(data.frame(
x = x,
group = group,
weights = weights,
stringsAsFactors = FALSE
))

# get grouped means table
out <- .means_by_group(data, ci = ci)

# attributes
attr(out, "var_mean_label") <- var_mean_label
attr(out, "var_grp_label") <- var_grp_label
attr(out, "digits") <- digits

class(out) <- c("dw_groupmeans", "data.frame")
out
}


#' @rdname means_by_group
#' @export
means_by_group.data.frame <- function(x,
select = NULL,
group = NULL,
ci = 0.95,
weights = NULL,
digits = NULL,
exclude = NULL,
ignore_case = FALSE,
regex = FALSE,
verbose = TRUE,
...) {
# evaluate select/exclude, may be select-helpers
select <- .select_nse(select,
x,
exclude,
ignore_case,
regex = regex,
verbose = verbose
)

if (is.null(weights)) {
w <- NULL
} else if (is.character(weights)) {
w <- x[[weights]]
} else {
w <- weights
}

out <- lapply(select, function(i) {
# if no labels present, use variable names directy
if (is.null(attr(x[[i]], "label", exact = TRUE))) {
attr(x[[i]], "label") <- i
}
if (is.null(attr(x[[group]], "label", exact = TRUE))) {
attr(x[[group]], "label") <- group
}
# compute means table
means_by_group(x[[i]], group = x[[group]], ci = ci, weights = w, digits = digits, ...)
})

class(out) <- c("dw_groupmeans_list", "list")
out
}


#' @keywords internal
.means_by_group <- function(data, ci = 0.95) {
# compute anova statistics for mean table
if (is.null(data$weights) || all(data$weights == 1)) {
fit <- stats::lm(x ~ group, data = data)
} else {
fit <- stats::lm(x ~ group, weights = data$weights, data = data)
}

# summary table data
groups <- split(data$x, data$group)
group_weights <- split(data$weights, data$group)
out <- do.call(rbind, Map(function(x, w) {
data.frame(
Mean = weighted_mean(x, weights = w),
SD = weighted_sd(x, weights = w),
N = round(sum(w)),
stringsAsFactors = FALSE
)
}, groups, group_weights))

# add group names
out$Category <- levels(data$group)
out$p <- out$CI_high <- out$CI_low <- NA

# p-values of contrast-means
if (insight::check_if_installed("emmeans", quietly = TRUE)) {
# create summary table of contrasts, for p-values and confidence intervals
predicted <- emmeans::emmeans(fit, specs = "group", level = ci)
contrasts <- emmeans::contrast(predicted, method = "eff")
# add p-values and confidence intervals to "out"
if (!is.null(ci) && !is.na(ci)) {
summary_table <- as.data.frame(predicted)
out$CI_low <- summary_table$lower.CL
out$CI_high <- summary_table$upper.CL
}
summary_table <- as.data.frame(contrasts)
out$p <- summary_table$p.value
}

# reorder columns
out <- out[c("Category", "Mean", "N", "SD", "CI_low", "CI_high", "p")]

# finally, add total-row
out <- rbind(
out,
data.frame(
Category = "Total",
Mean = weighted_mean(data$x, weights = data$weights),
N = nrow(data),
SD = weighted_sd(data$x, weights = data$weights),
CI_low = NA,
CI_high = NA,
p = NA,
stringsAsFactors = FALSE
)
)

# get anova statistics for mean table
sum.fit <- summary(fit)

# r-squared values
r2 <- sum.fit$r.squared
r2.adj <- sum.fit$adj.r.squared

# F-statistics
fstat <- sum.fit$fstatistic
pval <- stats::pf(fstat[1], fstat[2], fstat[3], lower.tail = FALSE)

# copy as attributes
attr(out, "r2") <- r2
attr(out, "ci") <- ci
attr(out, "adj.r2") <- r2.adj
attr(out, "fstat") <- fstat[1]
attr(out, "p.value") <- pval

out
}


# methods -----------------

#' @export
format.dw_groupmeans <- function(x, digits = NULL, ...) {
if (is.null(digits)) {
digits <- attr(x, "digits", exact = TRUE)
}
if (is.null(digits)) {
digits <- 2
}
x$N <- insight::format_value(x$N, digits = 0)
insight::format_table(remove_empty_columns(x), digits = digits, ...)
}

#' @export
print.dw_groupmeans <- function(x, digits = NULL, ...) {
out <- format(x, digits = digits, ...)

# caption
l1 <- attributes(x)$var_mean_label
l2 <- attributes(x)$var_grp_label
if (!is.null(l1) && !is.null(l2)) {
caption <- c(paste0("# Mean of ", l1, " by ", l2), "blue")
} else {
caption <- NULL
}

# footer
footer <- paste0(
"\nAnova: R2=", insight::format_value(attributes(x)$r2, digits = 3),
"; adj.R2=", insight::format_value(attributes(x)$adj.r2, digits = 3),
"; F=", insight::format_value(attributes(x)$fstat, digits = 3),
"; ", insight::format_p(attributes(x)$p.value, whitespace = FALSE),
"\n"
)

cat(insight::export_table(out, caption = caption, footer = footer, ...))
}

#' @export
print.dw_groupmeans_list <- function(x, digits = NULL, ...) {
for (i in seq_along(x)) {
if (i > 1) cat("\n")
print(x[[i]], digits = digits, ...)
}
}
1 change: 1 addition & 0 deletions _pkgdown.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ reference:
- data_codebook
- data_tabulate
- data_peek
- means_by_group
- contains("distribution")
- kurtosis
- smoothness
Expand Down
Loading

0 comments on commit 57b193d

Please sign in to comment.