Merge pull request #293 from easystats/plot_data_tabulate

Plot method for `data_tabulate`
easystats · Aug 31, 2023 · cd304c4 · cd304c4
2 parents 5b80195 + 1079e2b
commit cd304c4
Show file tree

Hide file tree

Showing 5 changed files with 217 additions and 0 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -23,6 +23,8 @@ S3method(data_plot,performance_pp_check)
 S3method(data_plot,point_estimate)
 S3method(data_plot,rope)
 S3method(data_plot,see_compare_parameters)
+S3method(plot,dw_data_tabulate)
+S3method(plot,dw_data_tabulates)
 S3method(plot,see_bayesfactor_models)
 S3method(plot,see_bayesfactor_parameters)
 S3method(plot,see_bayesfactor_savagedickey)

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,10 @@
 # see (development version)
 
+## New features
+
+* There is now a `plot()` method for outputs of `datawizard::data_tabulate()` 
+  (#293).
+
 ## Minor Changes
 
 * The `print()` method for `performance::check_model()` now also evaluates the

diff --git a/R/plot.dw_data_tabulate.R b/R/plot.dw_data_tabulate.R
@@ -0,0 +1,137 @@
+#' Plot tabulated data.
+#'
+#' @param x Object created by `datawizard::data_tabulate()`.
+#' @param label_values Logical. Should values and percentages be displayed at the
+#'   top of each bar.
+#' @param show_na Should missing values be dropped? Can be `"if_any"` (default) to show
+#'   the missing category only if any missing values are present, `"always"` to
+#'   always show the missing category, or `"never"` to never show the missing
+#'   category.
+#' @param na_label The label given to missing values when they are shown.
+#' @param error_bar Logical. Should error bars be displayed?
+#'   If `TRUE`, confidence intervals computed using the Wilson method are shown.
+#'   See Brown et al. (2001) for details.
+#' @param ci Confidence Interval (CI) level. Default to `0.95` (⁠95%⁠).
+#' @param fill_col Color to use for category columns (default: "#87CEFA").
+#' @param color_error_bar Color to use for error bars (default: "#607B8B").
+#' @param ... Unused
+#'
+#' @references
+#' Brown, L. D., Cai, T. T., & DasGupta, A. (2001).
+#' Interval estimation for a binomial proportion.
+#' _Statistical Science, 16_(2), 101–133. \doi{10.1214/ss/1009213286}
+#'
+#' @rdname plot.dw_data_tabulate
+#' @export
+
+plot.dw_data_tabulates <- function(x, label_values = TRUE,
+                                   show_na = c("if_any", "always", "never"),
+                                   na_label = "(Missing)",
+                                   error_bar = TRUE, ci = .95,
+                                   fill_col = "#87CEFA",
+                                   color_error_bar = "#607B8B",
+                                   ...) {
+  show_na <- match.arg(show_na, choices = c("if_any", "always", "never"))
+  if (length(x) == 1) {
+    plot.dw_data_tabulate(
+      x[[1]], label_values = label_values,
+      show_na = show_na, na_label = na_label,
+      error_bar = error_bar, ci = ci,
+      fill_col = fill_col, color_error_bar = color_error_bar
+    )
+  } else {
+    lapply(x, plot.dw_data_tabulate,
+           label_values = label_values,
+           show_na = show_na, na_label = na_label,
+           error_bar = error_bar, ci = ci,
+           fill_col = fill_col, color_error_bar = color_error_bar
+    )
+  }
+}
+
+#' @rdname plot.dw_data_tabulate
+#'
+#' @export
+
+plot.dw_data_tabulate <- function(x, label_values = TRUE,
+                                  show_na = c("if_any", "always", "never"),
+                                  na_label = "(Missing)",
+                                  error_bar = TRUE, ci = .95,
+                                  fill_col = "#87CEFA",
+                                  color_error_bar = "#607B8B",
+                                  ...) {
+  show_na <- match.arg(show_na, choices = c("if_any", "always", "never"))
+  dat <- as.data.frame(x)
+
+  if (show_na == "if_any") {
+    if (any(is.na(dat$Value))) {
+      show_na <- ifelse(dat[is.na(dat$Value), "N"] > 0, "always", "never")
+    } else {
+      show_na <- "never"
+    }
+  }
+
+  if (show_na == "never") {
+    dat <- dat[!is.na(dat$Value), ]
+    dat$output <- dat[[which(startsWith(names(dat), "Valid"))]]
+  } else {
+    dat$output <- dat[[which(startsWith(names(dat), "Raw"))]]
+
+    # deal with missing values
+    dat$Value <- as.character(dat$Value)
+    dat$Value[is.na(dat$Value)] <- na_label
+    dat$Value <- factor(
+      dat$Value,
+      levels = c(setdiff(dat$Value, na_label), na_label)
+    )
+  }
+
+  if (isTRUE(error_bar)) {
+    total_n <- sum(dat$N)
+    props <- dat$output / 100
+    dat <- cbind(dat, CI = ci, .wilson_ci(prop = props, total_n = total_n, ci = ci) * total_n)
+    dat$label <- paste0(dat$N, " (", round(dat$output, 2), "%)")
+  } else {
+    dat$label <- paste0(dat$N, "\n(", round(dat$output, 2), "%)")
+  }
+
+  out <- ggplot2::ggplot(dat) +
+    ggplot2::aes(x = .data$Value, y = .data$N) +
+    ggplot2::geom_col(fill = fill_col) +
+    ggplot2::labs(title = unique(dat$Variable)) +
+    theme_modern()
+
+  if (isTRUE(label_values)) {
+    if (isTRUE(error_bar)) {
+      out <- out +
+        ggplot2::geom_text(ggplot2::aes(label = .data$label), vjust = -1, hjust = 1.2) +
+        ggplot2::coord_cartesian(ylim = c(0, max(dat$CI_high)))
+    } else {
+      out <- out +
+        ggplot2::geom_text(ggplot2::aes(label = .data$label), vjust = -0.5) +
+        ggplot2::coord_cartesian(ylim = c(0, max(dat$N) * 1.2))
+    }
+  }
+
+  # add confidence intervals for frequencies
+  if (isTRUE(error_bar)) {
+    out <- out +
+      ggplot2::geom_linerange(
+        ggplot2::aes(ymin = .data$CI_low, ymax = .data$CI_high),
+        color = color_error_bar
+      )
+  }
+
+  out
+}
+
+.wilson_ci <- function(prop, total_n, ci = .95) {
+  z <- stats::qnorm((1 - ci) / 2, lower.tail = FALSE)
+  z2 <- z^2
+  p1 <- prop + 0.5 * z2 / total_n
+  p2 <- z * sqrt((prop * (1 - prop) + 0.25 * z2 / total_n) / total_n)
+  p3 <- 1 + z2 / total_n
+  CI_low <- (p1 - p2) / p3
+  CI_high <- (p1 + p2) / p3
+  return(data.frame(CI_low = CI_low, CI_high = CI_high))
+}
diff --git a/man/plot.dw_data_tabulate.Rd b/man/plot.dw_data_tabulate.Rd
diff --git a/tests/testthat/test-plot.dw_data_tabulate.R b/tests/testthat/test-plot.dw_data_tabulate.R
@@ -0,0 +1,9 @@
+test_that("`plot.dw_data_tabulate()` works with single table", {
+  x <- datawizard::data_tabulate(mtcars, select = "cyl")
+  expect_s3_class(plot(x), "gg")
+})
+
+test_that("`plot.dw_data_tabulate()` works with multiple tables", {
+  x <- datawizard::data_tabulate(mtcars, select = c("cyl", "carb", "am"))
+  expect_true(is.list(plot(x)))
+})