Unexpected missing values in data_tabulate() (#518)

* Unexpected missing values in `data_tabulate()` Fixes #514 * reverse usage * fix tests (FALSE -> TRUE) * docs, news * Update NEWS.md Co-authored-by: Indrajeet Patil <[email protected]> * add comment --------- Co-authored-by: Indrajeet Patil <[email protected]>
easystats · Jun 23, 2024 · ebe48b4 · ebe48b4
1 parent 218fbbb
commit ebe48b4
Show file tree

Hide file tree

Showing 10 changed files with 98 additions and 85 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.11.0.3
+Version: 0.11.0.4
 Authors@R: c(
     person("Indrajeet", "Patil", , "[email protected]", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),

diff --git a/NEWS.md b/NEWS.md
@@ -2,6 +2,11 @@
 
 BREAKING CHANGES
 
+* The argument `include_na` in `data_tabulate()` and `data_summary()` has been
+  renamed to `remove_na`. Consequently, to mimic former behaviour, `FALSE` and
+  `TRUE` need to be switched (i.e. `remove_na = TRUE` is equivalent to the former
+  `include_na = FALSE`).
+
 * Class names for objects returned by `data_tabulate()` have been changed to
   `datawizard_table` and `datawizard_crosstable` (resp. the plural forms,
   `*_tables`), to provide a clearer and more consistent naming scheme.

diff --git a/R/data_summary.R b/R/data_summary.R
@@ -8,9 +8,9 @@
 #' @param by Optional character string, indicating the name of a variable in `x`.
 #' If supplied, the data will be split by this variable and summary statistics
 #' will be computed for each group.
-#' @param include_na Logical. If `TRUE`, missing values are included as a level
-#' in the grouping variable. If `FALSE`, missing values are omitted from the
-#' grouping variable.
+#' @param remove_na Logical. If `TRUE`, missing values are omitted from the
+#' grouping variable. If `FALSE` (default), missing values are included as a
+#' level in the grouping variable.
 #' @param ... One or more named expressions that define the new variable name
 #' and the function to compute the summary statistic. Example:
 #' `mean_sepal_width = mean(Sepal.Width)`. The expression can also be provided
@@ -57,8 +57,8 @@ data_summary <- function(x, ...) {
 
 
 #' @export
-data_summary.matrix <- function(x, ..., by = NULL, include_na = TRUE) {
-  data_summary(as.data.frame(x), ..., by = by, include_na = include_na)
+data_summary.matrix <- function(x, ..., by = NULL, remove_na = FALSE) {
+  data_summary(as.data.frame(x), ..., by = by, remove_na = remove_na)
 }
 
 
@@ -70,7 +70,7 @@ data_summary.default <- function(x, ...) {
 
 #' @rdname data_summary
 #' @export
-data_summary.data.frame <- function(x, ..., by = NULL, include_na = TRUE) {
+data_summary.data.frame <- function(x, ..., by = NULL, remove_na = FALSE) {
   dots <- eval(substitute(alist(...)))
 
   # do we have any expression at all?
@@ -103,10 +103,10 @@ data_summary.data.frame <- function(x, ..., by = NULL, include_na = TRUE) {
     }
     # split data, add NA levels, if requested
     l <- lapply(x[by], function(i) {
-      if (include_na && anyNA(i)) {
-        addNA(i)
-      } else {
+      if (remove_na || !anyNA(i)) {
         i
+      } else {
+        addNA(i)
       }
     })
     split_data <- split(x, l, drop = TRUE)
@@ -137,7 +137,7 @@ data_summary.data.frame <- function(x, ..., by = NULL, include_na = TRUE) {
 
 
 #' @export
-data_summary.grouped_df <- function(x, ..., by = NULL, include_na = TRUE) {
+data_summary.grouped_df <- function(x, ..., by = NULL, remove_na = FALSE) {
   # extract group variables
   grps <- attr(x, "groups", exact = TRUE)
   group_variables <- data_remove(grps, ".rows")
@@ -148,7 +148,7 @@ data_summary.grouped_df <- function(x, ..., by = NULL, include_na = TRUE) {
   # remove information specific to grouped df's
   attr(x, "groups") <- NULL
   class(x) <- "data.frame"
-  data_summary(x, ..., by = by, include_na = include_na)
+  data_summary(x, ..., by = by, remove_na = remove_na)
 }
 
 

diff --git a/R/data_tabulate.R b/R/data_tabulate.R
@@ -15,7 +15,7 @@
 #' factor levels are dropped from the frequency table.
 #' @param name Optional character string, which includes the name that is used
 #' for printing.
-#' @param include_na Logical, if `TRUE`, missing values are included in the
+#' @param remove_na Logical, if `FALSE`, missing values are included in the
 #' frequency or crosstable, else missing values are omitted.
 #' @param collapse Logical, if `TRUE` collapses multiple tables into one larger
 #' table for printing. This affects only printing, not the returned object.
@@ -40,7 +40,7 @@
 #' (missing) values by default. The first column indicates values of `x`, the
 #' first row indicates values of `by` (including missing values). The last row
 #' and column contain the total frequencies for each row and column, respectively.
-#' Setting `include_na = FALSE` will omit missing values from the crosstable.
+#' Setting `remove_na = TRUE` will omit missing values from the crosstable.
 #' Setting `proportions` to `"row"` or `"column"` will add row or column
 #' percentages. Setting `proportions` to `"full"` will add relative frequencies
 #' for the full table.
@@ -62,7 +62,7 @@
 #' data_tabulate(efc$c172code)
 #'
 #' # drop missing values
-#' data_tabulate(efc$c172code, include_na = FALSE)
+#' data_tabulate(efc$c172code, remove_na = TRUE)
 #'
 #' # data frame
 #' data_tabulate(efc, c("e42dep", "c172code"))
@@ -109,7 +109,7 @@
 #'   efc$c172code,
 #'   by = efc$e16sex,
 #'   proportions = "column",
-#'   include_na = FALSE
+#'   remove_na = TRUE
 #' )
 #'
 #' # round percentages
@@ -133,7 +133,7 @@ data_tabulate.default <- function(x,
                                   by = NULL,
                                   drop_levels = FALSE,
                                   weights = NULL,
-                                  include_na = TRUE,
+                                  remove_na = FALSE,
                                   proportions = NULL,
                                   name = NULL,
                                   verbose = TRUE,
@@ -163,7 +163,7 @@ data_tabulate.default <- function(x,
       x,
       by = by,
       weights = weights,
-      include_na = include_na,
+      remove_na = remove_na,
       proportions = proportions,
       obj_name = obj_name,
       group_variable = group_variable
@@ -172,30 +172,34 @@ data_tabulate.default <- function(x,
 
   # frequency table
   if (is.null(weights)) {
-    if (include_na) {
-      freq_table <- tryCatch(table(addNA(x)), error = function(e) NULL)
-    } else {
+    if (remove_na) {
+      # `data_tabulate()` has a `.default` and a `.data.frame` method. Since
+      # this is the default method, `x` can be an object which cannot be used
+      # with `table()`. That's why we wrap the call in `tryCatch()` here.
+      # Below we give an informative error message for non-supported objects.
       freq_table <- tryCatch(table(x), error = function(e) NULL)
+    } else {
+      freq_table <- tryCatch(table(addNA(x)), error = function(e) NULL)
     }
-  } else if (include_na) {
-    # weighted frequency table, including NA
+  } else if (remove_na) {
+    # weighted frequency table, excluding NA
     freq_table <- tryCatch(
       stats::xtabs(
         weights ~ x,
-        data = data.frame(weights = weights, x = addNA(x)),
-        na.action = stats::na.pass,
-        addNA = TRUE
+        data = data.frame(weights = weights, x = x),
+        na.action = stats::na.omit,
+        addNA = FALSE
       ),
       error = function(e) NULL
     )
   } else {
-    # weighted frequency table, excluding NA
+    # weighted frequency table, including NA
     freq_table <- tryCatch(
       stats::xtabs(
         weights ~ x,
-        data = data.frame(weights = weights, x = x),
-        na.action = stats::na.omit,
-        addNA = FALSE
+        data = data.frame(weights = weights, x = addNA(x)),
+        na.action = stats::na.pass,
+        addNA = TRUE
       ),
       error = function(e) NULL
     )
@@ -218,12 +222,12 @@ data_tabulate.default <- function(x,
 
   out$`Raw %` <- 100 * out$N / sum(out$N)
   # if we have missing values, we add a row with NA
-  if (include_na) {
-    out$`Valid %` <- c(100 * out$N[-nrow(out)] / sum(out$N[-nrow(out)]), NA)
-    valid_n <- sum(out$N[-length(out$N)], na.rm = TRUE)
-  } else {
+  if (remove_na) {
     out$`Valid %` <- 100 * out$N / sum(out$N)
     valid_n <- sum(out$N, na.rm = TRUE)
+  } else {
+    out$`Valid %` <- c(100 * out$N[-nrow(out)] / sum(out$N[-nrow(out)]), NA)
+    valid_n <- sum(out$N[-length(out$N)], na.rm = TRUE)
   }
   out$`Cumulative %` <- cumsum(out$`Valid %`)
 
@@ -271,7 +275,7 @@ data_tabulate.data.frame <- function(x,
                                      by = NULL,
                                      drop_levels = FALSE,
                                      weights = NULL,
-                                     include_na = TRUE,
+                                     remove_na = FALSE,
                                      proportions = NULL,
                                      collapse = FALSE,
                                      verbose = TRUE,
@@ -297,7 +301,7 @@ data_tabulate.data.frame <- function(x,
       proportions = proportions,
       drop_levels = drop_levels,
       weights = weights,
-      include_na = include_na,
+      remove_na = remove_na,
       name = i,
       verbose = verbose,
       ...
@@ -326,7 +330,7 @@ data_tabulate.grouped_df <- function(x,
                                      proportions = NULL,
                                      drop_levels = FALSE,
                                      weights = NULL,
-                                     include_na = TRUE,
+                                     remove_na = FALSE,
                                      collapse = FALSE,
                                      verbose = TRUE,
                                      ...) {
@@ -362,7 +366,7 @@ data_tabulate.grouped_df <- function(x,
       verbose = verbose,
       drop_levels = drop_levels,
       weights = weights,
-      include_na = include_na,
+      remove_na = remove_na,
       by = by,
       proportions = proportions,
       group_variable = group_variable,

diff --git a/R/data_xtabulate.R b/R/data_xtabulate.R
@@ -3,7 +3,7 @@
 .crosstable <- function(x,
                         by,
                         weights = NULL,
-                        include_na = TRUE,
+                        remove_na = FALSE,
                         proportions = NULL,
                         obj_name = NULL,
                         group_variable = NULL) {
@@ -12,30 +12,34 @@
   }
   # frequency table
   if (is.null(weights)) {
-    if (include_na) {
-      x_table <- tryCatch(table(addNA(x), addNA(by)), error = function(e) NULL)
-    } else {
+    # `data_tabulate()` has a `.default` and a `.data.frame` method. Since the
+    # default method accepts any object, `x` can be an object which cannot be
+    # used with `table()`. That's why we wrap the call in `tryCatch()` here.
+    # Below we give an informative error message for non-supported objects.
+    if (remove_na) {
       x_table <- tryCatch(table(x, by), error = function(e) NULL)
+    } else {
+      x_table <- tryCatch(table(addNA(x), addNA(by)), error = function(e) NULL)
     }
-  } else if (include_na) {
-    # weighted frequency table, including NA
+  } else if (remove_na) {
+    # weighted frequency table, excluding NA
     x_table <- tryCatch(
       stats::xtabs(
         weights ~ x + by,
-        data = data.frame(weights = weights, x = addNA(x), by = addNA(by)),
-        na.action = stats::na.pass,
-        addNA = TRUE
+        data = data.frame(weights = weights, x = x, by = by),
+        na.action = stats::na.omit,
+        addNA = FALSE
       ),
       error = function(e) NULL
     )
   } else {
-    # weighted frequency table, excluding NA
+    # weighted frequency table, including NA
     x_table <- tryCatch(
       stats::xtabs(
         weights ~ x + by,
-        data = data.frame(weights = weights, x = x, by = by),
-        na.action = stats::na.omit,
-        addNA = FALSE
+        data = data.frame(weights = weights, x = addNA(x), by = addNA(by)),
+        na.action = stats::na.pass,
+        addNA = TRUE
       ),
       error = function(e) NULL
     )

diff --git a/man/data_summary.Rd b/man/data_summary.Rd
diff --git a/man/data_tabulate.Rd b/man/data_tabulate.Rd