Merge branch 'main' into unnormalize-grouped-data

easystats · Aug 30, 2023 · 5fd51c8 · 5fd51c8
2 parents ff912ee + 10599b2
commit 5fd51c8
Show file tree

Hide file tree

Showing 9 changed files with 150 additions and 121 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -15,6 +15,9 @@ CHANGES
 * `recode_into()` gains an `overwrite` argument to skip overwriting already
   recoded cases when multiple recode patterns apply to the same case.
 
+* `data_read()` now passes the `encoding` argument to `data.table::fread()`.
+  This allows to read files with non-ASCII characters.
+
 * `datawizard` moves from the GPL-3 license to the MIT license.
 
 * `unnormalize()` and `unstandardize()` now work with grouped data (#415).
@@ -31,6 +34,10 @@ BUG FIXES
   naming arguments, like `grepl(pattern, x = a)`) were mistakenly seen as
   faulty syntax. 
 
+* Fixed issue in `empty_column()` for strings with invalid multibyte strings.
+  For such data frames or files, `empty_column()` or `data_read()` no longer
+  fails.
+
 # datawizard 0.8.0
 
 BREAKING CHANGES

diff --git a/R/contrs.R b/R/contrs.R
@@ -5,80 +5,81 @@
 #' @inheritParams stats::contr.sum
 #'
 #' @details
-#' In effects coding, unlike treatment/dummy coding ([stats::contr.treatment()]), each
-#' contrast sums to 0. In regressions models, this results in an intercept that
-#' represents the (unweighted) average of the group means. In ANOVA settings,
-#' this also guarantees that lower order effects represent _main_ effects (and
-#' not _simple_ or _conditional_ effects, as is the case when using R's default
-#' [stats::contr.treatment()]).
+#' In effects coding, unlike treatment/dummy coding
+#' ([stats::contr.treatment()]), each contrast sums to 0. In regressions models,
+#' this results in an intercept that represents the (unweighted) average of the
+#' group means. In ANOVA settings, this also guarantees that lower order effects
+#' represent _main_ effects (and not _simple_ or _conditional_ effects, as is
+#' the case when using R's default [stats::contr.treatment()]).
 #' \cr\cr
-#' Deviation coding (`contr.deviation`) is a type of effects coding.
-#' With deviation coding, the coefficients for factor variables are interpreted
-#' as the difference of each factor level from the base level
-#' (this is the same interpretation as with treatment/dummy coding).
-#' For example, for a factor `group` with levels "A", "B", and "C", with `contr.devation`,
-#' the intercept represents the overall mean (average of the group means for the 3 groups),
+#' Deviation coding (`contr.deviation`) is a type of effects coding. With
+#' deviation coding, the coefficients for factor variables are interpreted as
+#' the difference of each factor level from the base level (this is the same
+#' interpretation as with treatment/dummy coding). For example, for a factor
+#' `group` with levels "A", "B", and "C", with `contr.devation`, the intercept
+#' represents the overall mean (average of the group means for the 3 groups),
 #' and the coefficients `groupB` and `groupC` represent the differences between
 #' the A group mean and the B and C group means, respectively.
 #' \cr\cr
-#' Sum coding ([stats::contr.sum()]) is another type of effects coding.
-#' With sum coding, the coefficients for factor variables are interpreted
-#' as the difference of each factor level from **the grand (across-groups) mean**.
-#' For example, for a factor `group` with levels "A", "B", and "C", with `contr.sum`,
-#' the intercept represents the overall mean (average of the group means for the 3 groups),
-#' and the coefficients `group1` and `group2` represent the differences the
+#' Sum coding ([stats::contr.sum()]) is another type of effects coding. With sum
+#' coding, the coefficients for factor variables are interpreted as the
+#' difference of each factor level from **the grand (across-groups) mean**. For
+#' example, for a factor `group` with levels "A", "B", and "C", with
+#' `contr.sum`, the intercept represents the overall mean (average of the group
+#' means for the 3 groups), and the coefficients `group1` and `group2` represent
+#' the differences the
 #' **A** and **B** group means from the overall mean, respectively.
 #'
 #' @seealso [stats::contr.sum()]
 #'
 #' @examples
-#' \dontrun{
-#' data("mtcars")
+#' if (FALSE) {
+#'   data("mtcars")
 #'
-#' mtcars <- data_modify(mtcars, cyl = factor(cyl))
+#'   mtcars <- data_modify(mtcars, cyl = factor(cyl))
 #'
-#' c.treatment <- cbind(Intercept = 1, contrasts(mtcars$cyl))
-#' solve(c.treatment)
-#' #>            4 6 8
-#' #> Intercept  1 0 0  # mean of the 1st level
-#' #> 6         -1 1 0  # 2nd level - 1st level
-#' #> 8         -1 0 1  # 3rd level - 1st level
+#'   c.treatment <- cbind(Intercept = 1, contrasts(mtcars$cyl))
+#'   solve(c.treatment)
+#'   #>            4 6 8
+#'   #> Intercept  1 0 0  # mean of the 1st level
+#'   #> 6         -1 1 0  # 2nd level - 1st level
+#'   #> 8         -1 0 1  # 3rd level - 1st level
 #'
-#' contrasts(mtcars$cyl) <- contr.sum
-#' c.sum <- cbind(Intercept = 1, contrasts(mtcars$cyl))
-#' solve(c.sum)
-#' #>                4      6      8
-#' #> Intercept  0.333  0.333  0.333   # overall mean
-#' #>            0.667 -0.333 -0.333   # deviation of 1st from overall mean
-#' #>           -0.333  0.667 -0.333   # deviation of 2nd from overall mean
+#'   contrasts(mtcars$cyl) <- contr.sum
+#'   c.sum <- cbind(Intercept = 1, contrasts(mtcars$cyl))
+#'   solve(c.sum)
+#'   #>                4      6      8
+#'   #> Intercept  0.333  0.333  0.333   # overall mean
+#'   #>            0.667 -0.333 -0.333   # deviation of 1st from overall mean
+#'   #>           -0.333  0.667 -0.333   # deviation of 2nd from overall mean
 #'
 #'
-#' contrasts(mtcars$cyl) <- contr.deviation
-#' c.deviation <- cbind(Intercept = 1, contrasts(mtcars$cyl))
-#' solve(c.deviation)
-#' #>                4     6     8
-#' #> Intercept  0.333 0.333 0.333   # overall mean
-#' #> 6         -1.000 1.000 0.000   # 2nd level - 1st level
-#' #> 8         -1.000 0.000 1.000   # 3rd level - 1st level
+#'   contrasts(mtcars$cyl) <- contr.deviation
+#'   c.deviation <- cbind(Intercept = 1, contrasts(mtcars$cyl))
+#'   solve(c.deviation)
+#'   #>                4     6     8
+#'   #> Intercept  0.333 0.333 0.333   # overall mean
+#'   #> 6         -1.000 1.000 0.000   # 2nd level - 1st level
+#'   #> 8         -1.000 0.000 1.000   # 3rd level - 1st level
 #'
-#' ## With Interactions -----------------------------------------
-#' mtcars <- data_modify(mtcars, am = factor(am))
-#' mtcars <- data_arrange(mtcars, select = c("cyl", "am"))
+#'   ## With Interactions -----------------------------------------
+#'   mtcars <- data_modify(mtcars, am = C(am, contr = contr.deviation))
+#'   mtcars <- data_arrange(mtcars, select = c("cyl", "am"))
 #'
-#' mm <- unique(model.matrix(~ cyl * am, data = mtcars))
-#' rownames(mm) <- c(
-#'   "cyl4.am0", "cyl4.am1", "cyl6.am0",
-#'   "cyl6.am1", "cyl8.am0", "cyl8.am1"
-#' )
+#'   mm <- unique(model.matrix(~ cyl * am, data = mtcars))
+#'   rownames(mm) <- c(
+#'     "cyl4.am0", "cyl4.am1", "cyl6.am0",
+#'     "cyl6.am1", "cyl8.am0", "cyl8.am1"
+#'   )
 #'
-#' solve(mm)
-#' #>             cyl4.am0 cyl4.am1 cyl6.am0 cyl6.am1 cyl8.am0 cyl8.am1
-#' #> (Intercept)    0.167    0.167    0.167    0.167    0.167    0.167  # overall mean
-#' #> cyl6          -0.500   -0.500    0.500    0.500    0.000    0.000  # cyl MAIN eff: 2nd - 1st
-#' #> cyl8          -0.500   -0.500    0.000    0.000    0.500    0.500  # cyl MAIN eff: 2nd - 1st
-#' #> am1           -0.333    0.333   -0.333    0.333   -0.333    0.333  # am MAIN eff
-#' #> cyl6:am1       1.000   -1.000   -1.000    1.000    0.000    0.000
-#' #> cyl8:am1       1.000   -1.000    0.000    0.000   -1.000    1.000
+#'   solve(mm)
+#'   #>             cyl4.am0 cyl4.am1 cyl6.am0 cyl6.am1 cyl8.am0 cyl8.am1
+#'   #> (Intercept)    0.167    0.167    0.167    0.167    0.167    0.167  # overall mean
+#'   #> cyl6          -0.500   -0.500    0.500    0.500    0.000    0.000  # cyl MAIN eff: 2nd - 1st
+#'   #> cyl8          -0.500   -0.500    0.000    0.000    0.500    0.500  # cyl MAIN eff: 2nd - 1st
+#'   #> am1           -0.333    0.333   -0.333    0.333   -0.333    0.333  # am MAIN eff
+#'   #> cyl6:am1       1.000   -1.000   -1.000    1.000    0.000    0.000
+#'   #> cyl8:am1       1.000   -1.000    0.000    0.000   -1.000    1.000
 #' }
 #'
 #' @export

diff --git a/R/data_read.R b/R/data_read.R
@@ -271,7 +271,11 @@ data_read <- function(path,
 
 .read_text <- function(path, encoding, verbose, ...) {
   if (insight::check_if_installed("data.table", quietly = TRUE)) {
-    out <- data.table::fread(input = path, ...)
+    # set proper default encoding-value for fread
+    if (is.null(encoding)) {
+      encoding <- "unknown"
+    }
+    out <- data.table::fread(input = path, encoding = encoding, ...)
     class(out) <- "data.frame"
     return(out)
   }

diff --git a/R/demean.R b/R/demean.R
@@ -28,13 +28,13 @@
 #' @inheritParams center
 #'
 #' @return
-#'
 #' A data frame with the group-/de-meaned variables, which get the suffix
 #' `"_between"` (for the group-meaned variable) and `"_within"` (for the
 #' de-meaned variable) by default.
 #'
 #' @seealso If grand-mean centering (instead of centering within-clusters)
-#'   is required, see [center()].
+#'   is required, see [center()]. See [`performance::check_heterogeneity_bias()`]
+#'   to check for heterogeneity bias.
 #'
 #' @details
 #'

diff --git a/R/remove_empty.R b/R/remove_empty.R
@@ -63,7 +63,7 @@ empty_columns <- function(x) {
   } else {
     all_na <- colSums(is.na(x)) == nrow(x)
     all_empty <- vapply(x, function(i) {
-      (is.character(i) || is.factor(i)) && max(c(0, nchar(as.character(i))), na.rm = TRUE) == 0
+      (is.character(i) || is.factor(i)) && !any(nzchar(as.character(i[!is.na(i)])))
     }, FUN.VALUE = logical(1L))
 
     which(all_na | all_empty)

diff --git a/man/contr.deviation.Rd b/man/contr.deviation.Rd
diff --git a/man/demean.Rd b/man/demean.Rd
diff --git a/tests/testthat/test-contr.deviation.R b/tests/testthat/test-contr.deviation.R
@@ -21,8 +21,10 @@ test_that("contr.deviation | snapshot", {
   expect_snapshot(solve(c.deviation))
 
   mm <- unique(model.matrix(~ cyl * am, data = mtcars))
-  rownames(mm) <- c("cyl4.am0", "cyl4.am1", "cyl6.am0",
-                    "cyl6.am1", "cyl8.am0", "cyl8.am1")
+  rownames(mm) <- c(
+    "cyl4.am0", "cyl4.am1", "cyl6.am0",
+    "cyl6.am1", "cyl8.am0", "cyl8.am1"
+  )
 
   expect_snapshot(solve(mm))
 })
diff --git a/tests/testthat/test-empty-dataframe.R b/tests/testthat/test-empty-dataframe.R
@@ -59,3 +59,16 @@ test_that("empty_columns with only NA characters", {
   )
   expect_identical(empty_columns(tmp), c(var2 = 2L))
 })
+
+
+test_that("works with non-ascii chars", {
+  tmp <- data.frame(
+    a = c(1, 2, 3, NA, 5),
+    b = c("", NA, "", NA, ""),
+    c = c(NA, NA, NA, NA, NA),
+    d = c("test", "Se\x96ora", "works fine", "this too", "yeah"),
+    e = c("", "", "", "", ""),
+    stringsAsFactors = FALSE
+  )
+  expect_identical(empty_columns(tmp), c(b = 2L, c = 3L, e = 5L))
+})