From 0050749fc67e388c66d22ee8385b9d061c9620bf Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Mon, 19 Aug 2024 00:39:01 +0200
Subject: [PATCH] finalize, add tests

---
 NEWS.md                      | 50 ++++++++++++++++-------------
 R/demean.R                   | 62 ++++++++++++++++++++++++------------
 man/demean.Rd                | 33 +++++++++++++++----
 tests/testthat/test-demean.R | 43 +++++++++++++++++++++++++
 4 files changed, 139 insertions(+), 49 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 5e500313c..752227f4d 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,10 @@
+# datawizard 0.12.3
+
+CHANGES
+
+* `demean()` (and `degroup()`) now also work for nested designs, if argument
+  `nested = TRUE` and `by` specifies more than one variable.
+
 # datawizard 0.12.2
 
 * Remove `htmltools` from `Suggests` in an attempt of fixing an error in CRAN
@@ -73,8 +80,8 @@ BREAKING CHANGES
 
 * The following arguments were deprecated in 0.5.0 and are now removed:
 
-  * in `data_to_wide()`: `colnames_from`, `rows_from`, `sep` 
-  * in `data_to_long()`: `colnames_to` 
+  * in `data_to_wide()`: `colnames_from`, `rows_from`, `sep`
+  * in `data_to_long()`: `colnames_to`
   * in `data_partition()`: `training_proportion`
 
 NEW FUNCTIONS
@@ -93,7 +100,7 @@ CHANGES
   argument, to compute weighted frequency tables. `include_na` allows to include
   or omit missing values from the table. Furthermore, a `by` argument was added,
   to compute crosstables (#479, #481).
-  
+
 # datawizard 0.9.1
 
 CHANGES
@@ -144,7 +151,7 @@ CHANGES
 
 * `unnormalize()` and `unstandardize()` now work with grouped data (#415).
 
-* `unnormalize()` now errors instead of emitting a warning if it doesn't have the 
+* `unnormalize()` now errors instead of emitting a warning if it doesn't have the
   necessary info (#415).
 
 BUG FIXES
@@ -167,7 +174,7 @@ BUG FIXES
 
 * Fixed issue in `data_filter()` where functions containing a `=` (e.g. when
   naming arguments, like `grepl(pattern, x = a)`) were mistakenly seen as
-  faulty syntax. 
+  faulty syntax.
 
 * Fixed issue in `empty_column()` for strings with invalid multibyte strings.
   For such data frames or files, `empty_column()` or `data_read()` no longer
@@ -204,14 +211,14 @@ CHANGES
 
 NEW FUNCTIONS
 
-* `rowid_as_column()` to complement `rownames_as_column()` (and to mimic 
-  `tibble::rowid_to_column()`). Note that its behavior is different from 
+* `rowid_as_column()` to complement `rownames_as_column()` (and to mimic
+  `tibble::rowid_to_column()`). Note that its behavior is different from
   `tibble::rowid_to_column()` for grouped data. See the Details section in the
   docs.
 
 * `data_unite()`, to merge values of multiple variables into one new variable.
 
-* `data_separate()`, as counterpart to `data_unite()`, to separate a single 
+* `data_separate()`, as counterpart to `data_unite()`, to separate a single
   variable into multiple new variables.
 
 * `data_modify()`, to create new variables, or modify or remove existing
@@ -234,7 +241,7 @@ BUG FIXES
 
 * `center()` and `standardize()` did not work for grouped data frames (of class
   `grouped_df`) when `force = TRUE`.
-  
+
 * The `data.frame` method of `describe_distribution()` returns `NULL` instead of
   an error if no valid variable were passed (for example a factor variable with
   `include_factors = FALSE`) (#421).
@@ -262,12 +269,12 @@ BUG FIXES
 
 # datawizard 0.7.0
 
-BREAKING CHANGES 
+BREAKING CHANGES
 
 * In selection patterns, expressions like `-var1:var3` to exclude all variables
   between `var1` and `var3` are no longer accepted. The correct expression is
   `-(var1:var3)`. This is for 2 reasons:
-  
+
   * to be consistent with the behavior for numerics (`-1:2` is not accepted but
     `-(1:2)` is);
   * to be consistent with `dplyr::select()`, which throws a warning and only
@@ -279,8 +286,8 @@ NEW FUNCTIONS
   or more variables into a new variable.
 
 * `mean_sd()` and `median_mad()` for summarizing vectors to their mean (or
-  median) and a range of one SD (or MAD) above and below.  
-  
+  median) and a range of one SD (or MAD) above and below.
+
 * `data_write()` as counterpart to `data_read()`, to write data frames into
   CSV, SPSS, SAS, Stata files and many other file types. One advantage over
   existing functions to write data in other packages is that labelled (numeric)
@@ -296,8 +303,8 @@ MINOR CHANGES
 
 * `data_rename()` gets a `verbose` argument.
 * `winsorize()` now errors if the threshold is incorrect (previously, it provided
-  a warning and returned the unchanged data). The argument `verbose` is now 
-  useless but is kept for backward compatibility. The documentation now contains   
+  a warning and returned the unchanged data). The argument `verbose` is now
+  useless but is kept for backward compatibility. The documentation now contains
   details about the valid values for `threshold` (#357).
 * In all functions that have arguments `select` and/or `exclude`, there is now
   one warning per misspelled variable. The previous behavior was to have only one
@@ -318,7 +325,7 @@ BUG FIXES
 * Fix unexpected warning in `convert_na_to()` when `select` is a list (#352).
 * Fixed issue with correct labelling of numeric variables with more than nine
   unique values and associated value labels.
-  
+
 
 # datawizard 0.6.5
 
@@ -350,7 +357,7 @@ NEW FUNCTIONS
 * `data_codebook()`: to generate codebooks of data frames.
 
 * New functions to deal with duplicates: `data_duplicated()` (keep all duplicates,
-  including the first occurrence) and `data_unique()` (returns the data, excluding 
+  including the first occurrence) and `data_unique()` (returns the data, excluding
   all duplicates except one instance of each, based on the selected method).
 
 MINOR CHANGES
@@ -360,15 +367,15 @@ MINOR CHANGES
 * The `include_bounds` argument in `normalize()` can now also be a numeric
   value, defining the limit to the upper and lower bound (i.e. the distance
   to 1 and 0).
-  
-* `data_filter()` now works with grouped data. 
+
+* `data_filter()` now works with grouped data.
 
 BUG FIXES
 
 * `data_read()` no longer prints message for empty columns when the data
   actually had no empty columns.
-  
- * `data_to_wide()` now drops columns that are not in `id_cols` (if specified), 
+
+ * `data_to_wide()` now drops columns that are not in `id_cols` (if specified),
   `names_from`, or `values_from`. This is the behaviour observed in `tidyr::pivot_wider()`.
 
 # datawizard 0.6.3
@@ -800,4 +807,3 @@ NEW FUNCTIONS
 # datawizard 0.1.0
 
 * First release.
-
diff --git a/R/demean.R b/R/demean.R
index 8b84fa63b..194719a3c 100644
--- a/R/demean.R
+++ b/R/demean.R
@@ -11,7 +11,7 @@
 #' @param x A data frame.
 #' @param select Character vector (or formula) with names of variables to select
 #'   that should be group- and de-meaned.
-#' @param by Character vector (or formula) with the name of the variable(s) that
+#' @param by Character vector (or formula) with the name of the variable that
 #'   indicates the group- or cluster-ID. For cross-classified or nested designs,
 #'   `by` can also identify two or more variables as group- or cluster-IDs. If
 #'   the data is nested and should be treated as such, set `nested = TRUE`. Else,
@@ -20,7 +20,8 @@
 #'
 #'   For nested designs, `by` can be:
 #'   - a character vector with the name of the variable that indicates the
-#'     levels, ordered from *highest* level to *lowest* (e.g. `by = c("L3", "L2"`).
+#'     levels, ordered from *highest* level to *lowest* (e.g.
+#'     `by = c("L4", "L3", "L2")`).
 #'   - a character vector with variable names in the format `by = "L4/L3/L2"`,
 #'     where the levels are separated by `/`.
 #'
@@ -47,7 +48,10 @@
 #' @return
 #' A data frame with the group-/de-meaned variables, which get the suffix
 #' `"_between"` (for the group-meaned variable) and `"_within"` (for the
-#' de-meaned variable) by default.
+#' de-meaned variable) by default. For cross-classified or nested designs,
+#' the name pattern of the group-meaned variables is the name of the centered
+#' variable followed by the name of the variable that indicates the related
+#' grouping level, e.g. `predictor_L3_between` and `predictor_L2_between`.
 #'
 #' @seealso If grand-mean centering (instead of centering within-clusters)
 #'   is required, see [`center()`]. See [`performance::check_heterogeneity_bias()`]
@@ -178,19 +182,30 @@
 #'
 #' @section De-meaning for cross-classified designs:
 #'
-#' `demean()` can also handle cross-classified designs, where the data has two
-#' or more groups at the higher (i.e. second) level. In such cases, the
+#' `demean()` can handle cross-classified designs, where the data has two or
+#' more groups at the higher (i.e. second) level. In such cases, the
 #' `by`-argument can identify two or more variables that represent the
 #'  cross-classified group- or cluster-IDs. The de-meaned variables for
 #' cross-classified designs are simply subtracting all group means from each
 #' individual value, i.e. _fully cluster-mean-centering_ (see _Guo et al. 2024_
 #' for details). Note that de-meaning for cross-classified designs is *not*
 #' equivalent to de-meaning of nested data structures from models with three or
-#' more levels. Set `nested = TRUE` to explicitly assume a nested design. for
+#' more levels. Set `nested = TRUE` to explicitly assume a nested design. For
 #' cross-classified designs, de-meaning is supposed to work for models like
 #' `y ~ x + (1|level3) + (1|level2)`, but *not* for models like
 #' `y ~ x + (1|level3/level2)`.
 #'
+#' @section De-meaning for nested designs:
+#'
+#' _Brincks et al. (2017)_ have suggested an algorithm to center variables for
+#' nested designs, which is implemented in `demean()`. For nested designs,
+#' set `nested = TRUE` *and* specify the variables that indicate the different
+#' levels in descending order in the `by` argument. E.g.,
+#' `by = c("level4", "level3", "level2")` assumes a model like
+#' `y ~ x + (1|level4/level3/level2)`. An alternative notation for the
+#' `by`-argument would be `by = c("level4/level3/level2")`, similar to the
+#' formula notation.
+#'
 #' @section Analysing panel data with mixed models using lme4:
 #'
 #' A description of how to translate the formulas described in *Bell et al. 2018*
@@ -200,35 +215,40 @@
 #' @references
 #'
 #'   - Bafumi J, Gelman A. 2006. Fitting Multilevel Models When Predictors
-#'   and Group Effects Correlate. In. Philadelphia, PA: Annual meeting of the
-#'   American Political Science Association.
+#'     and Group Effects Correlate. In. Philadelphia, PA: Annual meeting of the
+#'     American Political Science Association.
 #'
 #'   - Bell A, Fairbrother M, Jones K. 2019. Fixed and Random Effects
-#'   Models: Making an Informed Choice. Quality & Quantity (53); 1051-1074
+#'     Models: Making an Informed Choice. Quality & Quantity (53); 1051-1074
 #'
 #'   - Bell A, Jones K. 2015. Explaining Fixed Effects: Random Effects
-#'   Modeling of Time-Series Cross-Sectional and Panel Data. Political Science
-#'   Research and Methods, 3(1), 133–153.
+#'     Modeling of Time-Series Cross-Sectional and Panel Data. Political Science
+#'     Research and Methods, 3(1), 133–153.
+#'
+#'   - Brincks, A. M., Enders, C. K., Llabre, M. M., Bulotsky-Shearer, R. J.,
+#'     Prado, G., and Feaster, D. J. (2017). Centering Predictor Variables in
+#'     Three-Level Contextual Models. Multivariate Behavioral Research, 52(2),
+#'     149–163. https://doi.org/10.1080/00273171.2016.1256753
 #'
 #'   - Gelman A, Hill J. 2007. Data Analysis Using Regression and
-#'   Multilevel/Hierarchical Models. Analytical Methods for Social Research.
-#'   Cambridge, New York: Cambridge University Press
+#'     Multilevel/Hierarchical Models. Analytical Methods for Social Research.
+#'     Cambridge, New York: Cambridge University Press
 #'
 #'   - Giesselmann M, Schmidt-Catran, AW. 2020. Interactions in fixed
-#'   effects regression models. Sociological Methods & Research, 1–28.
-#'   https://doi.org/10.1177/0049124120914934
+#'     effects regression models. Sociological Methods & Research, 1–28.
+#'     https://doi.org/10.1177/0049124120914934
 #'
 #'   - Guo Y, Dhaliwal J, Rights JD. 2024. Disaggregating level-specific effects
-#'   in cross-classified multilevel models. Behavior Research Methods, 56(4),
-#'   3023–3057.
+#'     in cross-classified multilevel models. Behavior Research Methods, 56(4),
+#'     3023–3057.
 #'
 #'   - Heisig JP, Schaeffer M, Giesecke J. 2017. The Costs of Simplicity:
-#'   Why Multilevel Models May Benefit from Accounting for Cross-Cluster
-#'   Differences in the Effects of Controls. American Sociological Review 82
-#'   (4): 796–827.
+#'     Why Multilevel Models May Benefit from Accounting for Cross-Cluster
+#'     Differences in the Effects of Controls. American Sociological Review 82
+#'     (4): 796–827.
 #'
 #'   - Hoffman L. 2015. Longitudinal analysis: modeling within-person
-#'   fluctuation and change. New York: Routledge
+#'     fluctuation and change. New York: Routledge
 #'
 #' @examples
 #'
diff --git a/man/demean.Rd b/man/demean.Rd
index b2e9b463d..40d394baa 100644
--- a/man/demean.Rd
+++ b/man/demean.Rd
@@ -50,7 +50,7 @@ detrend(
 \item{select}{Character vector (or formula) with names of variables to select
 that should be group- and de-meaned.}
 
-\item{by}{Character vector (or formula) with the name of the variable(s) that
+\item{by}{Character vector (or formula) with the name of the variable that
 indicates the group- or cluster-ID. For cross-classified or nested designs,
 \code{by} can also identify two or more variables as group- or cluster-IDs. If
 the data is nested and should be treated as such, set \code{nested = TRUE}. Else,
@@ -60,7 +60,8 @@ design is assumed.
 For nested designs, \code{by} can be:
 \itemize{
 \item a character vector with the name of the variable that indicates the
-levels, ordered from \emph{highest} level to \emph{lowest} (e.g. \verb{by = c("L3", "L2"}).
+levels, ordered from \emph{highest} level to \emph{lowest} (e.g.
+\code{by = c("L4", "L3", "L2")}).
 \item a character vector with variable names in the format \code{by = "L4/L3/L2"},
 where the levels are separated by \code{/}.
 }
@@ -94,7 +95,10 @@ or \code{"max"}.}
 \value{
 A data frame with the group-/de-meaned variables, which get the suffix
 \code{"_between"} (for the group-meaned variable) and \code{"_within"} (for the
-de-meaned variable) by default.
+de-meaned variable) by default. For cross-classified or nested designs,
+the name pattern of the group-meaned variables is the name of the centered
+variable followed by the name of the variable that indicates the related
+grouping level, e.g. \code{predictor_L3_between} and \code{predictor_L2_between}.
 }
 \description{
 \code{demean()} computes group- and de-meaned versions of a variable that can be
@@ -244,20 +248,33 @@ the term as interaction for the \code{select}-argument, e.g. \code{select = "a*b
 \section{De-meaning for cross-classified designs}{
 
 
-\code{demean()} can also handle cross-classified designs, where the data has two
-or more groups at the higher (i.e. second) level. In such cases, the
+\code{demean()} can handle cross-classified designs, where the data has two or
+more groups at the higher (i.e. second) level. In such cases, the
 \code{by}-argument can identify two or more variables that represent the
 cross-classified group- or cluster-IDs. The de-meaned variables for
 cross-classified designs are simply subtracting all group means from each
 individual value, i.e. \emph{fully cluster-mean-centering} (see \emph{Guo et al. 2024}
 for details). Note that de-meaning for cross-classified designs is \emph{not}
 equivalent to de-meaning of nested data structures from models with three or
-more levels. Set \code{nested = TRUE} to explicitly assume a nested design. for
+more levels. Set \code{nested = TRUE} to explicitly assume a nested design. For
 cross-classified designs, de-meaning is supposed to work for models like
 \code{y ~ x + (1|level3) + (1|level2)}, but \emph{not} for models like
 \code{y ~ x + (1|level3/level2)}.
 }
 
+\section{De-meaning for nested designs}{
+
+
+\emph{Brincks et al. (2017)} have suggested an algorithm to center variables for
+nested designs, which is implemented in \code{demean()}. For nested designs,
+set \code{nested = TRUE} \emph{and} specify the variables that indicate the different
+levels in descending order in the \code{by} argument. E.g.,
+\code{by = c("level4", "level3", "level2")} assumes a model like
+\code{y ~ x + (1|level4/level3/level2)}. An alternative notation for the
+\code{by}-argument would be \code{by = c("level4/level3/level2")}, similar to the
+formula notation.
+}
+
 \section{Analysing panel data with mixed models using lme4}{
 
 
@@ -302,6 +319,10 @@ Models: Making an Informed Choice. Quality & Quantity (53); 1051-1074
 \item Bell A, Jones K. 2015. Explaining Fixed Effects: Random Effects
 Modeling of Time-Series Cross-Sectional and Panel Data. Political Science
 Research and Methods, 3(1), 133–153.
+\item Brincks, A. M., Enders, C. K., Llabre, M. M., Bulotsky-Shearer, R. J.,
+Prado, G., and Feaster, D. J. (2017). Centering Predictor Variables in
+Three-Level Contextual Models. Multivariate Behavioral Research, 52(2),
+149–163. https://doi.org/10.1080/00273171.2016.1256753
 \item Gelman A, Hill J. 2007. Data Analysis Using Regression and
 Multilevel/Hierarchical Models. Analytical Methods for Social Research.
 Cambridge, New York: Cambridge University Press
diff --git a/tests/testthat/test-demean.R b/tests/testthat/test-demean.R
index 46f174a9a..6e169f9c0 100644
--- a/tests/testthat/test-demean.R
+++ b/tests/testthat/test-demean.R
@@ -185,3 +185,46 @@ test_that("demean, sanity checks", {
     regex = "Variables \"neg_c_8\" and \"c173code\" were not found"
   )
 })
+
+
+test_that("demean for nested designs (by > 1), nested = TRUE", {
+  data(efc, package = "datawizard")
+  dat <- na.omit(efc) # keep complete cases only
+  dat$e42dep <- factor(dat$e42dep)
+  dat$c172code <- factor(dat$c172code)
+
+  x_ijk <- dat$c12hour # raw predictor values
+  xbar_k <- ave(x_ijk, dat$e42dep, FUN = mean) # mean per e42dep group
+  xbar_jk <- ave(x_ijk, dat$e42dep, dat$c172code, FUN = mean) # cell means
+
+  L3_between <- xbar_k # expected between-part for the first `by` variable
+  L2_between <- xbar_jk - xbar_k # cell mean relative to its e42dep mean
+  L1_within <- x_ijk - xbar_jk # observation relative to its cell mean
+
+  out <- degroup(
+    dat,
+    select = "c12hour",
+    by = c("e42dep", "c172code"), # ordered from highest to lowest level
+    nested = TRUE,
+    suffix_demean = "_within"
+  )
+
+  expect_equal(
+    out$c12hour_within,
+    L1_within,
+    tolerance = 1e-4,
+    ignore_attr = TRUE
+  )
+  expect_equal(
+    out$c12hour_e42dep_between,
+    L3_between,
+    tolerance = 1e-4,
+    ignore_attr = TRUE
+  )
+  expect_equal(
+    out$c12hour_c172code_between,
+    L2_between,
+    tolerance = 1e-4,
+    ignore_attr = TRUE
+  )
+})