Merge pull request #176 from UCD-SERG/test-stratify-data

Test stratify data
UCD-SERG · Jul 17, 2024 · 4e04563 · 4e04563
2 parents ca99c04 + fdda4c2
commit 4e04563
Show file tree

Hide file tree

Showing 13 changed files with 359 additions and 75 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: serocalculator
 Type: Package
 Title: Estimating Infection Rates from Serological Data
-Version: 1.2.0
+Version: 1.2.0.9000
 Authors@R: c(
   person(given = "Peter", family = "Teunis", email = "[email protected]", role = c("aut", "cph"), comment = "Author of the method and original code."),
   person(given = "Kristina", family = "Lai", email = "[email protected]", role = c("aut", "cre")), 

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,5 @@
+# serocalculator (development version)
+
 # serocalculator 1.2.0
 * Added `test-summary.pop_data` test
 

diff --git a/R/curve_param_names.R b/R/curve_param_names.R
@@ -0,0 +1 @@
+curve_param_names = c("y1", "alpha", "r", "antigen_iso")
diff --git a/R/est.incidence.R b/R/est.incidence.R
@@ -3,7 +3,7 @@
 #' This function models seroincidence using maximum likelihood estimation; that is, it finds the value of the seroincidence parameter which maximizes the likelihood (i.e., joint probability) of the data.
 #' @inheritParams log_likelihood
 #' @inheritParams stats::nlm
-#' @param pop_data [data.frame()] with cross-sectional serology data per antibody and age, and additional columns
+#' @param pop_data a [data.frame] with cross-sectional serology data per antibody and age, and additional columns
 #' @param lambda_start starting guess for incidence rate, in years/event.
 #' @param antigen_isos Character vector with one or more antibody names. Values must match `pop_data`
 #' @param build_graph whether to graph the log-likelihood function across a range of incidence rates (lambda values)
@@ -29,7 +29,8 @@
 #'   pop_data = xs_data %>% filter(Country == "Pakistan"),
 #'   curve_params = curves,
 #'   noise_params = noise %>% filter(Country == "Pakistan"),
-#'   antigen_isos = c("HlyE_IgG", "HlyE_IgA")
+#'   antigen_isos = c("HlyE_IgG", "HlyE_IgA"),
+#'   iterlim = 5 # limit iterations for the purpose of this example
 #' )
 #'
 #' summary(est1)

diff --git a/R/est.incidence.by.R b/R/est.incidence.by.R
@@ -4,15 +4,20 @@
 #' Function to estimate seroincidences based on cross-section serology data and longitudinal
 #' response model.
 #'
-#' @param pop_data [data.frame()] with cross-sectional serology data per antibody and age, and additional columns to identify possible `strata`.
-#' @param strata [character()] vector of stratum-defining variables. Values must be variable names in `pop_data`.
+#' @param pop_data a [data.frame] with cross-sectional serology data per antibody and age, and additional columns corresponding to each element of the `strata` input
+#' @param strata a [character] vector of stratum-defining variables. Values must be variable names in `pop_data`.
 #' @param curve_strata_varnames A subset of `strata`. Values must be variable names in `curve_params`. Default = "".
 #' @param noise_strata_varnames A subset of `strata`. Values must be variable names in `noise_params`. Default = "".
 #' @param num_cores Number of processor cores to use for calculations when computing by strata. If set to more than 1 and package \pkg{parallel} is available, then the computations are executed in parallel. Default = 1L.
 
 #' @details
 #'
-#' If `strata` is left empty, a warning will be produced, recommending that you use `est.incidence()` for unstratified analyses, and then the data will be passed to `est.incidence()`. If for some reason you want to use `est.incidence.by()` with no strata instead of calling `est.incidence()`, you may use `NA`, `NULL`, or "" as the `strata` argument to avoid that warning.
+#' If `strata` is left empty, a warning will be produced,
+#' recommending that you use [est.incidence()] for unstratified analyses,
+#' and then the data will be passed to [est.incidence()].
+#' If for some reason you want to use [est.incidence.by()]
+#' with no strata instead of calling [est.incidence()],
+#' you may use `NA`, `NULL`, or `""` as the `strata` argument to avoid that warning.
 #'
 #'
 #' @inheritParams est.incidence
@@ -45,6 +50,7 @@
 #'   noise_params = noise %>% filter(Country == "Pakistan"),
 #'   antigen_isos = c("HlyE_IgG", "HlyE_IgA"),
 #'   #num_cores = 8 # Allow for parallel processing to decrease run time
+#'   iterlim = 5 # limit iterations for the purpose of this example
 #' )
 #'
 #' summary(est2)

diff --git a/R/noise_param_names.R b/R/noise_param_names.R
@@ -0,0 +1 @@
+noise_param_names = c("nu", "eps", "y.low", "y.high", "antigen_iso")
diff --git a/R/stratify_data.R b/R/stratify_data.R
@@ -1,37 +1,73 @@
-stratify_data <- function(
-    data,
-    antigen_isos,
-    curve_params,
-    noise_params,
-    strata_varnames = "",
-    curve_strata_varnames = NULL,
-    noise_strata_varnames = NULL) {
-  if (is.null(strata_varnames) || all(strata_varnames == "")) {
+#' @title Split data by stratum
+#' @description Split biomarker data, decay curve parameters, and noise parameters
+#' to prepare for stratified incidence estimation.
+#' @param strata_varnames [character()] vector of names of variables in `data` to stratify by
+#' @inheritParams est.incidence.by
+#'
+#' @returns a `"biomarker_data_and_params.list"` object (a [list] with extra attributes `"strata"`, `"antigen_isos"`, etc)
+#' @keywords internal
+#' @examples
+#' \dontrun{
+#' library(dplyr)
+#'
+#' xs_data <- load_pop_data("https://osf.io/download//n6cp3/")
+#'
+#' curve <- load_curve_params("https://osf.io/download/rtw5k/") %>%
+#'   filter(antigen_iso %in% c("HlyE_IgA", "HlyE_IgG")) %>%
+#'   slice(1:100, .by = antigen_iso) # Reduce dataset for the purposes of this example
+#'
+#' noise <- load_noise_params("https://osf.io/download//hqy4v/")
+#'
+#' stratified_data =
+#'   stratify_data(
+#'    data = xs_data,
+#'    curve_params = curve,
+#'    noise_params = noise,
+#'    strata_varnames = "catchment",
+#'    curve_strata_varnames = NULL,
+#'    noise_strata_varnames = NULL
+#'    )
+#' }
+stratify_data <- function(data,
+                          curve_params,
+                          noise_params,
+                          strata_varnames = "",
+                          curve_strata_varnames = NULL,
+                          noise_strata_varnames = NULL,
+                          antigen_isos = data %>% attr("antigen_isos")) {
+  no_strata = is.null(strata_varnames) || all(strata_varnames == "")
+
+  curve_params =
+    curve_params %>%
+    filter(.data[["antigen_iso"]] %in% antigen_isos)
+
+  noise_params =
+    noise_params %>%
+    filter(.data[["antigen_iso"]] %in% antigen_isos)
+
+  if (no_strata) {
     all_data <-
       list(
-        pop_data = data %>% select(
-          attributes(data)$value_var,
-          attributes(data)$age_var,
-          "antigen_iso"
-        ),
-        curve_params = curve_params %>% select("y1", "alpha", "r", "antigen_iso"),
-        noise_params = noise_params %>% select("nu", "eps", "y.low", "y.high", "antigen_iso")
+        pop_data = data %>% select(all_of(
+          c(
+            data %>% get_value_var(),
+            data %>% get_age_var(),
+            "antigen_iso"
+          )
+        )),
+        curve_params =
+          curve_params %>%
+          select(all_of(curve_param_names)),
+        noise_params =
+          noise_params %>%
+          select(all_of(noise_param_names))
       ) %>%
-      structure(
-        class = union(
-          "biomarker_data_and_params",
-          "list"
-        )
-      )
+      structure(class = union("biomarker_data_and_params", "list"))
 
     stratumDataList <-
-      list( # est.incidence.by() expects a list.
-        `all data` = all_data
-      ) %>%
-      structure(
-        antigen_isos = antigen_isos,
-        strata = tibble(Stratum = NA)
-      )
+      list(# est.incidence.by() expects a list.
+        `all data` = all_data) %>%
+      structure(antigen_isos = antigen_isos, strata = tibble(Stratum = NA))
 
 
     return(stratumDataList)
@@ -44,14 +80,14 @@ stratify_data <- function(
   strata_vars_curve_params <-
     warn.missing.strata(
       data = curve_params,
-      strata = strata %>% select(curve_strata_varnames),
+      strata = strata %>% select(all_of(curve_strata_varnames)),
       dataname = "curve_params"
     )
 
   strata_vars_noise_params <-
     warn.missing.strata(
       data = noise_params,
-      strata = strata %>% select(noise_strata_varnames),
+      strata = strata %>% select(all_of(noise_strata_varnames)),
       dataname = "noise_params"
     )
 
@@ -71,62 +107,51 @@ stratify_data <- function(
       list(
         pop_data =
           data %>%
-            semi_join(
-              cur_stratum_vals,
-              by = strata_varnames
-            ) %>%
-            select(
-              attributes(data)$value_var,
-              attributes(data)$age_var,
+          semi_join(cur_stratum_vals, by = strata_varnames) %>%
+          select(all_of(
+            c(
+              data %>% get_value_var(),
+              data %>% get_age_var(),
               "antigen_iso"
             )
+          ))
       )
 
     if (length(strata_vars_curve_params) == 0) {
       data_and_params_cur_stratum$curve_params <-
-        curve_params %>% select("y1", "alpha", "r", "antigen_iso")
+        curve_params %>% select(all_of(curve_param_names))
     } else {
       data_and_params_cur_stratum$curve_params <-
         curve_params %>%
-        semi_join(
-          cur_stratum_vals,
-          by = strata_vars_curve_params
-        ) %>%
-        select("y1", "alpha", "r", "antigen_iso")
+        semi_join(cur_stratum_vals, by = strata_vars_curve_params) %>%
+        select(all_of(curve_param_names))
     }
 
     if (length(strata_vars_noise_params) == 0) {
       data_and_params_cur_stratum$noise_params <-
         noise_params %>%
-        select("nu", "eps", "y.low", "y.high", "antigen_iso")
+        select(all_of(noise_param_names))
     } else {
       data_and_params_cur_stratum$noise_params <-
         noise_params %>%
-        semi_join(
-          cur_stratum_vals,
-          by = strata_vars_noise_params
-        ) %>%
-        select("nu", "eps", "y.low", "y.high", "antigen_iso")
+        semi_join(cur_stratum_vals, by = strata_vars_noise_params) %>%
+        select(all_of(noise_param_names))
     }
 
     stratumDataList[[cur_stratum]] <-
       data_and_params_cur_stratum %>%
-      structure(
-        class = union(
-          "biomarker_data_and_params",
-          class(data_and_params_cur_stratum)
-        )
-      )
+      structure(class = union(
+        "biomarker_data_and_params",
+        class(data_and_params_cur_stratum)
+      ))
   }
 
 
 
-  return(
-    structure(
-      stratumDataList,
-      antigen_isos = antigen_isos,
-      strata = strata,
-      class = c("biomarker_data_and_params.list", "list")
-    )
-  )
+  return(structure(
+    stratumDataList,
+    antigen_isos = antigen_isos,
+    strata = strata,
+    class = c("biomarker_data_and_params.list", "list")
+  ))
 }
diff --git a/inst/WORDLIST b/inst/WORDLIST
@@ -86,6 +86,7 @@ Wiens
 Zota
 al
 behaviour
+biomarker
 campylobacteriosis
 de
 der

diff --git a/man/est.incidence.Rd b/man/est.incidence.Rd
diff --git a/man/est.incidence.by.Rd b/man/est.incidence.by.Rd