diff --git a/DESCRIPTION b/DESCRIPTION index a1b4a88e..e7072383 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: serocalculator Type: Package Title: Estimating Infection Rates from Serological Data -Version: 1.2.0 +Version: 1.2.0.9000 Authors@R: c( person(given = "Peter", family = "Teunis", email = "p.teunis@emory.edu", role = c("aut", "cph"), comment = "Author of the method and original code."), person(given = "Kristina", family = "Lai", email = "kwlai@ucdavis.edu", role = c("aut", "cre")), diff --git a/NEWS.md b/NEWS.md index 61d31397..41549599 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,5 @@ +# serocalculator (development version) + # serocalculator 1.2.0 * Added `test-summary.pop_data` test diff --git a/R/curve_param_names.R b/R/curve_param_names.R new file mode 100644 index 00000000..dcb1034f --- /dev/null +++ b/R/curve_param_names.R @@ -0,0 +1 @@ +curve_param_names = c("y1", "alpha", "r", "antigen_iso") diff --git a/R/est.incidence.R b/R/est.incidence.R index 3ebeec1f..edcb834a 100644 --- a/R/est.incidence.R +++ b/R/est.incidence.R @@ -3,7 +3,7 @@ #' This function models seroincidence using maximum likelihood estimation; that is, it finds the value of the seroincidence parameter which maximizes the likelihood (i.e., joint probability) of the data. #' @inheritParams log_likelihood #' @inheritParams stats::nlm -#' @param pop_data [data.frame()] with cross-sectional serology data per antibody and age, and additional columns +#' @param pop_data a [data.frame] with cross-sectional serology data per antibody and age, and additional columns #' @param lambda_start starting guess for incidence rate, in years/event. #' @param antigen_isos Character vector with one or more antibody names. Values must match `pop_data` #' @param build_graph whether to graph the log-likelihood function across a range of incidence rates (lambda values) @@ -29,7 +29,8 @@ #' pop_data = xs_data %>% filter(Country == "Pakistan"), #' curve_params = curves, #' noise_params = noise %>% filter(Country == "Pakistan"), -#' antigen_isos = c("HlyE_IgG", "HlyE_IgA") +#' antigen_isos = c("HlyE_IgG", "HlyE_IgA"), +#' iterlim = 5 # limit iterations for the purpose of this example #' ) #' #' summary(est1) diff --git a/R/est.incidence.by.R b/R/est.incidence.by.R index 216021d4..6395f5b3 100644 --- a/R/est.incidence.by.R +++ b/R/est.incidence.by.R @@ -4,15 +4,20 @@ #' Function to estimate seroincidences based on cross-section serology data and longitudinal #' response model. #' -#' @param pop_data [data.frame()] with cross-sectional serology data per antibody and age, and additional columns to identify possible `strata`. -#' @param strata [character()] vector of stratum-defining variables. Values must be variable names in `pop_data`. +#' @param pop_data a [data.frame] with cross-sectional serology data per antibody and age, and additional columns corresponding to each element of the `strata` input +#' @param strata a [character] vector of stratum-defining variables. Values must be variable names in `pop_data`. #' @param curve_strata_varnames A subset of `strata`. Values must be variable names in `curve_params`. Default = "". #' @param noise_strata_varnames A subset of `strata`. Values must be variable names in `noise_params`. Default = "". #' @param num_cores Number of processor cores to use for calculations when computing by strata. If set to more than 1 and package \pkg{parallel} is available, then the computations are executed in parallel. Default = 1L. #' @details #' -#' If `strata` is left empty, a warning will be produced, recommending that you use `est.incidence()` for unstratified analyses, and then the data will be passed to `est.incidence()`. If for some reason you want to use `est.incidence.by()` with no strata instead of calling `est.incidence()`, you may use `NA`, `NULL`, or "" as the `strata` argument to avoid that warning. +#' If `strata` is left empty, a warning will be produced, +#' recommending that you use [est.incidence()] for unstratified analyses, +#' and then the data will be passed to [est.incidence()]. +#' If for some reason you want to use [est.incidence.by()] +#' with no strata instead of calling [est.incidence()], +#' you may use `NA`, `NULL`, or `""` as the `strata` argument to avoid that warning. #' #' #' @inheritParams est.incidence @@ -45,6 +50,7 @@ #' noise_params = noise %>% filter(Country == "Pakistan"), #' antigen_isos = c("HlyE_IgG", "HlyE_IgA"), #' #num_cores = 8 # Allow for parallel processing to decrease run time +#' iterlim = 5 # limit iterations for the purpose of this example #' ) #' #' summary(est2) diff --git a/R/noise_param_names.R b/R/noise_param_names.R new file mode 100644 index 00000000..4bc5503d --- /dev/null +++ b/R/noise_param_names.R @@ -0,0 +1 @@ +noise_param_names = c("nu", "eps", "y.low", "y.high", "antigen_iso") diff --git a/R/stratify_data.R b/R/stratify_data.R index 6865bbc8..3681d794 100644 --- a/R/stratify_data.R +++ b/R/stratify_data.R @@ -1,37 +1,73 @@ -stratify_data <- function( - data, - antigen_isos, - curve_params, - noise_params, - strata_varnames = "", - curve_strata_varnames = NULL, - noise_strata_varnames = NULL) { - if (is.null(strata_varnames) || all(strata_varnames == "")) { +#' @title Split data by stratum +#' @description Split biomarker data, decay curve parameters, and noise parameters +#' to prepare for stratified incidence estimation. +#' @param strata_varnames [character()] vector of names of variables in `data` to stratify by +#' @inheritParams est.incidence.by +#' +#' @returns a `"biomarker_data_and_params.list"` object (a [list] with extra attributes `"strata"`, `"antigen_isos"`, etc) +#' @keywords internal +#' @examples +#' \dontrun{ +#' library(dplyr) +#' +#' xs_data <- load_pop_data("https://osf.io/download//n6cp3/") +#' +#' curve <- load_curve_params("https://osf.io/download/rtw5k/") %>% +#' filter(antigen_iso %in% c("HlyE_IgA", "HlyE_IgG")) %>% +#' slice(1:100, .by = antigen_iso) # Reduce dataset for the purposes of this example +#' +#' noise <- load_noise_params("https://osf.io/download//hqy4v/") +#' +#' stratified_data = +#' stratify_data( +#' data = xs_data, +#' curve_params = curve, +#' noise_params = noise, +#' strata_varnames = "catchment", +#' curve_strata_varnames = NULL, +#' noise_strata_varnames = NULL +#' ) +#' } +stratify_data <- function(data, + curve_params, + noise_params, + strata_varnames = "", + curve_strata_varnames = NULL, + noise_strata_varnames = NULL, + antigen_isos = data %>% attr("antigen_isos")) { + no_strata = is.null(strata_varnames) || all(strata_varnames == "") + + curve_params = + curve_params %>% + filter(.data[["antigen_iso"]] %in% antigen_isos) + + noise_params = + noise_params %>% + filter(.data[["antigen_iso"]] %in% antigen_isos) + + if (no_strata) { all_data <- list( - pop_data = data %>% select( - attributes(data)$value_var, - attributes(data)$age_var, - "antigen_iso" - ), - curve_params = curve_params %>% select("y1", "alpha", "r", "antigen_iso"), - noise_params = noise_params %>% select("nu", "eps", "y.low", "y.high", "antigen_iso") + pop_data = data %>% select(all_of( + c( + data %>% get_value_var(), + data %>% get_age_var(), + "antigen_iso" + ) + )), + curve_params = + curve_params %>% + select(all_of(curve_param_names)), + noise_params = + noise_params %>% + select(all_of(noise_param_names)) ) %>% - structure( - class = union( - "biomarker_data_and_params", - "list" - ) - ) + structure(class = union("biomarker_data_and_params", "list")) stratumDataList <- - list( # est.incidence.by() expects a list. - `all data` = all_data - ) %>% - structure( - antigen_isos = antigen_isos, - strata = tibble(Stratum = NA) - ) + list(# est.incidence.by() expects a list. + `all data` = all_data) %>% + structure(antigen_isos = antigen_isos, strata = tibble(Stratum = NA)) return(stratumDataList) @@ -44,14 +80,14 @@ stratify_data <- function( strata_vars_curve_params <- warn.missing.strata( data = curve_params, - strata = strata %>% select(curve_strata_varnames), + strata = strata %>% select(all_of(curve_strata_varnames)), dataname = "curve_params" ) strata_vars_noise_params <- warn.missing.strata( data = noise_params, - strata = strata %>% select(noise_strata_varnames), + strata = strata %>% select(all_of(noise_strata_varnames)), dataname = "noise_params" ) @@ -71,62 +107,51 @@ stratify_data <- function( list( pop_data = data %>% - semi_join( - cur_stratum_vals, - by = strata_varnames - ) %>% - select( - attributes(data)$value_var, - attributes(data)$age_var, + semi_join(cur_stratum_vals, by = strata_varnames) %>% + select(all_of( + c( + data %>% get_value_var(), + data %>% get_age_var(), "antigen_iso" ) + )) ) if (length(strata_vars_curve_params) == 0) { data_and_params_cur_stratum$curve_params <- - curve_params %>% select("y1", "alpha", "r", "antigen_iso") + curve_params %>% select(all_of(curve_param_names)) } else { data_and_params_cur_stratum$curve_params <- curve_params %>% - semi_join( - cur_stratum_vals, - by = strata_vars_curve_params - ) %>% - select("y1", "alpha", "r", "antigen_iso") + semi_join(cur_stratum_vals, by = strata_vars_curve_params) %>% + select(all_of(curve_param_names)) } if (length(strata_vars_noise_params) == 0) { data_and_params_cur_stratum$noise_params <- noise_params %>% - select("nu", "eps", "y.low", "y.high", "antigen_iso") + select(all_of(noise_param_names)) } else { data_and_params_cur_stratum$noise_params <- noise_params %>% - semi_join( - cur_stratum_vals, - by = strata_vars_noise_params - ) %>% - select("nu", "eps", "y.low", "y.high", "antigen_iso") + semi_join(cur_stratum_vals, by = strata_vars_noise_params) %>% + select(all_of(noise_param_names)) } stratumDataList[[cur_stratum]] <- data_and_params_cur_stratum %>% - structure( - class = union( - "biomarker_data_and_params", - class(data_and_params_cur_stratum) - ) - ) + structure(class = union( + "biomarker_data_and_params", + class(data_and_params_cur_stratum) + )) } - return( - structure( - stratumDataList, - antigen_isos = antigen_isos, - strata = strata, - class = c("biomarker_data_and_params.list", "list") - ) - ) + return(structure( + stratumDataList, + antigen_isos = antigen_isos, + strata = strata, + class = c("biomarker_data_and_params.list", "list") + )) } diff --git a/inst/WORDLIST b/inst/WORDLIST index 11befee7..aee74866 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -86,6 +86,7 @@ Wiens Zota al behaviour +biomarker campylobacteriosis de der diff --git a/man/est.incidence.Rd b/man/est.incidence.Rd index a8859c19..58b19797 100644 --- a/man/est.incidence.Rd +++ b/man/est.incidence.Rd @@ -19,7 +19,7 @@ est.incidence( ) } \arguments{ -\item{pop_data}{\code{\link[=data.frame]{data.frame()}} with cross-sectional serology data per antibody and age, and additional columns} +\item{pop_data}{a \link{data.frame} with cross-sectional serology data per antibody and age, and additional columns} \item{curve_params}{a \code{\link[=data.frame]{data.frame()}} containing MCMC samples of parameters from the Bayesian posterior distribution of a longitudinal decay curve model. The parameter columns must be named: \itemize{ @@ -104,7 +104,8 @@ est1 <- est.incidence( pop_data = xs_data \%>\% filter(Country == "Pakistan"), curve_params = curves, noise_params = noise \%>\% filter(Country == "Pakistan"), - antigen_isos = c("HlyE_IgG", "HlyE_IgA") + antigen_isos = c("HlyE_IgG", "HlyE_IgA"), + iterlim = 5 # limit iterations for the purpose of this example ) summary(est1) diff --git a/man/est.incidence.by.Rd b/man/est.incidence.by.Rd index 89f93d47..123a4b16 100644 --- a/man/est.incidence.by.Rd +++ b/man/est.incidence.by.Rd @@ -21,7 +21,7 @@ est.incidence.by( ) } \arguments{ -\item{pop_data}{\code{\link[=data.frame]{data.frame()}} with cross-sectional serology data per antibody and age, and additional columns to identify possible \code{strata}.} +\item{pop_data}{a \link{data.frame} with cross-sectional serology data per antibody and age, and additional columns corresponding to each element of the \code{strata} input} \item{curve_params}{a \code{\link[=data.frame]{data.frame()}} containing MCMC samples of parameters from the Bayesian posterior distribution of a longitudinal decay curve model. The parameter columns must be named: \itemize{ @@ -43,7 +43,7 @@ est.incidence.by( \item \code{y.high}: upper limit of detection for the current antigen isotype }} -\item{strata}{\code{\link[=character]{character()}} vector of stratum-defining variables. Values must be variable names in \code{pop_data}.} +\item{strata}{a \link{character} vector of stratum-defining variables. Values must be variable names in \code{pop_data}.} \item{curve_strata_varnames}{A subset of \code{strata}. Values must be variable names in \code{curve_params}. Default = "".} @@ -103,7 +103,12 @@ Function to estimate seroincidences based on cross-section serology data and lon response model. } \details{ -If \code{strata} is left empty, a warning will be produced, recommending that you use \code{est.incidence()} for unstratified analyses, and then the data will be passed to \code{est.incidence()}. If for some reason you want to use \code{est.incidence.by()} with no strata instead of calling \code{est.incidence()}, you may use \code{NA}, \code{NULL}, or "" as the \code{strata} argument to avoid that warning. +If \code{strata} is left empty, a warning will be produced, +recommending that you use \code{\link[=est.incidence]{est.incidence()}} for unstratified analyses, +and then the data will be passed to \code{\link[=est.incidence]{est.incidence()}}. +If for some reason you want to use \code{\link[=est.incidence.by]{est.incidence.by()}} +with no strata instead of calling \code{\link[=est.incidence]{est.incidence()}}, +you may use \code{NA}, \code{NULL}, or \code{""} as the \code{strata} argument to avoid that warning. } \examples{ @@ -124,6 +129,7 @@ est2 <- est.incidence.by( noise_params = noise \%>\% filter(Country == "Pakistan"), antigen_isos = c("HlyE_IgG", "HlyE_IgA"), #num_cores = 8 # Allow for parallel processing to decrease run time + iterlim = 5 # limit iterations for the purpose of this example ) summary(est2) diff --git a/man/stratify_data.Rd b/man/stratify_data.Rd new file mode 100644 index 00000000..7dacbcbb --- /dev/null +++ b/man/stratify_data.Rd @@ -0,0 +1,76 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/stratify_data.R +\name{stratify_data} +\alias{stratify_data} +\title{Split data by stratum} +\usage{ +stratify_data( + data, + curve_params, + noise_params, + strata_varnames = "", + curve_strata_varnames = NULL, + noise_strata_varnames = NULL, + antigen_isos = data \%>\% attr("antigen_isos") +) +} +\arguments{ +\item{curve_params}{a \code{\link[=data.frame]{data.frame()}} containing MCMC samples of parameters from the Bayesian posterior distribution of a longitudinal decay curve model. The parameter columns must be named: +\itemize{ +\item \code{antigen_iso}: a \code{\link[=character]{character()}} vector indicating antigen-isotype combinations +\item \code{iter}: an \code{\link[=integer]{integer()}} vector indicating MCMC sampling iterations +\item \code{y0}: baseline antibody level at $t=0$ ($y(t=0)$) +\item \code{y1}: antibody peak level (ELISA units) +\item \code{t1}: duration of infection +\item \code{alpha}: antibody decay rate (1/days for the current longitudinal parameter sets) +\item \code{r}: shape factor of antibody decay +}} + +\item{noise_params}{a \code{\link[=data.frame]{data.frame()}} (or \code{\link[tibble:tibble]{tibble::tibble()}}) containing the following variables, specifying noise parameters for each antigen isotype: +\itemize{ +\item \code{antigen_iso}: antigen isotype whose noise parameters are being specified on each row +\item \code{nu}: biological noise +\item \code{eps}: measurement noise +\item \code{y.low}: lower limit of detection for the current antigen isotype +\item \code{y.high}: upper limit of detection for the current antigen isotype +}} + +\item{strata_varnames}{\code{\link[=character]{character()}} vector of names of variables in \code{data} to stratify by} + +\item{curve_strata_varnames}{A subset of \code{strata}. Values must be variable names in \code{curve_params}. Default = "".} + +\item{noise_strata_varnames}{A subset of \code{strata}. Values must be variable names in \code{noise_params}. Default = "".} + +\item{antigen_isos}{Character vector with one or more antibody names. Values must match \code{pop_data}} +} +\value{ +a \code{"biomarker_data_and_params.list"} object (a \link{list} with extra attributes \code{"strata"}, \code{"antigen_isos"}, etc) +} +\description{ +Split biomarker data, decay curve parameters, and noise parameters +to prepare for stratified incidence estimation. +} +\examples{ +\dontrun{ +library(dplyr) + +xs_data <- load_pop_data("https://osf.io/download//n6cp3/") + +curve <- load_curve_params("https://osf.io/download/rtw5k/") \%>\% + filter(antigen_iso \%in\% c("HlyE_IgA", "HlyE_IgG")) \%>\% + slice(1:100, .by = antigen_iso) # Reduce dataset for the purposes of this example + +noise <- load_noise_params("https://osf.io/download//hqy4v/") + +stratified_data = + stratify_data( + data = xs_data, + curve_params = curve, + noise_params = noise, + strata_varnames = "catchment", + curve_strata_varnames = NULL, + noise_strata_varnames = NULL + ) +} +} +\keyword{internal} diff --git a/tests/testthat/_snaps/stratify_data.md b/tests/testthat/_snaps/stratify_data.md new file mode 100644 index 00000000..82e6d68b --- /dev/null +++ b/tests/testthat/_snaps/stratify_data.md @@ -0,0 +1,103 @@ +# stratify_data() produces consistent results + + Code + stratified_data + Output + $`Stratum 1` + $pop_data + # A tibble: 588 x 3 + value age antigen_iso + + 1 5.69 18 HlyE_IgA + 2 4.21 18 HlyE_IgG + 3 1.23 7.3 HlyE_IgA + 4 3.00 7.3 HlyE_IgG + 5 1.08 2.6 HlyE_IgA + 6 0.217 2.6 HlyE_IgG + 7 1.43 3.9 HlyE_IgA + 8 0.956 3.9 HlyE_IgG + 9 3.06 13 HlyE_IgA + 10 9.57 13 HlyE_IgG + # i 578 more rows + + $curve_params + # A tibble: 200 x 4 + y1 alpha r antigen_iso + + 1 63.5 0.000581 1.75 HlyE_IgA + 2 288. 0.000459 2.66 HlyE_IgA + 3 432. 0.000277 1.61 HlyE_IgA + 4 30.6 0.00127 1.87 HlyE_IgA + 5 160. 0.00140 1.40 HlyE_IgA + 6 525. 0.000294 2.26 HlyE_IgA + 7 30.8 0.00459 1.55 HlyE_IgA + 8 41.3 0.00234 1.48 HlyE_IgA + 9 248. 0.0000467 8.50 HlyE_IgA + 10 319. 0.000448 1.75 HlyE_IgA + # i 190 more rows + + $noise_params + # A tibble: 2 x 5 + nu eps y.low y.high antigen_iso + + 1 2.60 0.279 0.508 5000000 HlyE_IgA + 2 2.36 0.146 1.59 5000000 HlyE_IgG + + attr(,"class") + [1] "biomarker_data_and_params" "list" + + $`Stratum 2` + $pop_data + # A tibble: 400 x 3 + value age antigen_iso + + 1 0.568 13.2 HlyE_IgA + 2 2.15 13.2 HlyE_IgG + 3 0.779 11 HlyE_IgA + 4 1.89 11 HlyE_IgG + 5 1.90 12 HlyE_IgA + 6 8.09 12 HlyE_IgG + 7 1.41 16 HlyE_IgA + 8 2.37 16 HlyE_IgG + 9 7.12 7.6 HlyE_IgA + 10 11.6 7.6 HlyE_IgG + # i 390 more rows + + $curve_params + # A tibble: 200 x 4 + y1 alpha r antigen_iso + + 1 63.5 0.000581 1.75 HlyE_IgA + 2 288. 0.000459 2.66 HlyE_IgA + 3 432. 0.000277 1.61 HlyE_IgA + 4 30.6 0.00127 1.87 HlyE_IgA + 5 160. 0.00140 1.40 HlyE_IgA + 6 525. 0.000294 2.26 HlyE_IgA + 7 30.8 0.00459 1.55 HlyE_IgA + 8 41.3 0.00234 1.48 HlyE_IgA + 9 248. 0.0000467 8.50 HlyE_IgA + 10 319. 0.000448 1.75 HlyE_IgA + # i 190 more rows + + $noise_params + # A tibble: 2 x 5 + nu eps y.low y.high antigen_iso + + 1 2.60 0.279 0.508 5000000 HlyE_IgA + 2 2.36 0.146 1.59 5000000 HlyE_IgG + + attr(,"class") + [1] "biomarker_data_and_params" "list" + + attr(,"antigen_isos") + [1] HlyE_IgA HlyE_IgG + Levels: HlyE_IgA HlyE_IgG + attr(,"strata") + # A tibble: 2 x 3 + Stratum catchment n + + 1 Stratum 1 aku 294 + 2 Stratum 2 kgh 200 + attr(,"class") + [1] "biomarker_data_and_params.list" "list" + diff --git a/tests/testthat/test-stratify_data.R b/tests/testthat/test-stratify_data.R new file mode 100644 index 00000000..546a623b --- /dev/null +++ b/tests/testthat/test-stratify_data.R @@ -0,0 +1,61 @@ +test_that("stratify_data() produces consistent results", { + + library(dplyr) + library(readr) + + xs_data <- + read_rds("https://osf.io/download//n6cp3/") %>% + as_pop_data() %>% + filter(Country == "Pakistan") + + curve <- + load_curve_params("https://osf.io/download/rtw5k/") %>% + filter(antigen_iso %in% c("HlyE_IgA", "HlyE_IgG")) %>% + slice(1:100, .by = antigen_iso) # Reduce dataset for the purposes of this example + + noise <- + load_noise_params("https://osf.io/download//hqy4v/") %>% + filter(Country == "Pakistan") + + stratified_data = + stratify_data( + data = xs_data, + curve_params = curve, + noise_params = noise, + strata_varnames = "catchment", + curve_strata_varnames = NULL, + noise_strata_varnames = NULL + ) + + expect_snapshot(stratified_data) +}) + +test_that("stratify_data() warns about missing data", { + + library(dplyr) + library(readr) + + xs_data <- + read_rds("https://osf.io/download//n6cp3/") %>% + as_pop_data() %>% + filter(Country == "Nepal") + + curve <- + load_curve_params("https://osf.io/download/rtw5k/") %>% + filter(antigen_iso %in% c("HlyE_IgA", "HlyE_IgG")) %>% + slice(1:100, .by = antigen_iso) # Reduce dataset for the purposes of this example + + noise <- + load_noise_params("https://osf.io/download//hqy4v/") %>% + filter(Country == "Nepal") + + stratify_data( + data = xs_data, + curve_params = curve, + noise_params = noise, + strata_varnames = "catchment", + curve_strata_varnames = NULL, + noise_strata_varnames = NULL + ) |> + expect_warning(regexp = "The number of observations in `data` varies") +})