diff --git a/.Rbuildignore b/.Rbuildignore index 0582014a..cb0b7ed2 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -16,3 +16,5 @@ ^.lintr$ ^DEVELOPMENT.md$ man-roxygen +^.venv$ +^sandbox.R$ \ No newline at end of file diff --git a/.gitignore b/.gitignore index de393a31..8dc001be 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ docs renv/ renv.lock .Rprofile +sandbox.R \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index 333bf13c..790b36a5 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -50,7 +50,8 @@ Imports: tidyselect (>= 1.2.0), tsibble, utils, - vctrs + vctrs, + waldo Suggests: covidcast, devtools, diff --git a/NAMESPACE b/NAMESPACE index 03364f16..904b2d24 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -55,10 +55,8 @@ export("%>%") export(archive_cases_dv_subset) export(arrange) export(arrange_canonical) -export(as_diagonal_slide_computation) export(as_epi_archive) export(as_epi_df) -export(as_time_slide_computation) export(as_tsibble) export(autoplot) export(clone) diff --git a/R/autoplot.R b/R/autoplot.R index 23f480fe..eef5aa12 100644 --- a/R/autoplot.R +++ b/R/autoplot.R @@ -55,7 +55,7 @@ autoplot.epi_df <- function( key_cols <- key_colnames(object) non_key_cols <- setdiff(names(object), key_cols) - geo_and_other_keys <- kill_time_value(key_cols) + geo_and_other_keys <- key_colnames(object, exclude = "time_value") # --- check for numeric variables allowed <- purrr::map_lgl(object[non_key_cols], is.numeric) diff --git a/R/epi_df.R b/R/epi_df.R index 420ce2dc..c8d052d9 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -184,18 +184,14 @@ new_epi_df <- function(x = tibble::tibble(geo_value = character(), time_value = metadata$other_keys <- other_keys # Reorder columns (geo_value, time_value, ...) - if (sum(dim(x)) != 0) { - cols_to_put_first <- c("geo_value", "time_value", other_keys) - x <- x[, c( - cols_to_put_first, - # All other columns - names(x)[!(names(x) %in% cols_to_put_first)] - )] + if (nrow(x) > 0) { + x <- x %>% relocate(all_of(c("geo_value", other_keys, "time_value")), .before = 1) } # Apply epi_df class, attach metadata, and return class(x) <- c("epi_df", class(x)) attributes(x)$metadata <- metadata + return(x) } @@ -281,6 +277,7 @@ as_epi_df.tbl_df <- function( if (".time_value_counts" %in% other_keys) { cli_abort("as_epi_df: `other_keys` can't include \".time_value_counts\"") } + duplicated_time_values <- x %>% group_by(across(all_of(c("geo_value", "time_value", other_keys)))) %>% filter(dplyr::n() > 1) %>% diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 11d84e6a..b592cd91 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -397,8 +397,8 @@ epix_slide.grouped_epi_archive <- function( )), capture.output(print(waldo::compare( res[[comp_nms[[comp_i]]]], comp_value[[comp_i]], - x_arg = rlang::expr_deparse(expr(`$`(label, !!sym(comp_nms[[comp_i]])))), - y_arg = rlang::expr_deparse(expr(`$`(comp_value, !!sym(comp_nms[[comp_i]])))) + x_arg = rlang::expr_deparse(dplyr::expr(`$`(label, !!sym(comp_nms[[comp_i]])))), # nolint: object_usage_linter + y_arg = rlang::expr_deparse(dplyr::expr(`$`(comp_value, !!sym(comp_nms[[comp_i]])))) ))), cli::format_message(c( "You likely want to rename or remove this column in your output, or debug why it has a different value." diff --git a/R/key_colnames.R b/R/key_colnames.R index b0119764..49c32674 100644 --- a/R/key_colnames.R +++ b/R/key_colnames.R @@ -2,39 +2,46 @@ #' #' @param x a data.frame, tibble, or epi_df #' @param ... 
additional arguments passed on to methods -#' -#' @return If an `epi_df`, this returns all "keys". Otherwise `NULL` +#' @param other_keys an optional character vector of other keys to include +#' @param exclude an optional character vector of keys to exclude +#' @return If an `epi_df`, this returns all "keys". Otherwise `NULL`. #' @keywords internal #' @export key_colnames <- function(x, ...) { UseMethod("key_colnames") } +#' @rdname key_colnames +#' @method key_colnames default #' @export key_colnames.default <- function(x, ...) { character(0L) } +#' @rdname key_colnames +#' @method key_colnames data.frame #' @export -key_colnames.data.frame <- function(x, other_keys = character(0L), ...) { +key_colnames.data.frame <- function(x, other_keys = character(0L), exclude = character(0L), ...) { assert_character(other_keys) - nm <- c("geo_value", "time_value", other_keys) + assert_character(exclude) + nm <- setdiff(c("geo_value", other_keys, "time_value"), exclude) intersect(nm, colnames(x)) } +#' @rdname key_colnames +#' @method key_colnames epi_df #' @export -key_colnames.epi_df <- function(x, ...) { +key_colnames.epi_df <- function(x, exclude = character(0L), ...) { + assert_character(exclude) other_keys <- attr(x, "metadata")$other_keys - c("geo_value", "time_value", other_keys) + setdiff(c("geo_value", other_keys, "time_value"), exclude) } +#' @rdname key_colnames +#' @method key_colnames epi_archive #' @export -key_colnames.epi_archive <- function(x, ...) { +key_colnames.epi_archive <- function(x, exclude = character(0L), ...) { + assert_character(exclude) other_keys <- attr(x, "metadata")$other_keys - c("geo_value", "time_value", other_keys) -} - -kill_time_value <- function(v) { - assert_character(v) - v[v != "time_value"] + setdiff(c("geo_value", other_keys, "time_value"), exclude) } diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index be34211b..0304d9a6 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -731,7 +731,7 @@ epix_detailed_restricted_mutate <- function(.data, ...) { #' library(dplyr) #' #' # Reference time points for which we want to compute slide values: -#' versions <- seq(as.Date("2020-06-01"), +#' versions <- seq(as.Date("2020-06-02"), #' as.Date("2020-06-15"), #' by = "1 day" #' ) @@ -780,7 +780,7 @@ epix_detailed_restricted_mutate <- function(.data, ...) { #' .versions = versions #' ) %>% #' ungroup() %>% -#' arrange(geo_value, time_value) +#' arrange(geo_value, version) #' #' # --- Advanced: --- #' diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index c859f249..901b9b32 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -41,10 +41,13 @@ as_tibble.epi_df <- function(x, ...) { #' @export as_tsibble.epi_df <- function(x, key, ...) { if (missing(key)) key <- c("geo_value", attributes(x)$metadata$other_keys) - return(as_tsibble(tibble::as_tibble(x), - key = tidyselect::all_of(key), index = "time_value", - ... - )) + return( + as_tsibble( + tibble::as_tibble(x), + key = tidyselect::all_of(key), index = "time_value", + ... 
+ ) + ) } #' Base S3 methods for an `epi_df` object @@ -150,10 +153,10 @@ dplyr_reconstruct.epi_df <- function(data, template) { # keep any grouping that has been applied: res <- NextMethod() - cn <- names(res) + col_names <- names(res) # Duplicate columns, cli_abort - dup_col_names <- cn[duplicated(cn)] + dup_col_names <- col_names[duplicated(col_names)] if (length(dup_col_names) != 0) { cli_abort(c( "Duplicate column names are not allowed", @@ -163,7 +166,7 @@ dplyr_reconstruct.epi_df <- function(data, template) { )) } - not_epi_df <- !("time_value" %in% cn) || !("geo_value" %in% cn) + not_epi_df <- !("time_value" %in% col_names) || !("geo_value" %in% col_names) if (not_epi_df) { # If we're calling on an `epi_df` from one of our own functions, we need to @@ -182,7 +185,7 @@ dplyr_reconstruct.epi_df <- function(data, template) { # Amend additional metadata if some other_keys cols are dropped in the subset old_other_keys <- attr(template, "metadata")$other_keys - attr(res, "metadata")$other_keys <- old_other_keys[old_other_keys %in% cn] + attr(res, "metadata")$other_keys <- old_other_keys[old_other_keys %in% col_names] res } @@ -424,9 +427,13 @@ arrange_col_canonical.epi_df <- function(x, ...) { x %>% dplyr::relocate(dplyr::all_of(cols), .before = 1) } +#' Group an `epi_df` object by default keys +#' @param x an `epi_df` +#' @param exclude character vector of column names to exclude from grouping +#' @return a grouped `epi_df` #' @export -group_epi_df <- function(x) { - cols <- kill_time_value(key_colnames(x)) +group_epi_df <- function(x, exclude = character()) { + cols <- key_colnames(x, exclude = exclude) x %>% group_by(across(all_of(cols))) } @@ -437,7 +444,7 @@ group_epi_df <- function(x) { #' the resulting `epi_df` will have `geo_value` set to `"total"`. #' #' @param .x an `epi_df` -#' @param value_col character vector of the columns to aggregate +#' @param sum_cols character vector of the columns to aggregate #' @param group_cols character vector of column names to group by. "time_value" is #' included by default. 
#' @return an `epi_df` object diff --git a/R/outliers.R b/R/outliers.R index 8be492dd..c2187de0 100644 --- a/R/outliers.R +++ b/R/outliers.R @@ -161,8 +161,7 @@ detect_outlr <- function(x = seq_along(y), y, #' group_by(geo_value) %>% #' mutate(outlier_info = detect_outlr_rm( #' x = time_value, y = cases -#' )) %>% -#' unnest(outlier_info) +#' )) detect_outlr_rm <- function(x = seq_along(y), y, n = 21, log_transform = FALSE, detect_negatives = FALSE, @@ -189,7 +188,7 @@ detect_outlr_rm <- function(x = seq_along(y), y, n = 21, # Calculate lower and upper thresholds and replacement value z <- z %>% - epi_slide(fitted = median(y), .window_size = n, .align = "center") %>% + epi_slide(fitted = median(y, na.rm = TRUE), .window_size = n, .align = "center") %>% dplyr::mutate(resid = y - fitted) %>% roll_iqr( n = n, @@ -256,9 +255,8 @@ detect_outlr_rm <- function(x = seq_along(y), y, n = 21, #' group_by(geo_value) %>% #' mutate(outlier_info = detect_outlr_stl( #' x = time_value, y = cases, -#' seasonal_period = 7 -#' )) %>% # weekly seasonality for daily data -#' unnest(outlier_info) +#' seasonal_period = 7 # weekly seasonality for daily data +#' )) detect_outlr_stl <- function(x = seq_along(y), y, n_trend = 21, n_seasonal = 21, @@ -359,7 +357,7 @@ roll_iqr <- function(z, n, detection_multiplier, min_radius, z %>% epi_slide( - roll_iqr = stats::IQR(resid), + roll_iqr = stats::IQR(resid, na.rm = TRUE), .window_size = n, .align = "center" ) %>% dplyr::mutate( diff --git a/R/revision_analysis.R b/R/revision_analysis.R index be83d68c..7be0cd24 100644 --- a/R/revision_analysis.R +++ b/R/revision_analysis.R @@ -81,8 +81,8 @@ revision_summary <- function(epi_arch, should_compactify = TRUE) { arg <- names(eval_select(rlang::expr(c(...)), allow_rename = FALSE, data = epi_arch$DT)) if (length(arg) == 0) { - first_non_key <- !(names(epi_arch$DT) %in% c(key_colnames(epi_arch), "version")) - arg <- names(epi_arch$DT)[first_non_key][1] + # Choose the first column that's not a key or version + arg <- setdiff(names(epi_arch$DT), c(key_colnames(epi_arch), "version"))[[1]] } else if (length(arg) > 1) { cli_abort("Not currently implementing more than one column at a time. Run each separately") } @@ -99,11 +99,9 @@ revision_summary <- function(epi_arch, # # revision_tibble keys <- key_colnames(epi_arch) - names(epi_arch$DT) - revision_behavior <- - epi_arch$DT %>% - select(c(geo_value, time_value, all_of(keys), version, !!arg)) + revision_behavior <- epi_arch$DT %>% + select(all_of(unique(c("geo_value", "time_value", keys, "version", arg)))) if (!is.null(min_waiting_period)) { revision_behavior <- revision_behavior %>% filter(abs(time_value - as.Date(epi_arch$versions_end)) >= min_waiting_period) diff --git a/R/slide.R b/R/slide.R index 192597da..5a7fbd6a 100644 --- a/R/slide.R +++ b/R/slide.R @@ -122,8 +122,7 @@ epi_slide <- function( assert_class(.x, "epi_df") if (checkmate::test_class(.x, "grouped_df")) { expected_group_keys <- .x %>% - key_colnames() %>% - kill_time_value() %>% + key_colnames(exclude = "time_value") %>% sort() if (!identical(.x %>% group_vars() %>% sort(), expected_group_keys)) { cli_abort( @@ -134,12 +133,11 @@ epi_slide <- function( ) } } else { - .x <- group_epi_df(.x) + .x <- group_epi_df(.x, exclude = "time_value") } if (nrow(.x) == 0L) { return(.x) } - # If `.f` is missing, interpret ... 
as an expression for tidy evaluation if (missing(.f)) { used_data_masking <- TRUE @@ -191,6 +189,20 @@ epi_slide <- function( assert_logical(.all_rows, len = 1) + # Check for duplicated time values within groups + duplicated_time_values <- .x %>% + group_epi_df() %>% + filter(dplyr::n() > 1) %>% + ungroup() + if (nrow(duplicated_time_values) > 0) { + bad_data <- capture.output(duplicated_time_values) + cli_abort( + "as_epi_df: some groups in a resulting dplyr computation have duplicated time values. + epi_df requires a unique time_value per group.", + body = c("Sample groups:", bad_data) + ) + } + # Begin handling completion. This will create a complete time index between # the smallest and largest time values in the data. This is used to ensure # that the slide function is called with a complete window of data. Each slide @@ -241,7 +253,7 @@ epi_slide <- function( .keep = TRUE ) %>% bind_rows() %>% - filter(.data$.real) %>% + filter(.real) %>% select(-.real) %>% arrange_col_canonical() %>% group_by(!!!.x_groups) @@ -275,11 +287,16 @@ epi_slide_one_group <- function( missing_times <- all_dates[!(all_dates %in% .data_group$time_value)] .data_group <- bind_rows( .data_group, - tibble(time_value = c( - missing_times, - .date_seq_list$pad_early_dates, - .date_seq_list$pad_late_dates - ), .real = FALSE) + dplyr::bind_cols( + .group_key, + tibble( + time_value = c( + missing_times, + .date_seq_list$pad_early_dates, + .date_seq_list$pad_late_dates + ), .real = FALSE + ) + ) ) %>% arrange(.data$time_value) @@ -405,8 +422,8 @@ epi_slide_one_group <- function( )), capture.output(print(waldo::compare( res[[comp_nms[[comp_i]]]], slide_values[[comp_i]], - x_arg = rlang::expr_deparse(expr(`$`(existing, !!sym(comp_nms[[comp_i]])))), - y_arg = rlang::expr_deparse(expr(`$`(comp_value, !!sym(comp_nms[[comp_i]])))) + x_arg = rlang::expr_deparse(dplyr::expr(`$`(existing, !!sym(comp_nms[[comp_i]])))), # nolint: object_usage_linter + y_arg = rlang::expr_deparse(dplyr::expr(`$`(comp_value, !!sym(comp_nms[[comp_i]])))) # nolint: object_usage_linter ))), cli::format_message(c( ">" = "You likely want to rename or remove this column from your slide @@ -711,7 +728,7 @@ epi_slide_opt <- function( # positions of user-provided `col_names` into string column names. We avoid # using `names(pos)` directly for robustness and in case we later want to # allow users to rename fields via tidyselection. 
- if (class(quo_get_expr(enquo(.col_names))) == "character") { + if (inherits(quo_get_expr(enquo(.col_names)), "character")) { pos <- eval_select(dplyr::all_of(.col_names), data = .x, allow_rename = FALSE) } else { pos <- eval_select(enquo(.col_names), data = .x, allow_rename = FALSE) diff --git a/R/utils.R b/R/utils.R index 1873eb1c..bb30264a 100644 --- a/R/utils.R +++ b/R/utils.R @@ -355,32 +355,9 @@ assert_sufficient_f_args <- function(.f, ..., .ref_time_value_label) { #' #' @template ref-time-value-label #' -#' @examples -#' f1 <- as_slide_computation(~ .z - .x$time_value, -#' .ref_time_value_long_varnames = character(0L), -#' .ref_time_value_label = "third argument" -#' ) -#' f1(tibble::tibble(time_value = 10), tibble::tibble(), 12) -#' -#' f2 <- as_time_slide_computation(~ .ref_time_value - .x$time_value) -#' f2(tibble::tibble(time_value = 10), tibble::tibble(), 12) -#' -#' f3 <- as_diagonal_slide_computation(~ .version - .x$time_value) -#' f3(tibble::tibble(time_value = 10), tibble::tibble(), 12) -#' -#' f4 <- as_diagonal_slide_computation(~ .ref_time_value - .x$time_value) -#' f4(tibble::tibble(time_value = 10), tibble::tibble(), 12) -#' -#' g <- as_time_slide_computation(~ -1 * .) -#' g(4) -#' -#' h <- as_time_slide_computation(~ .x - .group_key) -#' h(6, 3) -#' #' @importFrom rlang is_function new_function f_env is_environment missing_arg #' f_rhs is_formula caller_arg caller_env -#' -#' @noRd +#' @keywords internal as_slide_computation <- function(.f, ..., .ref_time_value_long_varnames, .ref_time_value_label) { arg <- caller_arg(.f) call <- caller_env() @@ -542,8 +519,7 @@ as_slide_computation <- function(.f, ..., .ref_time_value_long_varnames, .ref_ti } #' @rdname as_slide_computation -#' @export -#' @noRd +#' @keywords internal as_time_slide_computation <- function(.f, ...) { as_slide_computation( .f, ..., @@ -553,8 +529,7 @@ as_time_slide_computation <- function(.f, ...) { } #' @rdname as_slide_computation -#' @export -#' @noRd +#' @keywords internal as_diagonal_slide_computation <- function(.f, ...) { as_slide_computation( .f, ..., diff --git a/_pkgdown.yml b/_pkgdown.yml index 62f006fe..1bc7f795 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -48,7 +48,6 @@ articles: - aggregation - outliers - archive - - advanced - compactify repo: diff --git a/man-roxygen/basic-slide-details.R b/man-roxygen/basic-slide-details.R index 64570976..df87f882 100644 --- a/man-roxygen/basic-slide-details.R +++ b/man-roxygen/basic-slide-details.R @@ -9,7 +9,7 @@ #' boundary of the dataset) and will attempt to perform the computation #' anyway. The issue of what to do with partial computations (those run on #' incomplete windows) is therefore left up to the user, either through the -#' specified function or formula `f`, or through post-processing. +#' specified function or formula, or through post-processing. #' #' Let's look at some window examples, assuming that the reference time value #' is "tv". With .align = "right" and .window_size = 3, the window will be: @@ -60,8 +60,8 @@ #' "pronoun"-like bindings available: #' * .x, which is like `.x` in [`dplyr::group_modify`]; an ordinary object #' like an `epi_df` rather than an rlang [pronoun][rlang::as_data_pronoun] -#' like [`.data`]; this allows you to use additional {dplyr}, {tidyr}, and -#' {epiprocess} operations. If you have multiple expressions in `...`, this +#' like [`.data`]; this allows you to use additional `dplyr`, `tidyr`, and +#' `epiprocess` operations. 
If you have multiple expressions in `...`, this #' won't let you refer to the output of the earlier expressions, but `.data` #' will. #' * .group_key, which is like `.y` in [`dplyr::group_modify`]. diff --git a/man/as_slide_computation.Rd b/man/as_slide_computation.Rd new file mode 100644 index 00000000..3db5a940 --- /dev/null +++ b/man/as_slide_computation.Rd @@ -0,0 +1,113 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{as_slide_computation} +\alias{as_slide_computation} +\alias{as_time_slide_computation} +\alias{as_diagonal_slide_computation} +\title{Generate a \verb{epi[x]_slide} computation function from a function, formula, or quosure} +\source{ +This code and documentation are based on +\href{https://github.com/r-lib/rlang/blob/c55f6027928d3104ed449e591e8a225fcaf55e13/R/fn.R#L343-L427}{\code{as_function}} +from Hadley Wickham's \code{rlang} package. + +Below is the original license for the \code{rlang} package. + +MIT License + +Copyright (c) 2020 rlang authors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +Portions of the original code used in this adaptation: +\enumerate{ +\item Much of the documentation and examples +\item The general flow of the function, including branching conditions +\item Error conditions and wording +\item The chunk converting a formula into a function, see +https://github.com/r-lib/rlang/blob/c55f6027928d3104ed449e591e8a225fcaf55e13/R/fn.R#L411-L418 +} + +Changes made include: +\enumerate{ +\item Updates to documentation due to new functionality +\item The removal of function-as-string processing logic and helper arg +\code{env} +\item The addition of an output function wrapper that defines a data mask +for evaluating quosures +\item Calling an argument-checking function +\item Replacing rlang error functions with internal error functions +} +} +\usage{ +as_slide_computation( + .f, + ..., + .ref_time_value_long_varnames, + .ref_time_value_label +) + +as_time_slide_computation(.f, ...) + +as_diagonal_slide_computation(.f, ...) +} +\arguments{ +\item{...}{Additional arguments to pass to the function or formula +specified via \code{x}. If \code{x} is a quosure, any arguments passed via \code{...} +will be ignored.} + +\item{.ref_time_value_long_varnames}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} +Character vector. 
What variable names should we allow formulas and +data-masking tidy evaluation to use to refer to \code{ref_time_value} for the +computation (in addition to \code{.z} in formulas)? E.g., \code{".ref_time_value"} or +\code{c(".ref_time_value", ".version")}.} + +\item{.ref_time_value_label}{String; how to describe/label the \code{ref_time_value} in +error messages; e.g., "reference time value" or "version".} + +\item{f}{A function, one-sided formula, or quosure. + +If a \strong{function}, the function is returned as-is, with no +modifications. + +If a \strong{formula}, e.g. \code{~ mean(.x$cases)}, it is converted to a function +with up to three arguments: \code{.x} (single argument), or \code{.x} and \code{.y} +(two arguments), or \code{.x}, \code{.y}, and \code{.z} (three arguments). The \code{.} +placeholder can be used instead of \code{.x}, \code{.group_key} can be used in +place of \code{.y}, and \code{.ref_time_value} can be used in place of \code{.z}. This +allows you to create very compact anonymous functions (lambdas) with up +to three inputs. Functions created from formulas have a special class. +Use \code{inherits(fn, "epiprocess_slide_computation")} to test for it. + +If a \strong{quosure}, in the case that \code{f} was not provided to the parent +\verb{epi[x]_slide} call and the \code{...} is interpreted as an expression for +tidy evaluation, it is evaluated within a wrapper function. The wrapper +sets up object access via a data mask.} +} +\description{ +\code{as_slide_computation()} transforms a one-sided formula or a +quosure into a function; functions are returned as-is or with light +modifications to calculate \code{ref_time_value}. + +This code extends \code{rlang::as_function} to create functions that take three +arguments. The arguments can be accessed via the idiomatic \code{.}, \code{.x}, and +\code{.y}, extended to include \code{.z}; positional references \code{..1} and \code{..2}, +extended to include \code{..3}; and also by \verb{epi[x]_slide}-specific names +\code{.group_key} and \code{.ref_time_value}. +} +\keyword{internal} diff --git a/man/detect_outlr_rm.Rd b/man/detect_outlr_rm.Rd index 333c4a7b..b57c4445 100644 --- a/man/detect_outlr_rm.Rd +++ b/man/detect_outlr_rm.Rd @@ -65,6 +65,5 @@ incidence_num_outlier_example \%>\% group_by(geo_value) \%>\% mutate(outlier_info = detect_outlr_rm( x = time_value, y = cases - )) \%>\% - unnest(outlier_info) + )) } diff --git a/man/detect_outlr_stl.Rd b/man/detect_outlr_stl.Rd index 695c2de7..fb69e8da 100644 --- a/man/detect_outlr_stl.Rd +++ b/man/detect_outlr_stl.Rd @@ -96,7 +96,6 @@ incidence_num_outlier_example \%>\% group_by(geo_value) \%>\% mutate(outlier_info = detect_outlr_stl( x = time_value, y = cases, - seasonal_period = 7 - )) \%>\% # weekly seasonality for daily data - unnest(outlier_info) + seasonal_period = 7 # weekly seasonality for daily data + )) } diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index 323fdf4d..74929eb1 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -109,7 +109,7 @@ keep NAs around. boundary of the dataset) and will attempt to perform the computation anyway. The issue of what to do with partial computations (those run on incomplete windows) is therefore left up to the user, either through the -specified function or formula \code{f}, or through post-processing. +specified function or formula, or through post-processing. Let's look at some window examples, assuming that the reference time value is "tv". 
With .align = "right" and .window_size = 3, the window will be: @@ -165,8 +165,8 @@ In addition to \code{\link{.data}} and \code{\link{.env}}, we make some addition \itemize{ \item .x, which is like \code{.x} in \code{\link[dplyr:group_map]{dplyr::group_modify}}; an ordinary object like an \code{epi_df} rather than an rlang \link[rlang:as_data_mask]{pronoun} -like \code{\link{.data}}; this allows you to use additional {dplyr}, {tidyr}, and -{epiprocess} operations. If you have multiple expressions in \code{...}, this +like \code{\link{.data}}; this allows you to use additional \code{dplyr}, \code{tidyr}, and +\code{epiprocess} operations. If you have multiple expressions in \code{...}, this won't let you refer to the output of the earlier expressions, but \code{.data} will. \item .group_key, which is like \code{.y} in \code{\link[dplyr:group_map]{dplyr::group_modify}}. diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd index 1f301846..1326cc18 100644 --- a/man/epix_slide.Rd +++ b/man/epix_slide.Rd @@ -170,7 +170,7 @@ necessary (as it its purpose). library(dplyr) # Reference time points for which we want to compute slide values: -versions <- seq(as.Date("2020-06-01"), +versions <- seq(as.Date("2020-06-02"), as.Date("2020-06-15"), by = "1 day" ) @@ -219,7 +219,7 @@ archive_cases_dv_subset \%>\% .versions = versions ) \%>\% ungroup() \%>\% - arrange(geo_value, time_value) + arrange(geo_value, version) # --- Advanced: --- diff --git a/man/group_epi_df.Rd b/man/group_epi_df.Rd new file mode 100644 index 00000000..5895a52f --- /dev/null +++ b/man/group_epi_df.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods-epi_df.R +\name{group_epi_df} +\alias{group_epi_df} +\title{Group an \code{epi_df} object by default keys} +\usage{ +group_epi_df(x, exclude = character()) +} +\arguments{ +\item{x}{an \code{epi_df}} + +\item{exclude}{character vector of column names to exclude from grouping} +} +\value{ +a grouped \code{epi_df} +} +\description{ +Group an \code{epi_df} object by default keys +} diff --git a/man/key_colnames.Rd b/man/key_colnames.Rd index fbaa3c11..f5e13837 100644 --- a/man/key_colnames.Rd +++ b/man/key_colnames.Rd @@ -2,17 +2,33 @@ % Please edit documentation in R/key_colnames.R \name{key_colnames} \alias{key_colnames} +\alias{key_colnames.default} +\alias{key_colnames.data.frame} +\alias{key_colnames.epi_df} +\alias{key_colnames.epi_archive} \title{Grab any keys associated to an epi_df} \usage{ key_colnames(x, ...) + +\method{key_colnames}{default}(x, ...) + +\method{key_colnames}{data.frame}(x, other_keys = character(0L), exclude = character(0L), ...) + +\method{key_colnames}{epi_df}(x, exclude = character(0L), ...) + +\method{key_colnames}{epi_archive}(x, exclude = character(0L), ...) } \arguments{ \item{x}{a data.frame, tibble, or epi_df} \item{...}{additional arguments passed on to methods} + +\item{other_keys}{an optional character vector of other keys to include} + +\item{exclude}{an optional character vector of keys to exclude} } \value{ -If an \code{epi_df}, this returns all "keys". Otherwise \code{NULL} +If an \code{epi_df}, this returns all "keys". Otherwise \code{NULL}. 
} \description{ Grab any keys associated to an epi_df diff --git a/man/sum_groups_epi_df.Rd b/man/sum_groups_epi_df.Rd index 8b4c13ba..f1ba8474 100644 --- a/man/sum_groups_epi_df.Rd +++ b/man/sum_groups_epi_df.Rd @@ -9,10 +9,10 @@ sum_groups_epi_df(.x, sum_cols = "value", group_cols = character()) \arguments{ \item{.x}{an \code{epi_df}} +\item{sum_cols}{character vector of the columns to aggregate} + \item{group_cols}{character vector of column names to group by. "time_value" is included by default.} - -\item{value_col}{character vector of the columns to aggregate} } \value{ an \code{epi_df} object diff --git a/tests/testthat/test-arrange-canonical.R b/tests/testthat/test-arrange-canonical.R index 939d2f32..24d3f5f9 100644 --- a/tests/testthat/test-arrange-canonical.R +++ b/tests/testthat/test-arrange-canonical.R @@ -8,14 +8,13 @@ test_that("canonical arrangement works", { expect_error(arrange_canonical(tib)) tib <- tib %>% as_epi_df(other_keys = "demo_grp") - expect_equal(names(tib), c("geo_value", "time_value", "demo_grp", "x")) + expect_equal(names(tib), c("geo_value", "demo_grp", "time_value", "x")) - tib_cols_shuffled <- tib %>% select(geo_value, time_value, x, demo_grp) - - tib_sorted <- arrange_canonical(tib_cols_shuffled) - expect_equal(names(tib_sorted), c("geo_value", "time_value", "demo_grp", "x")) + tib_sorted <- tib %>% + arrange_canonical() + expect_equal(names(tib_sorted), c("geo_value", "demo_grp", "time_value", "x")) expect_equal(tib_sorted$geo_value, rep(c("ca", "ga"), each = 4)) - expect_equal(tib_sorted$time_value, c(1, 1, 2, 2, 1, 1, 2, 2)) - expect_equal(tib_sorted$demo_grp, rep(letters[1:2], times = 4)) - expect_equal(tib_sorted$x, c(8, 6, 7, 5, 4, 2, 3, 1)) + expect_equal(tib_sorted$time_value, c(1, 2, 1, 2, 1, 2, 1, 2)) + expect_equal(tib_sorted$demo_grp, c("a", "a", "b", "b", "a", "a", "b", "b")) + expect_equal(tib_sorted$x, c(8, 7, 6, 5, 4, 3, 2, 1)) }) diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index e8416693..d644e9a7 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -53,7 +53,7 @@ get_test_dataset <- function(n, time_type = "day", other_keys = FALSE) { } df %>% arrange_canonical() %>% - group_epi_df() + group_epi_df(exclude = "time_value") } test_data <- get_test_dataset(num_rows_per_group, "day") @@ -82,10 +82,10 @@ epi_slide_sum_test <- function( .x %>% mutate(.real = TRUE) %>% - group_epi_df() %>% + group_epi_df(exclude = "time_value") %>% complete(time_value = vctrs::vec_c(!!!date_seq_list, .name_spec = rlang::zap())) %>% arrange_canonical() %>% - group_epi_df() %>% + group_epi_df(exclude = "time_value") %>% mutate( slide_value = slider::slide_index_sum( .data$value, @@ -246,7 +246,7 @@ for (p in (param_combinations %>% transpose())) { mutate(slide_value = list(slide_value)) %>% ungroup() %>% as_epi_df(as_of = attr(test_data, "metadata")$as_of, other_keys = attr(test_data, "metadata")$other_keys) %>% - group_epi_df() + group_epi_df(exclude = "time_value") expect_equal( out %>% select(-slide_value), @@ -268,7 +268,7 @@ for (p in (param_combinations %>% transpose())) { mutate(slide_value = list(slide_value)) %>% ungroup() %>% as_epi_df(as_of = attr(test_data, "metadata")$as_of, other_keys = attr(test_data, "metadata")$other_keys) %>% - group_epi_df() + group_epi_df(exclude = "time_value") expect_equal( out %>% select(-slide_value), expected_out %>% select(-slide_value) diff --git a/tests/testthat/test-methods-epi_df.R b/tests/testthat/test-methods-epi_df.R index f1bca059..3e5c180b 100644 
--- a/tests/testthat/test-methods-epi_df.R +++ b/tests/testthat/test-methods-epi_df.R @@ -69,21 +69,20 @@ test_that("Subsetting drops & does not drop the epi_df class appropriately", { expect_equal(ncol(col_subset2), 2L) # Row and col subset that contains geo_value and time_value - should be epi_df - row_col_subset2 <- toy_epi_df[2:3, 1:3] + row_col_subset2 <- toy_epi_df[2:3, c(1, 4)] att_row_col_subset2 <- attr(row_col_subset2, "metadata") expect_true(is_epi_df(row_col_subset2)) expect_equal(nrow(row_col_subset2), 2L) - expect_equal(ncol(row_col_subset2), 3L) + expect_equal(ncol(row_col_subset2), 2L) expect_identical(att_row_col_subset2$geo_type, att_toy$geo_type) expect_identical(att_row_col_subset2$time_type, att_toy$time_type) expect_identical(att_row_col_subset2$as_of, att_toy$as_of) - expect_identical(att_row_col_subset2$other_keys, att_toy$other_keys[1]) }) test_that("When duplicate cols in subset should abort", { expect_error(toy_epi_df[, c(2, 2:3, 4, 4, 4)], - "Duplicated column names: time_value, indic_var2", + "Duplicated column names: indic_var1, time_value", fixed = TRUE ) expect_error(toy_epi_df[1:4, c(1, 2:4, 1)], @@ -94,7 +93,7 @@ test_that("When duplicate cols in subset should abort", { test_that("Correct metadata when subset includes some of other_keys", { # Only include other_var of indic_var1 - only_indic_var1 <- toy_epi_df[, c(1:3, 5:6)] + only_indic_var1 <- toy_epi_df[, c(1:2, 4:6)] att_only_indic_var1 <- attr(only_indic_var1, "metadata") expect_true(is_epi_df(only_indic_var1)) @@ -106,7 +105,7 @@ test_that("Correct metadata when subset includes some of other_keys", { expect_identical(att_only_indic_var1$other_keys, att_toy$other_keys[-2]) # Only include other_var of indic_var2 - only_indic_var2 <- toy_epi_df[, c(1:2, 4:6)] + only_indic_var2 <- toy_epi_df[, c(1, 3:6)] att_only_indic_var2 <- attr(only_indic_var2, "metadata") expect_true(is_epi_df(only_indic_var2)) diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index 9135e5a9..a159f98e 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -232,6 +232,24 @@ test_that("as_slide_computation raises errors as expected", { ) }) +test_that("as_slide_computation works", { + f1 <- as_slide_computation(~ .z - .x$time_value, + .ref_time_value_long_varnames = character(0L), + .ref_time_value_label = "third argument" + ) + expect_equal(f1(tibble::tibble(time_value = 10), tibble::tibble(), 12), 2) + f2 <- as_time_slide_computation(~ .ref_time_value - .x$time_value) + expect_equal(f2(tibble::tibble(time_value = 10), tibble::tibble(), 12), 2) + f3 <- as_diagonal_slide_computation(~ .version - .x$time_value) + expect_equal(f3(tibble::tibble(time_value = 10), tibble::tibble(), 12), 2) + f4 <- as_diagonal_slide_computation(~ .ref_time_value - .x$time_value) + expect_equal(f4(tibble::tibble(time_value = 10), tibble::tibble(), 12), 2) + g <- as_time_slide_computation(~ -1 * .) 
+ expect_equal(g(4), -4) + h <- as_time_slide_computation(~ .x - .group_key) + expect_equal(h(6, 3), 3) +}) + test_that("guess_period works", { # Error cases: expect_error(guess_period(numeric(0L)), class = "epiprocess__guess_period__not_enough_times") diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd deleted file mode 100644 index 65f9ce05..00000000 --- a/vignettes/advanced.Rmd +++ /dev/null @@ -1,488 +0,0 @@ ---- -title: Advanced sliding with nonstandard outputs -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Advanced sliding with nonstandard outputs} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -In this vignette, we discuss how to use the sliding functionality in the -`epiprocess` package with less common grouping schemes or with computations that -have advanced output structures. The output of a slide computation should either -be an atomic value/vector, or a data frame. This data frame can have multiple -columns, multiple rows, or both. - -During basic usage (e.g., when all optional arguments are set to their defaults): - -* `epi_slide(edf, , .....)`: - * keeps **all** columns of `edf`, adds computed column(s) - * outputs **one row per row in `edf`** (recycling outputs from - computations appropriately if there are multiple time series bundled - together inside any group(s)) - * maintains the grouping or ungroupedness of `edf` - * is roughly analogous to (the non-sliding) **`dplyr::mutate` followed by - `dplyr::arrange(time_value, .by_group = TRUE)`** - * outputs an **`epi_df`** if the required columns are present, otherwise a - tibble -* `epix_slide(ea, , .....)`: - * keeps **grouping and `time_value`** columns of `ea`, adds computed - column(s) - * outputs **any number of rows** (computations are allowed to output any - number of elements/rows, and no recycling is performed) - * maintains the grouping or ungroupedness of `ea`, unless it was explicitly - grouped by zero variables; this isn't supported by `grouped_df` and it will - automatically turn into an ungrouped tibble - * is roughly analogous to (the non-sliding) **`dplyr::group_modify`** - * outputs a **tibble** - -These differences in basic behavior make some common slide operations require less boilerplate: - -* predictors and targets calculated with `epi_slide` are automatically lined up - with each other and with the signals from which they were calculated; and -* computations for an `epix_slide` can output data frames with any number of - rows, containing models, forecasts, evaluations, etc., and will not be - recycled. - -When using more advanced features, more complex rules apply: - -* Generalization: `epi_slide(edf, ....., .ref_time_values=my_ref_time_values)` - will output one row for every row in `edf` with `time_value` appearing inside - `my_ref_time_values`, and is analogous to a `dplyr::mutate`&`dplyr::arrange` - followed by `dplyr::filter` to those `.ref_time_values`. We call this property - **size stability**, and describe how it is achieved in the following sections. - The default behavior described above is a special case of this general rule - based on a default value of `.ref_time_values`. -* Exception/feature: `epi_slide(edf, ....., .ref_time_values=my_ref_time_values, - .all_rows=TRUE)` will not just output rows for `my_ref_time_values`, but - instead will output one row per row in `edf`. 
-* Clarification: `ea %>% group_by(....., .drop=FALSE) %>% - epix_slide(, .....)` will call the computation on any missing - groups according to `dplyr`'s `.drop=FALSE` rules, resulting in additional - output rows. - -Below we demonstrate some advanced use cases of sliding with different output -structures. We focus on `epi_slide()` for the most part, though some of the -behavior we demonstrate also carries over to `epix_slide()`. - -## Recycling outputs - -When a computation returns a single atomic value, `epi_slide()` will internally -try to recycle the output so that it is size stable (in the sense described -above). We can use this to our advantage, for example, in order to compute a -trailing average marginally over geo values, which we demonstrate below in a -simple synthetic example. - -```{r message = FALSE} -library(epiprocess) -library(dplyr) -set.seed(123) - -edf <- tibble( - geo_value = rep(c("ca", "fl", "pa"), each = 3), - time_value = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), length.out = length(geo_value)), - x = seq_along(geo_value) + 0.01 * rnorm(length(geo_value)), -) %>% - as_epi_df(as_of = as.Date("2024-03-20")) - -# 2-day trailing average, per geo value -edf %>% - group_by(geo_value) %>% - epi_slide(x_2dav = mean(x), .window_size = 2) %>% - ungroup() - -# 2-day trailing average, marginally -edf %>% - epi_slide(x_2dav = mean(x), .window_size = 2) -``` - -```{r, include = FALSE} -# More checks (not included) -edf %>% - epi_slide(x_2dav = mean(x), .window_size = 2, .ref_time_values = as.Date("2020-06-02")) - -edf %>% - # pretend that observations about time_value t are reported in version t (nowcasts) - mutate(version = time_value) %>% - as_epi_archive() %>% - group_by(geo_value) %>% - epix_slide(x_2dav = mean(x), .before = 1, .versions = as.Date("2020-06-02")) %>% - ungroup() - -edf %>% - # pretend that observations about time_value t are reported in version t (nowcasts) - mutate(version = time_value) %>% - as_epi_archive() %>% - group_by(geo_value) %>% - epix_slide(~ mean(.x$x), .before = 1, .ref_time_values = as.Date("2020-06-02")) %>% - ungroup() -``` - -When the slide computation returns an atomic vector (rather than a single value) -`epi_slide()` checks whether its return length ensures size stability, and if -so, uses it to fill the new column. For example, this next computation gives the -same result as the last one. - -```{r} -edf %>% - epi_slide(y_2dav = rep(mean(x), 3), .window_size = 2) -``` - -However, if the output is an atomic vector (rather than a single value) and it -is *not* size stable, then `epi_slide()` throws an error. For example, below we -are trying to return 2 things for 3 states. - -```{r, error = TRUE} -edf %>% - epi_slide(x_2dav = rep(mean(x), 2), .window_size = 2) -``` - -## Multi-column outputs - -Now we move on to outputs that are data frames with a single row but multiple -columns. Working with this type of output structure has in fact has already been -demonstrated in the [slide -vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html). - -```{r} -edf2 <- edf %>% - group_by(geo_value) %>% - epi_slide( - a = list(data.frame(x_2dav = mean(x), x_2dma = mad(x))), - .window_size = 2 - ) %>% - ungroup() - -class(edf2$a) -length(edf2$a) -edf2$a[[2]] -``` - -If you do not wrap the data.frame in a list above, the resulting `epi_df` has -multiple new columns containing the slide values. 
The default is to name these -unnested columns by prefixing the name assigned to the list column (here `a`) -onto the column names of the output data frame from the slide computation (here -`x_2dav` and `x_2dma`) separated by "_". - -```{r} -edf %>% - group_by(geo_value) %>% - epi_slide( - a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), - .window_size = 2 - ) %>% - ungroup() -``` - -Furthermore, `epi_slide()` will recycle the single row data frame as needed in -order to make the result size stable, just like the case for atomic values (note -that we are not grouping here by geo_value). - -```{r} -edf %>% - epi_slide( - a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), - .window_size = 2 - ) -``` - -## Multi-row outputs - -For a slide computation that outputs a data frame with more than one row, the -behavior is analogous to a slide computation that outputs an atomic vector. -Meaning, `epi_slide()` will check that the result is size stable, and if so, -will fill the new column(s) in the resulting `epi_df` object appropriately. - -This can be convenient for modeling in the following sense: we can, for example, -fit a sliding, data-versioning-unaware nowcasting or forecasting model by -pooling data from different locations, and then return separate forecasts from -this common model for each location. We use our synthetic example to demonstrate -this idea abstractly but simply by forecasting (actually, nowcasting) `y` from -`x` by fitting a time-windowed linear model that pooling data across all -locations. - -```{r} -edf$y <- 2 * edf$x + 0.05 * rnorm(length(edf$x)) - -edf %>% - epi_slide(function(d, group_key, ref_time_value) { - obj <- lm(y ~ x, data = d) - return( - predict( - obj, - newdata = d %>% group_by(geo_value) %>% filter(time_value == max(time_value)), - interval = "prediction", - level = 0.9 - ) %>% - as.data.frame() %>% - list() - ) - }, .window_size = 2) -``` - -The above example focused on simplicity to show how to work with multi-row -outputs. Note however, the following issues in this example: - -* The `lm` fitting data includes the testing instances, as no training-test split was performed. -* Adding a simple training-test split would not factor in reporting latency properly. -* Data revisions are not taken into account. - -All three of these factors contribute to unrealistic retrospective forecasts and -overly optimistic retrospective performance evaluations. Instead, one should -favor an `epix_slide` for more realistic "pseudoprospective" forecasts. Using -`epix_slide` also makes it easier to express certain types of forecasts; while -in `epi_slide`, forecasts for additional aheads or quantile levels would need to -be expressed as additional columns, or nested inside list columns, `epix_slide` -does not perform size stability checks or recycling, allowing computations to -output any number of rows. - -## Version-aware forecasting, revisited - -We revisit the COVID-19 forecasting example from the [archive -vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) in order -to demonstrate the preceding points regarding forecast evaluation in a more -realistic setting. First, we fetch the versioned data and build the archive. 
- -```{r, message = FALSE, warning = FALSE, eval =FALSE} -library(epidatr) -library(data.table) -library(ggplot2) -theme_set(theme_bw()) - -y1 <- pub_covidcast( - source = "doctor-visits", - signals = "smoothed_adj_cli", - geo_type = "state", - time_type = "day", - geo_values = "ca,fl", - time_values = epirange(20200601, 20211201), - issues = epirange(20200601, 20211201) -) - -y2 <- pub_covidcast( - source = "jhu-csse", - signal = "confirmed_7dav_incidence_prop", - geo_type = "state", - time_type = "day", - geo_values = "ca,fl", - time_values = epirange(20200601, 20211201), - issues = epirange(20200601, 20211201) -) - -x <- y1 %>% - select(geo_value, time_value, - version = issue, - percent_cli = value - ) %>% - as_epi_archive(compactify = FALSE) - -# mutating merge operation: -x <- epix_merge( - x, - y2 %>% - select(geo_value, time_value, - version = issue, - case_rate_7d_av = value - ) %>% - as_epi_archive(compactify = FALSE), - sync = "locf", - compactify = FALSE -) -``` - -```{r, message = FALSE, echo =FALSE} -library(data.table) -library(ggplot2) -theme_set(theme_bw()) - -x <- archive_cases_dv_subset$DT %>% - filter(geo_value %in% c("ca", "fl")) %>% - as_epi_archive(compactify = FALSE) -``` - -Next, we extend the ARX function to handle multiple geo values, since in the -present case, we will not be grouping by geo value and each slide computation -will be run on multiple geo values at once. Note that, because `epix_slide()` -only returns the grouping variables, `time_value`, and the slide computations in -the eventual returned tibble, we need to include `geo_value` as a column in the -output data frame from our ARX computation. - -```{r} -library(tidyr) -library(purrr) - -prob_arx_args <- function(lags = c(0, 7, 14), - ahead = 7, - min_train_window = 20, - lower_level = 0.05, - upper_level = 0.95, - symmetrize = TRUE, - intercept = FALSE, - nonneg = TRUE) { - return(list( - lags = lags, - ahead = ahead, - min_train_window = min_train_window, - lower_level = lower_level, - upper_level = upper_level, - symmetrize = symmetrize, - intercept = intercept, - nonneg = nonneg - )) -} - -prob_arx <- function(x, y, geo_value, time_value, args = prob_arx_args()) { - # Return NA if insufficient training data - if (length(y) < args$min_train_window + max(args$lags) + args$ahead) { - return(data.frame( - geo_value = unique(geo_value), # Return geo value! - point = NA, lower = NA, upper = NA - )) - } - - # Set up x, y, lags list - if (!missing(x)) { - x <- data.frame(x, y) - } else { - x <- data.frame(y) - } - if (!is.list(args$lags)) args$lags <- list(args$lags) - args$lags <- rep(args$lags, length.out = ncol(x)) - - # Build features and response for the AR model, and then fit it - dat <- - tibble(i = seq_len(ncol(x)), lag = args$lags) %>% - unnest(lag) %>% - mutate(name = paste0("x", seq_len(nrow(.)))) %>% # nolint: object_usage_linter - # One list element for each lagged feature - pmap(function(i, lag, name) { - tibble( - geo_value = geo_value, - time_value = time_value + lag, # Shift back - !!name := x[, i] - ) - }) %>% - # One list element for the response vector - c(list( - tibble( - geo_value = geo_value, - time_value = time_value - args$ahead, # Shift forward - y = y - ) - )) %>% - # Combine them together into one data frame - reduce(full_join, by = c("geo_value", "time_value")) %>% - arrange(time_value) - if (args$intercept) dat$x0 <- rep(1, nrow(dat)) - obj <- lm(y ~ . 
+ 0, data = select(dat, -geo_value, -time_value)) - - # Use LOCF to fill NAs in the latest feature values (do this by geo value) - setDT(dat) # Convert to a data.table object by reference - cols <- setdiff(names(dat), c("geo_value", "time_value")) - dat[, (cols) := nafill(.SD, type = "locf"), .SDcols = cols, by = "geo_value"] - - # Make predictions - test_time_value <- max(time_value) - point <- predict( - obj, - newdata = dat %>% - dplyr::group_by(geo_value) %>% - dplyr::filter(time_value == test_time_value) - ) - - # Compute bands - r <- residuals(obj) - s <- ifelse(args$symmetrize, -1, NA) # Should the residuals be symmetrized? - q <- quantile(c(r, s * r), probs = c(args$lower, args$upper), na.rm = TRUE) - lower <- point + q[1] - upper <- point + q[2] - - # Clip at zero if we need to, then return - if (args$nonneg) { - point <- pmax(point, 0) - lower <- pmax(lower, 0) - upper <- pmax(upper, 0) - } - return(data.frame( - geo_value = unique(geo_value), # Return geo value! - point = point, lower = lower, upper = upper - )) -} -``` - -We now make forecasts on the archive and compare to forecasts on the latest -data. - -```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6} -# Latest snapshot of data, and forecast dates -x_latest <- epix_as_of(x, max_version = max(x$DT$version)) -fc_time_values <- seq(as.Date("2020-08-01"), - as.Date("2021-11-30"), - by = "1 month" -) - -# Simple function to produce forecasts k weeks ahead -k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { - if (as_of) { - x %>% - epix_slide( - fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, .data$geo_value, .data$time_value, - args = prob_arx_args(ahead = ahead) - ), - .before = 219, .versions = fc_time_values - ) %>% - mutate( - target_date = .data$time_value + ahead, as_of = TRUE, - geo_value = .data$fc$geo_value - ) - } else { - x_latest %>% - epi_slide( - fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, .data$geo_value, .data$time_value, - args = prob_arx_args(ahead = ahead) - ), - .window_size = 220, .ref_time_values = fc_time_values - ) %>% - mutate(target_date = .data$time_value + ahead, as_of = FALSE) - } -} - -# Generate the forecasts, and bind them together -fc <- bind_rows( - k_week_ahead(x, ahead = 7, as_of = TRUE), - k_week_ahead(x, ahead = 14, as_of = TRUE), - k_week_ahead(x, ahead = 21, as_of = TRUE), - k_week_ahead(x, ahead = 28, as_of = TRUE), - k_week_ahead(x, ahead = 7, as_of = FALSE), - k_week_ahead(x, ahead = 14, as_of = FALSE), - k_week_ahead(x, ahead = 21, as_of = FALSE), - k_week_ahead(x, ahead = 28, as_of = FALSE) -) - -# Plot them, on top of latest COVID-19 case rates -ggplot(fc, aes(x = target_date, group = time_value, fill = as_of)) + - geom_ribbon(aes(ymin = fc$lower, ymax = fc$upper), alpha = 0.4) + - geom_line( - data = x_latest, aes(x = time_value, y = case_rate_7d_av), - inherit.aes = FALSE, color = "gray50" - ) + - geom_line(aes(y = fc$point)) + - geom_point(aes(y = fc$point), size = 0.5) + - geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) + - facet_grid(vars(geo_value), vars(as_of), scales = "free") + - scale_x_date(minor_breaks = "month", date_labels = "%b %y") + - labs(x = "Date", y = "Reported COVID-19 case rates") + - theme(legend.position = "none") -``` - -We can see that these forecasts, which come from training an ARX model jointly -over CA and FL, exhibit generally less variability and wider prediction bands -compared to the ones from the archive vignette, which come from training a -separate ARX model on each state. 
As in the archive vignette, we can see a -difference between version-aware (right column) and -unaware (left column) -forecasting, as well. - -## Attribution -The `case_rate_7d_av` data used in this document is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020. - -The `percent_cli` data is a modified part of the [COVIDcast Epidata API Doctor Visits data](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html). This dataset is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/). Copyright Delphi Research Group at Carnegie Mellon University 2020. diff --git a/vignettes/aggregation.Rmd b/vignettes/aggregation.Rmd index 585b5b0a..9d205f53 100644 --- a/vignettes/aggregation.Rmd +++ b/vignettes/aggregation.Rmd @@ -52,13 +52,12 @@ x <- jhu_csse_county_level_subset ## Converting to `tsibble` format For manipulating and wrangling time series data, the -[`tsibble`](https://tsibble.tidyverts.org/index.html) already provides a whole -bunch of useful tools. A tsibble object (formerly, of class `tbl_ts`) is -basically a tibble (data frame) but with two specially-marked columns: an -**index** column representing the time variable (defining an order from past to -present), and a **key** column identifying a unique observational unit for each -time point. In fact, the key can be made up of any number of columns, not just a -single one. +[`tsibble`](https://tsibble.tidyverts.org/index.html) already provides a host of +useful tools. A tsibble object (formerly, of class `tbl_ts`) is basically a +tibble (data frame) but with two specially-marked columns: an **index** column +representing the time variable (defining an order from past to present), and a +**key** column identifying a unique observational unit for each time point. In +fact, the key can be made up of any number of columns, not just a single one. In an `epi_df` object, the index variable is `time_value`, and the key variable is typically `geo_value` (though this need not always be the case: for example, @@ -113,11 +112,13 @@ Let's first remove certain dates from our data set to create gaps: ```{r} # First make geo value more readable for tables, plots, etc. 
x <- x %>%
-  mutate(geo_value = paste(
-    substr(county_name, 1, nchar(county_name) - 7),
-    name_to_abbr(state_name),
-    sep = ", "
-  )) %>%
+  mutate(
+    geo_value = paste(
+      substr(county_name, 1, nchar(county_name) - 7),
+      name_to_abbr(state_name),
+      sep = ", "
+    )
+  ) %>%
  select(geo_value, time_value, cases)

xt <- as_tsibble(x) %>% filter(cases >= 3)
diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd
index 07413126..62eea2aa 100644
--- a/vignettes/archive.Rmd
+++ b/vignettes/archive.Rmd
@@ -51,6 +51,10 @@ library(data.table)
library(dplyr)
library(purrr)
library(ggplot2)
+dv <- archive_cases_dv_subset$DT %>%
+  select(-case_rate_7d_av) %>%
+  rename(issue = version, value = percent_cli) %>%
+  as_tibble()
```

## Getting data into `epi_archive` format

@@ -72,7 +76,7 @@ format, with `issue` playing the role of `version`. We can now use
redundant version updates in `as_epi_archive` using compactify; please refer
to the [compactify vignette](articles/compactify.html).

-```{r, eval=FALSE}
+```{r}
x <- dv %>%
  select(geo_value, time_value, version = issue, percent_cli = value) %>%
  as_epi_archive(compactify = TRUE)
@@ -81,15 +85,6 @@ class(x)
print(x)
```

-```{r, echo=FALSE, message=FALSE, warning=FALSE}
-x <- archive_cases_dv_subset$DT %>%
-  select(geo_value, time_value, version, percent_cli) %>%
-  as_epi_archive(compactify = TRUE)
-
-class(x)
-print(x)
-```
-
An `epi_archive` consists of a primary field `DT`, which is a data table
(from the `data.table` package) that has the columns `geo_value`, `time_value`,
`version` (and possibly additional ones), and other metadata fields, such as
@@ -127,17 +122,27 @@ object is instantiated, if they are not explicitly
specified in the function call (as it did in the case above).

## Summarizing Revision Behavior
-There are many ways to examine the ways that signals change across different revisions.
-The simplest that is included directly in epiprocess is `revision_summary()`, which computes simple summary statistics for each key (by default, `(geo_value,time_value)` pairs), such as the lag to the first value (latency). In addition to the per key summary, it also returns an overall summary:
+
+There are many ways to examine the ways that signals change across different
+revisions. The simplest that is included directly in epiprocess is
+`revision_summary()`, which computes simple summary statistics for each key (by
+default, `(geo_value,time_value)` pairs), such as the lag to the first value
+(latency). In addition to the per-key summary, it also returns an overall
+summary:
+
```{r}
revision_details <- revision_summary(x, print_inform = TRUE)
```

-So as was mentioned at the top, this is clearly a data set where basically everything has some amount of revisions, only 0.37% have no revision at all, and 0.92 have fewer than 3.
-Over 94% change by more than 10%.
-On the other hand, most are within plus or minus 20% within 5-9 days, so the revisions converge relatively quickly, even if the revisions continue for longer.
+So as was mentioned at the top, this is clearly a data set where basically
+everything has some amount of revisions: only 0.37% have no revision at all, and
+0.92% have fewer than 3. Over 94% change by more than 10%. On the other hand,
+most are within plus or minus 20% within 5-9 days, so the revisions converge
+relatively quickly, even if the revisions continue for longer.
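+
+The scope of this summary can be adjusted through `revision_summary()`'s
+arguments. As an illustrative sketch (not evaluated here, and with an arbitrary
+cutoff value), `min_waiting_period` drops recent `time_value`s that have not
+yet had enough time to settle:
+
+```{r, eval = FALSE}
+# Only summarize time_values observed at least 60 days before versions_end,
+# so that still-maturing dates don't distort the summary statistics:
+revision_summary(x, min_waiting_period = 60, print_inform = FALSE)
+```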
+ +To do more detailed analysis than is possible with the above printing, we have +`revision_details`: -To do more detailed analysis than is possible with the above printing, we have `revision_details`: ```{r} revision_details %>% group_by(geo_value) %>% @@ -150,13 +155,16 @@ revision_details %>% time_near_latest = mean(time_near_latest) ) ``` -Most of the states have similar stats on most of these features, except for Florida, which takes nearly double the amount of time to get close to the right value, with California not too far behind. + +Most of the states have similar stats on most of these features, except for +Florida, which takes nearly double the amount of time to get close to the right +value, with California not too far behind. ## Producing snapshots in `epi_df` form -A key method of an `epi_archive` class is `epix_as_of()`, which generates a snapshot -of the archive in `epi_df` format. This represents the most up-to-date values of -the signal variables as of a given version. +A key method of the `epi_archive` class is `epix_as_of()`, which generates a +snapshot of the archive in `epi_df` format. This represents the most up-to-date +values of the signal variables as of a given version. ```{r} x_snapshot <- epix_as_of(x, as.Date("2021-06-01")) ``` @@ -180,6 +188,7 @@ latest snapshot `x_latest` that the archive can provide). ```{r, fig.width = 8, fig.height = 7} theme_set(theme_bw()) +x_latest <- epix_as_of(x, x$versions_end) self_max <- max(x$DT$version) versions <- seq(as.Date("2020-06-01"), self_max - 1, by = "1 month") snapshots <- map_dfr(versions, function(v) { @@ -237,7 +246,7 @@ When merging archives, unless the archives have identical data release patterns, download the currently available version data for one of the archives, but not the other). -```{r, message = FALSE, warning = FALSE,eval=FALSE} +```{r, message = FALSE, warning = FALSE, eval=FALSE} y <- pub_covidcast( source = "jhu-csse", signals = "confirmed_7dav_incidence_prop", @@ -337,15 +346,13 @@ Next we slide this forecaster over the working `epi_archive` object, in order to forecast COVID-19 case rates 7 days into the future. ```{r} -fc_time_values <- seq(as.Date("2020-08-01"), - as.Date("2021-11-30"), - by = "1 month" -) +fc_time_values <- seq(as.Date("2020-08-01"), as.Date("2021-11-30"), by = "1 month") z <- x %>% group_by(geo_value) %>% epix_slide( - fc = prob_arx(x = percent_cli, y = case_rate_7d_av), .before = 119, + fc = prob_arx(x = percent_cli, y = case_rate_7d_av, ahead = 7), + .before = 119, .versions = fc_time_values ) %>% ungroup() @@ -353,8 +360,6 @@ z <- x %>% head(z, 10) ``` - - We get back a tibble `z` with the grouping variables (here geo value), the (reference) time values, and a ["packed"][tidyr::pack] data frame column `fc` containing `fc$point`, `fc$lower`, and `fc$upper` that correspond to the point @@ -377,22 +382,22 @@ points in time and forecast horizons.
The former comes from using x_latest <- epix_as_of(x, x$versions_end) # Simple function to produce forecasts k weeks ahead -k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { +forecast_k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { if (as_of) { x %>% - group_by(.data$geo_value) %>% + group_by(geo_value) %>% epix_slide( fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, ahead = ahead), .before = 119, .versions = fc_time_values ) %>% - mutate(target_date = .data$time_value + ahead, as_of = TRUE) %>% + mutate(target_date = .data$version + ahead, as_of = TRUE) %>% ungroup() } else { x_latest %>% - group_by(.data$geo_value) %>% + group_by(geo_value) %>% epi_slide( fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, ahead = ahead), .window_size = 120, - .versions = fc_time_values + .ref_time_values = fc_time_values ) %>% mutate(target_date = .data$time_value + ahead, as_of = FALSE) %>% ungroup() @@ -401,14 +406,14 @@ k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { # Generate the forecasts, and bind them together fc <- bind_rows( - k_week_ahead(x, ahead = 7, as_of = TRUE), - k_week_ahead(x, ahead = 14, as_of = TRUE), - k_week_ahead(x, ahead = 21, as_of = TRUE), - k_week_ahead(x, ahead = 28, as_of = TRUE), - k_week_ahead(x, ahead = 7, as_of = FALSE), - k_week_ahead(x, ahead = 14, as_of = FALSE), - k_week_ahead(x, ahead = 21, as_of = FALSE), - k_week_ahead(x, ahead = 28, as_of = FALSE) + forecast_k_week_ahead(x, ahead = 7, as_of = TRUE), + forecast_k_week_ahead(x, ahead = 14, as_of = TRUE), + forecast_k_week_ahead(x, ahead = 21, as_of = TRUE), + forecast_k_week_ahead(x, ahead = 28, as_of = TRUE), + forecast_k_week_ahead(x, ahead = 7, as_of = FALSE), + forecast_k_week_ahead(x, ahead = 14, as_of = FALSE), + forecast_k_week_ahead(x, ahead = 21, as_of = FALSE), + forecast_k_week_ahead(x, ahead = 28, as_of = FALSE) ) # Plot them, on top of latest COVID-19 case rates @@ -447,9 +452,250 @@ to look for more robust forecasting methodology. The forecasters that appear in the vignettes in the current package are only meant to demo the slide functionality with some of the most basic forecasting methodology possible. +## Sliding version-aware computations with geo-pooling + +First, we fetch the versioned data and build the archive. 
+ +```{r, message = FALSE, warning = FALSE, eval = FALSE} +library(epidatr) +library(data.table) +library(ggplot2) +theme_set(theme_bw()) + +y1 <- pub_covidcast( + source = "doctor-visits", + signals = "smoothed_adj_cli", + geo_type = "state", + time_type = "day", + geo_values = "ca,fl", + time_values = epirange(20200601, 20211201), + issues = epirange(20200601, 20211201) +) + +y2 <- pub_covidcast( + source = "jhu-csse", + signals = "confirmed_7dav_incidence_prop", + geo_type = "state", + time_type = "day", + geo_values = "ca,fl", + time_values = epirange(20200601, 20211201), + issues = epirange(20200601, 20211201) +) + +x <- y1 %>% + select(geo_value, time_value, + version = issue, + percent_cli = value + ) %>% + as_epi_archive(compactify = FALSE) + +# mutating merge operation: +x <- epix_merge( + x, + y2 %>% + select(geo_value, time_value, + version = issue, + case_rate_7d_av = value + ) %>% + as_epi_archive(compactify = FALSE), + sync = "locf", + compactify = FALSE +) +``` + +```{r, message = FALSE, echo = FALSE} +library(data.table) +library(ggplot2) +theme_set(theme_bw()) + +x <- archive_cases_dv_subset$DT %>% + filter(geo_value %in% c("ca", "fl")) %>% + as_epi_archive(compactify = FALSE) +``` + +Next, we extend the ARX function to handle multiple geo values, since in the +present case, we will not be grouping by geo value and each slide computation +will be run on multiple geo values at once. Note that, because `epix_slide()` +only returns the grouping variables, `version`, and the slide computations in +the eventual returned tibble, we need to include `geo_value` as a column in the +output data frame from our ARX computation. + +```{r} +library(tidyr) +library(purrr) + +prob_arx_args <- function(lags = c(0, 7, 14), + ahead = 7, + min_train_window = 20, + lower_level = 0.05, + upper_level = 0.95, + symmetrize = TRUE, + intercept = FALSE, + nonneg = TRUE) { + return(list( + lags = lags, + ahead = ahead, + min_train_window = min_train_window, + lower_level = lower_level, + upper_level = upper_level, + symmetrize = symmetrize, + intercept = intercept, + nonneg = nonneg + )) +} + +prob_arx <- function(x, y, geo_value, time_value, args = prob_arx_args()) { + # Return NA if insufficient training data + if (length(y) < args$min_train_window + max(args$lags) + args$ahead) { + return(data.frame( + geo_value = unique(geo_value), # Return geo value! + point = NA, lower = NA, upper = NA + )) + } + + # Set up x, y, lags list + if (!missing(x)) { + x <- data.frame(x, y) + } else { + x <- data.frame(y) + } + if (!is.list(args$lags)) args$lags <- list(args$lags) + args$lags <- rep(args$lags, length.out = ncol(x)) + + # Build features and response for the AR model, and then fit it + dat <- tibble(i = seq_len(ncol(x)), lag = args$lags) %>% + unnest(lag) %>% + mutate(name = paste0("x", seq_len(nrow(.)))) %>% # nolint: object_usage_linter + # One list element for each lagged feature + pmap(function(i, lag, name) { + tibble( + geo_value = geo_value, + time_value = time_value + lag, # Shift back + !!name := x[, i] + ) + }) %>% + # One list element for the response vector + c(list( + tibble( + geo_value = geo_value, + time_value = time_value - args$ahead, # Shift forward + y = y + ) + )) %>% + # Combine them together into one data frame + reduce(full_join, by = c("geo_value", "time_value")) %>% + arrange(time_value) + if (args$intercept) dat$x0 <- rep(1, nrow(dat)) + obj <- lm(y ~ .
+ 0, data = select(dat, -geo_value, -time_value)) + + # Use LOCF to fill NAs in the latest feature values (do this by geo value) + setDT(dat) # Convert to a data.table object by reference + cols <- setdiff(names(dat), c("geo_value", "time_value")) + dat[, (cols) := nafill(.SD, type = "locf"), .SDcols = cols, by = "geo_value"] + + # Make predictions + test_time_value <- max(time_value) + point <- predict( + obj, + newdata = dat %>% + dplyr::group_by(geo_value) %>% + dplyr::filter(time_value == test_time_value) + ) + + # Compute bands + r <- residuals(obj) + s <- ifelse(args$symmetrize, -1, NA) # Should the residuals be symmetrized? + q <- quantile(c(r, s * r), probs = c(args$lower_level, args$upper_level), na.rm = TRUE) + lower <- point + q[1] + upper <- point + q[2] + + # Clip at zero if we need to, then return + if (args$nonneg) { + point <- pmax(point, 0) + lower <- pmax(lower, 0) + upper <- pmax(upper, 0) + } + return(data.frame( + geo_value = unique(geo_value), # Return geo value! + point = point, lower = lower, upper = upper + )) +} +``` + +We now make forecasts on the archive and compare to forecasts on the latest +data. + +```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6} +# Latest snapshot of data, and forecast dates +x_latest <- epix_as_of(x, version = max(x$DT$version)) +fc_time_values <- seq(as.Date("2020-08-01"), + as.Date("2021-11-30"), + by = "1 month" +) + +# Simple function to produce forecasts k weeks ahead +forecast_k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { + if (as_of) { + x %>% + epix_slide( + fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, .data$geo_value, .data$time_value, + args = prob_arx_args(ahead = ahead) + ), + .before = 219, .versions = fc_time_values + ) %>% + mutate( + target_date = .data$version + ahead, as_of = TRUE, + geo_value = .data$fc$geo_value + ) + } else { + x_latest %>% + epi_slide( + fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, .data$geo_value, .data$time_value, + args = prob_arx_args(ahead = ahead) + ), + .window_size = 220, .ref_time_values = fc_time_values + ) %>% + mutate(target_date = .data$time_value + ahead, as_of = FALSE) + } +} + +# Generate the forecasts, and bind them together +fc <- bind_rows( + forecast_k_week_ahead(x, ahead = 7, as_of = TRUE), + forecast_k_week_ahead(x, ahead = 14, as_of = TRUE), + forecast_k_week_ahead(x, ahead = 21, as_of = TRUE), + forecast_k_week_ahead(x, ahead = 28, as_of = TRUE), + forecast_k_week_ahead(x, ahead = 7, as_of = FALSE), + forecast_k_week_ahead(x, ahead = 14, as_of = FALSE), + forecast_k_week_ahead(x, ahead = 21, as_of = FALSE), + forecast_k_week_ahead(x, ahead = 28, as_of = FALSE) +) + +# Plot them, on top of latest COVID-19 case rates +ggplot(fc, aes(x = target_date, group = time_value, fill = as_of)) + + geom_ribbon(aes(ymin = fc$lower, ymax = fc$upper), alpha = 0.4) + + geom_line( + data = x_latest, aes(x = time_value, y = case_rate_7d_av), + inherit.aes = FALSE, color = "gray50" + ) + + geom_line(aes(y = fc$point)) + + geom_point(aes(y = fc$point), size = 0.5) + + geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) + + facet_grid(vars(geo_value), vars(as_of), scales = "free") + + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + + labs(x = "Date", y = "Reported COVID-19 case rates") + + theme(legend.position = "none") +``` + +We can see that these forecasts, which come from training an ARX model jointly +over CA and FL, exhibit generally less variability and wider prediction bands +compared to the ones from the archive
vignette, which come from training a +separate ARX model on each state. As in the archive vignette, we can see a +difference between version-aware (right column) and -unaware (left column) +forecasting, as well. + ## Attribution + This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020. The `percent_cli` data is a modified part of the [COVIDcast Epidata API Doctor Visits data](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html). This dataset is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/). Copyright Delphi Research Group at Carnegie Mellon University 2020. - - diff --git a/vignettes/epiprocess.Rmd b/vignettes/epiprocess.Rmd index e6c78aba..b1840bb2 100644 --- a/vignettes/epiprocess.Rmd +++ b/vignettes/epiprocess.Rmd @@ -128,9 +128,7 @@ columns required for an `epi_df` object (along with many others). We can use frame into `epi_df` format. ```{r, message = FALSE} -x <- as_epi_df(cases, - as_of = max(cases$issue) -) %>% +x <- as_epi_df(cases, as_of = max(cases$issue)) %>% select(geo_value, time_value, total_cases = value) class(x) @@ -176,9 +174,11 @@ attributes(x)$metadata ``` ## Using additional key columns in `epi_df` + In the following examples we will show how to create an `epi_df` with additional keys. ### Converting a `tsibble` that has county code as an extra key + ```{r} ex1 <- tibble( geo_value = rep(c("ca", "fl", "pa"), each = 3), @@ -200,10 +200,10 @@ The metadata now includes `county_code` as an extra key. attr(ex1, "metadata") ``` - ### Dealing with misspecified column names `epi_df` requires there to be columns `geo_value` and `time_value`; if they do not exist, then `as_epi_df()` throws an error. + ```{r, error = TRUE} data.frame( # misnamed geo = rep(c("ca", "fl", "pa"), each = 3), # extra key pol = rep(c("blue", "swing", "swing"), each = 3), # misnamed - reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), length.out = length(geo_value)), - value = seq_along(geo_value) + 0.01 * withr::with_rng_version("3.0.0", withr::with_seed(42, length(geo_value))) + reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), length.out = 9), + value = 1:9 + 0.01 * withr::with_rng_version("3.0.0", withr::with_seed(42, 9)) ) %>% as_epi_df(as_of = as.Date("2024-03-20")) ``` The columns can be renamed to match `epi_df` format. In the example below, notice there is also an additional key `pol`. + ```{r} ex2 <- tibble( # misnamed @@ -240,7 +241,6 @@ ex2 <- ex2 %>% attr(ex2, "metadata") ``` - ### Adding additional keys to an `epi_df` object In the above examples, all the keys are added to objects that are not `epi_df` objects. We illustrate how to add keys to an `epi_df` object.
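For instance, here is a minimal sketch of one way to do it: round-trip an existing `epi_df` through a tibble and re-declare the keys during conversion. The `county_code` column and the object `x` here are hypothetical stand-ins, not necessarily the approach the vignette goes on to use:

```r
library(dplyr)
library(epiprocess)

# Sketch: suppose `x` is an existing epi_df that also carries a `county_code`
# column which we now want treated as a key. Convert back to a tibble, then
# re-declare the keys with as_epi_df():
x_with_keys <- x %>%
  as_tibble() %>%
  as_epi_df(other_keys = "county_code", as_of = attr(x, "metadata")$as_of)

attr(x_with_keys, "metadata")$other_keys # now includes "county_code"
```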
diff --git a/vignettes/growth_rate.Rmd b/vignettes/growth_rate.Rmd index abef646f..acbb53ee 100644 --- a/vignettes/growth_rate.Rmd +++ b/vignettes/growth_rate.Rmd @@ -22,6 +22,7 @@ library(tidyr) ``` The data is fetched with the following query: + ```{r, message = FALSE, eval=F} x <- pub_covidcast( source = "jhu-csse", @@ -38,7 +39,6 @@ x <- pub_covidcast( The data has 1,158 rows and 3 columns. - ```{r, echo=FALSE} data(jhu_csse_daily_subset) x <- jhu_csse_daily_subset %>% diff --git a/vignettes/outliers.Rmd b/vignettes/outliers.Rmd index ea3c30ac..1a2cfa41 100644 --- a/vignettes/outliers.Rmd +++ b/vignettes/outliers.Rmd @@ -127,11 +127,14 @@ vote across the base methods to determine whether a value is an outlier. ```{r} x <- x %>% group_by(geo_value) %>% - mutate(outlier_info = detect_outlr( - x = time_value, y = cases, - methods = detection_methods, - combiner = "median" - )) %>% + mutate( + outlier_info = detect_outlr( + x = time_value, + y = cases, + methods = detection_methods, + combiner = "median" + ) + ) %>% ungroup() %>% unnest(outlier_info) @@ -240,10 +243,9 @@ ggplot(y, aes(x = time_value)) + More advanced correction functionality will be coming at some point in the future. - ## Attribution + This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020. [From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html): These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes. - diff --git a/vignettes/slide.Rmd b/vignettes/slide.Rmd index 7ec6cc9b..92d8456d 100644 --- a/vignettes/slide.Rmd +++ b/vignettes/slide.Rmd @@ -11,21 +11,19 @@ A central tool in the `epiprocess` package is `epi_slide()`, which is based on the powerful functionality provided in the [`slider`](https://cran.r-project.org/web/packages/slider) package. In `epiprocess`, to "slide" means to apply a computation---represented as a -function or formula---over a sliding/rolling data window. Suitable -groupings can always be achieved by a preliminary call to `group_by()`. +function or formula---over a sliding/rolling data window. The slide is always +applied within each group, and the grouping is assumed to span all the group +keys of the `epi_df`; this is the grouping used by default if you do not group +the `epi_df` with a `group_by()` first (see the sketch below). -By default, the meaning of one time step is inferred from the `time_value` -column of the `epi_df` object under consideration, based on the way this column -understands addition and subtraction. For example, if the time values are coded -as `Date` objects, then one time step is one day, since `as.Date("2022-01-01") + -1` equals `as.Date("2022-01-02")`. Alternatively, the time step can be specified -manually in the call to `epi_slide()`; you can read the documentation for more -details.
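As a quick illustration of that default grouping, the two pipelines below should be equivalent, since `geo_value` is the only group key of the example data. This is a sketch using the `edf` object constructed later in this vignette:

```r
library(dplyr)
library(epiprocess)

# Explicit grouping before sliding
edf %>%
  group_by(geo_value) %>%
  epi_slide_mean("cases", .window_size = 7, na.rm = TRUE) %>%
  ungroup()

# Relying on the default grouping across all group keys (here, geo_value)
edf %>%
  epi_slide_mean("cases", .window_size = 7, na.rm = TRUE)
```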
+By default, the `.window_size` units depend on the `time_type` of the `epi_df`, +which is determined from the types in the `time_value` column of the `epi_df`. +See the "Details" in `epi_slide()` for more. As in the getting started guide, we'll fetch daily reported COVID-19 cases from CA, FL, NY, and TX (note: here we're using new, not cumulative cases) using the -[`epidatr`](https://github.com/cmu-delphi/epidatr) package, -and then convert this to `epi_df` format. +[`epidatr`](https://github.com/cmu-delphi/epidatr) package, and then convert +this to `epi_df` format. ```{r, message = FALSE, warning=FALSE} library(epidatr) library(epiprocess) library(dplyr) ``` The data is fetched with the following query: + ```{r, message = FALSE, eval=F} -x <- pub_covidcast( +edf <- pub_covidcast( source = "jhu-csse", signals = "confirmed_incidence_num", geo_type = "state", @@ -52,99 +51,106 @@ The data has 2,684 rows and 3 columns. ```{r, echo=FALSE} data(jhu_csse_daily_subset) -x <- jhu_csse_daily_subset %>% +edf <- jhu_csse_daily_subset %>% select(geo_value, time_value, cases) %>% arrange(geo_value, time_value) %>% as_epi_df() ``` -## Optimized rolling mean -We first demonstrate how to apply a 7-day trailing average to the daily cases -in order to smooth the signal, by passing in the name of the column(s) we -want to average for the first argument of `epi_slide_mean()`. `epi_slide_mean -()` can only be used for averaging. To do this computation per state, we -first call `group_by()`. +## Optimized rolling means and sums + +For the two most common sliding operations, we offer two optimized versions: +`epi_slide_mean()` and `epi_slide_sum()`. This example computes the 7-day +trailing average of the daily cases. Note that the name of the column(s) that we +want to average is specified as the first argument of `epi_slide_mean()`. ```{r} -x %>% +edf %>% group_by(geo_value) %>% - epi_slide_mean("cases", .window_size = 7) %>% + epi_slide_mean("cases", .window_size = 7, na.rm = TRUE) %>% ungroup() %>% head(10) ``` -The calculation is done using `data.table::frollmean`, whose behavior can be -adjusted by passing relevant arguments via `...`. +Note that we passed `na.rm = TRUE` through to `data.table::frollmean()` via the +`...` argument of `epi_slide_mean()`. + +The following computes the 7-day trailing sum of daily cases (passing `na.rm` +through to `data.table::frollsum()` similarly): + +```{r} +edf %>% + group_by(geo_value) %>% + epi_slide_sum("cases", .window_size = 7, na.rm = TRUE) %>% + ungroup() %>% + head(10) +``` -## Slide with a formula +## General sliding with a formula -The previous computation can also be performed using `epi_slide()`, which is -more flexible but quite a bit slower than `epi_slide_mean()`. It is -recommended to use `epi_slide_mean()` when possible. +The previous computations can also be performed using `epi_slide()`, which can +be used for more general sliding computations (but is much slower for the +specific cases of mean and sum). The same 7-day trailing average of daily cases can be computed by passing in a -formula for the first argument of `epi_slide()`. To do this per state, we -first call `group_by()`. +formula for the first argument of `epi_slide()`: ```{r} -x %>% +edf %>% group_by(geo_value) %>% - epi_slide(~ mean(.x$cases), .window_size = 7) %>% + epi_slide(~ mean(.x$cases, na.rm = TRUE), .window_size = 7) %>% ungroup() %>% head(10) ``` -The formula specified has access to all non-grouping columns present in the -original `epi_df` object (and must refer to them with the prefix `.x$`).
As we -can see, the function `epi_slide()` returns an `epi_df` object with a new column -appended that contains the results (from sliding), named `slide_value` as the -default. We can of course change this post hoc, or we can instead specify a new -name up front using the `.new_col_name` argument: +If your formula returns a data.frame, then the columns of the data.frame +will be unpacked into the resulting `epi_df`. For example, the following +computes the 7-day trailing average of daily cases and the 7-day trailing sum of +daily cases: ```{r} -x <- x %>% +edf %>% group_by(geo_value) %>% - epi_slide(~ mean(.x$cases), .window_size = 7, .new_col_name = "cases_7dav") %>% - ungroup() - -head(x, 10) + epi_slide( + ~ data.frame(cases_mean = mean(.x$cases, na.rm = TRUE), cases_sum = sum(.x$cases, na.rm = TRUE)), + .window_size = 7 + ) %>% + ungroup() %>% + head(10) ``` +Note that this formula has access to all non-grouping columns present in the +original `epi_df` object and must refer to them with the prefix `.x$`. As we +can see, the function `epi_slide()` returns an `epi_df` object with a new column +appended that contains the results (from sliding), named `slide_value` as the +default. + Some other information is available in additional variables: * `.group_key` is a one-row tibble containing the values of the grouping variables for the associated group * `.ref_time_value` is the reference time value the time window was based on -Like in `group_modify()`, there are alternative names for these variables as -well: `.` can be used instead of `.x`, `.y` instead of `.group_key`, and `.z` -instead of `.ref_time_value`. - -## Slide with a function - -We can also pass a function for the first argument in `epi_slide()`. In this -case, the passed function must accept the following arguments: - -In this case, the passed function `.f` must accept the following arguments: a -data frame with the same column names as the original object, minus any grouping -variables, containing the time window data for one group-`.ref_time_value` -combination; followed by a one-row tibble containing the values of the grouping -variables for the associated group; followed by the associated `.ref_time_value`. -It can accept additional arguments; `epi_slide()` will forward any `...` args it -receives to `.f`. - -Recreating the last example of a 7-day trailing average: - ```{r} -x <- x %>% +# Returning geo_value in the formula +edf %>% group_by(geo_value) %>% - epi_slide(function(x, gk, rtv) mean(x$cases), .window_size = 7, .new_col_name = "cases_7dav") %>% - ungroup() + epi_slide(~ .x$geo_value[[1]], .window_size = 7) %>% + ungroup() %>% + head(10) -head(x, 10) +# Returning time_value in the formula +edf %>% + group_by(geo_value) %>% + epi_slide(~ .x$time_value[[1]], .window_size = 7) %>% + ungroup() %>% + head(10) ``` +While the computations above are not very useful on their own, they can serve as +building blocks for computations that behave differently depending on the +`geo_value` or `.ref_time_value`. + ## Slide the tidy way Perhaps the most convenient way to set up a computation in `epi_slide()` is to @@ -154,15 +160,17 @@ to a computation in which we can access any columns of `.x` by name, just as we would in a call to `dplyr::mutate()`, or any of the `dplyr` verbs.
For example: ```{r} -x <- x %>% +slide_output <- edf %>% group_by(geo_value) %>% - epi_slide(cases_7dav = mean(cases), .window_size = 7) %>% - ungroup() - -head(x, 10) + epi_slide(cases_7dav = mean(cases, na.rm = TRUE), .window_size = 7) %>% + ungroup() # keep the full result; it is plotted below + +head(slide_output, 10) ``` - -In addition to referring to individual columns by name, you can refer to the -time window data as an `epi_df` or `tibble` using `.x`. Similarly, the other arguments of the function format are available through the magic names `.group_key` and `.ref_time_value`, and the tidyverse "pronouns" `.data` and `.env` can also be used. + +In addition to referring to individual columns by name, you can refer to the +time window data as an `epi_df` via `.x` (`.group_key` and `.ref_time_value` are +still available). The tidyverse "pronouns" `.data` and `.env` can also be used +if you need to distinguish between the data and the environment. As a simple sanity check, we visualize the 7-day trailing averages computed on top of the original counts: @@ -171,7 +179,7 @@ top of the original counts: library(ggplot2) theme_set(theme_bw()) -ggplot(x, aes(x = time_value)) + +ggplot(slide_output, aes(x = time_value)) + geom_col(aes(y = cases, fill = geo_value), alpha = 0.5, show.legend = FALSE) + geom_line(aes(y = cases_7dav, col = geo_value), show.legend = FALSE) + facet_wrap(~geo_value, scales = "free_y") + @@ -182,18 +190,40 @@ ggplot(x, aes(x = time_value)) + As we can see from the top right panel, it looks like Texas moved to weekly reporting of COVID-19 cases in summer of 2021. -## Running a local forecaster +## Slide with a function + +We can also pass a function as the second argument of `epi_slide()`. In this +case, the passed function `.f` must have the form `function(x, g, t, ...)`, +where -As a more complex example, we create a forecaster based on a local (in time) -autoregression or AR model. AR models can be fit in numerous ways (using base R -functions and various packages), but here we define it "by hand" both because it -provides a more advanced example of sliding a function over an `epi_df` object, -and because it allows us to be a bit more flexible in defining a *probabilistic* -forecaster: one that outputs not just a point prediction, but a notion of -uncertainty around this. In particular, our forecaster will output a point -prediction along with an 90\% uncertainty band, represented by a predictive -quantiles at the 5\% and 95\% levels (lower and upper endpoints of the -uncertainty band). +- `x` is a data frame with the same column names as the original `epi_df`, +minus any grouping variables +- `g` is a one-row tibble containing the values of the grouping variables +for the associated group +- `t` is the `.ref_time_value` for the current window +- `...` are additional arguments + +Recreating the last example of a 7-day trailing average: + +```{r} +edf %>% + group_by(geo_value) %>% + epi_slide(function(x, g, t) mean(x$cases, na.rm = TRUE), .window_size = 7) %>% + ungroup() %>% + head(10) +``` + +## Running a simple autoregressive forecaster + +As a more complex example, we create a forecaster based on an autoregression or +AR model. AR models can be fit in numerous ways (using base R functions and +various packages), but here we define it "by hand" both because it provides a +more advanced example of sliding a function over an `epi_df` object, and because +it allows us to be a bit more flexible in defining a *probabilistic* forecaster: +one that outputs not just a point prediction, but a notion of uncertainty around +this.
In particular, our forecaster will output a point prediction along with a +90\% uncertainty band, represented by predictive quantiles at the 5\% and 95\% +levels (the lower and upper endpoints of the uncertainty band). The function defined below, `prob_ar()`, is our probabilistic AR forecaster. The `lags` argument indicates which lags to use in the model, and `ahead` indicates @@ -210,6 +240,9 @@ prob_ar <- function(y, lags = c(0, 7, 14), ahead = 6, min_train_window = 20, return(data.frame(point = NA, lower = NA, upper = NA)) } + # Filter out the NAs at the window edges + y <- y[!is.na(y)] + # Build features and response for the AR model dat <- do.call( data.frame, @@ -246,29 +279,21 @@ scale of smoothed COVID-19 cases. This is clearly equivalent, up to a constant, to modeling weekly sums of COVID-19 cases. ```{r} -fc_time_values <- seq(as.Date("2020-06-01"), - as.Date("2021-12-01"), - by = "1 months" -) -x %>% +fc_time_values <- seq(as.Date("2020-06-01"), as.Date("2021-12-01"), by = "1 months") +edf %>% group_by(geo_value) %>% - epi_slide( - fc = prob_ar(cases_7dav), .window_size = 120, - .ref_time_values = fc_time_values - ) %>% + epi_slide(cases_7dav = mean(.data$cases, na.rm = TRUE), .window_size = 7) %>% + epi_slide(fc = prob_ar(.data$cases_7dav), .window_size = 120, .ref_time_values = fc_time_values) %>% ungroup() %>% head(10) ``` Note that here we have utilized an argument `.ref_time_values` to perform the sliding computation (here, compute a forecast) at a specific subset of reference -time values. We get out a ["packed"][tidyr::pack] data frame column `fc` -containing `fc$point`, `fc$lower`, and `fc$upper` that correspond to the point -forecast, and the lower and upper endpoints of the 95\% prediction band, -respectively. (We could also have used `, prob_ar(cases_7dav)` to get three -separate columns `point`, `lower`, and `upper`, or used `fc = -list(prob_ar(cases_7dav))` to make an `fc` column with a ["nested"][tidyr::nest] -format (list of data frames) instead.) +time values (the start of every month from mid 2020 to the end of 2021). The +resulting `epi_df` now contains three new columns: `fc$point`, `fc$lower`, and +`fc$upper` corresponding to the point forecast, and the lower and upper +endpoints of the 90\% prediction band, respectively. To finish off, we plot the forecasts at some times (spaced out by a few months) over the last year, at multiple horizons: 7, 14, 21, and 28 days ahead. To do @@ -279,10 +304,13 @@ so that we can call it a few times.
# Note the use of .all_rows = TRUE (keeps all original rows in the output) k_week_ahead <- function(x, ahead = 7) { x %>% - group_by(.data$geo_value) %>% + group_by(geo_value) %>% + epi_slide(cases_7dav = mean(.data$cases, na.rm = TRUE), .window_size = 7) %>% epi_slide( fc = prob_ar(.data$cases_7dav, ahead = ahead), - .window_size = 120, .ref_time_values = fc_time_values, .all_rows = TRUE + .window_size = 120, + .ref_time_values = fc_time_values, + .all_rows = TRUE ) %>% ungroup() %>% mutate(target_date = .data$time_value + ahead) @@ -290,10 +318,10 @@ k_week_ahead <- function(x, ahead = 7) { # First generate the forecasts, and bind them together z <- bind_rows( - k_week_ahead(x, ahead = 7), - k_week_ahead(x, ahead = 14), - k_week_ahead(x, ahead = 21), - k_week_ahead(x, ahead = 28) + k_week_ahead(edf, ahead = 7), + k_week_ahead(edf, ahead = 14), + k_week_ahead(edf, ahead = 21), + k_week_ahead(edf, ahead = 28) ) # Now plot them, on top of actual COVID-19 case counts @@ -341,8 +369,10 @@ example in the [archive vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html). ## Attribution + +The `percent_cli` data is a modified part of the [COVIDcast Epidata API Doctor Visits data](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html). This dataset is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/). Copyright Delphi Research Group at Carnegie Mellon University 2020. + This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020. [From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html): - These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes. - +These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes.