Merge pull request #17 from JamesHWade/feature/pad-missing-values

[Feature] Pad Missing Values
JamesHWade · Mar 3, 2024 · cdc7b5a · cdc7b5a
2 parents bc0383a + 7e410b4
commit cdc7b5a
Show file tree

Hide file tree

Showing 31 changed files with 387 additions and 292 deletions.
diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
@@ -1,9 +1,5 @@
 # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
-#
-# NOTE: This workflow is overkill for most R packages and
-# check-standard.yaml is likely a better choice.
-# usethis::use_github_action("check-standard") will install it.
 on:
   push:
     branches: [main, master]
@@ -23,26 +19,17 @@ jobs:
       matrix:
         config:
           - {os: macos-latest,   r: 'release'}
-
           - {os: windows-latest, r: 'release'}
-          # Use 3.6 to trigger usage of RTools35
-          - {os: windows-latest, r: '3.6'}
-          # use 4.1 to check with rtools40's older compiler
-          - {os: windows-latest, r: '4.1'}
-
           - {os: ubuntu-latest,   r: 'devel', http-user-agent: 'release'}
           - {os: ubuntu-latest,   r: 'release'}
           - {os: ubuntu-latest,   r: 'oldrel-1'}
-          - {os: ubuntu-latest,   r: 'oldrel-2'}
-          - {os: ubuntu-latest,   r: 'oldrel-3'}
-          - {os: ubuntu-latest,   r: 'oldrel-4'}
 
     env:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
       R_KEEP_PKG_SOURCE: yes
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - uses: r-lib/actions/setup-pandoc@v2
 

diff --git a/.github/workflows/pr-commands.yaml b/.github/workflows/pr-commands.yaml
@@ -14,7 +14,7 @@ jobs:
     env:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - uses: r-lib/actions/pr-fetch@v2
         with:
@@ -51,7 +51,7 @@ jobs:
     env:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - uses: r-lib/actions/pr-fetch@v2
         with:

diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml
@@ -15,7 +15,7 @@ jobs:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - uses: r-lib/actions/setup-r@v2
         with:
@@ -31,7 +31,7 @@ jobs:
           covr::codecov(
             quiet = FALSE,
             clean = FALSE,
-            install_path = file.path(Sys.getenv("RUNNER_TEMP"), "package")
+            install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package")
           )
         shell: Rscript {0}
 
@@ -44,7 +44,7 @@ jobs:
 
       - name: Upload test results
         if: failure()
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: coverage-test-failures
           path: ${{ runner.temp }}/package
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -39,4 +39,4 @@ Config/testthat/edition: 3
 Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.3.9000
+RoxygenNote: 7.3.1
diff --git a/R/baseline.R b/R/baseline.R
@@ -70,17 +70,18 @@ print.step_baseline <-
 #'
 #' @examples
 #' meats_long %>% subtract_rf_baseline(yvar = transmittance)
-subtract_rf_baseline <- function(data, yvar, span = 2/3, maxit = c(5, 5)){
-
+subtract_rf_baseline <- function(data, yvar, span = 2 / 3, maxit = c(5, 5)) {
   # rlang::arg_match0(as.character(rlang::enquo(yvar)), values = names(data))
 
   data %>%
     dplyr::mutate(
       raw = {{ yvar }},
-      baseline = IDPmisc::rfbaseline(x = 1:length({{ yvar }}),
-                                     y = {{ yvar }},
-                                     span = span,
-                                     maxit = maxit)$fit,
+      baseline = IDPmisc::rfbaseline(
+        x = 1:length({{ yvar }}),
+        y = {{ yvar }},
+        span = span,
+        maxit = maxit
+      )$fit,
       {{ yvar }} := {{ yvar }} - baseline
     )
 }

diff --git a/R/checks.R b/R/checks.R
@@ -11,8 +11,10 @@ check_missing_measures <- function(.data, loc) {
 
 check_single_selector <- function(res, arg) {
   if (length(res) != 1) {
-    msg <- paste0("The selection for `", arg, "` should only select a single ",
-                  "column (", length(res), " columns were selected).")
+    msg <- paste0(
+      "The selection for `", arg, "` should only select a single ",
+      "column (", length(res), " columns were selected)."
+    )
     rlang::abort(msg)
   }
 }
@@ -31,3 +33,30 @@ check_measure_dims <- function(x) {
   }
   invisible(NULL)
 }
+
+pad_measure_dims <- function(x) {
+  # Determine the most frequent number of rows
+  num_rows <- purrr::map_int(x$.measures, nrow)
+  num_unique <- sort(table(num_rows), decreasing = TRUE)
+  most_freq <- as.integer(names(num_unique)[1])
+
+  # Pad each measure so they all have 'most_freq' rows
+  x$.measures <- purrr::map(x$.measures, ~ {
+    df <- .x
+    if (nrow(df) < most_freq) {
+      # Calculate how many rows to add
+      rows_to_add <- most_freq - nrow(df)
+      # Create a data frame with the required number of missing rows
+      missing_rows <-
+        purrr::map_dfc(names(df), ~ rep(NA_real_, rows_to_add)) %>%
+        tibble::as_tibble() %>%
+        setNames(names(df)) %>%
+        suppressMessages() # suppress message about new column names
+      # Bind the missing rows to the original data frame
+      df <- bind_rows(df, missing_rows)
+      df
+    }
+    df
+  })
+  x
+}
diff --git a/R/meats.R → R/data-meats.R b/R/meats.R → R/data-meats.R
diff --git a/R/helpers.R b/R/helpers.R
@@ -35,7 +35,7 @@ matrix_to_measure <- function(x, loc) {
   }
 
   x <- t(x)
-  x <-  tibble::as_tibble(x, .name_repair = "minimal")
+  x <- tibble::as_tibble(x, .name_repair = "minimal")
 
   res <- purrr::map(x, ~ tibble::new_tibble(list(location = loc, value = .x)))
   unname(res)
@@ -70,10 +70,12 @@ check_has_measure <- function(x, cl) {
 
   if (!any(names(x) == ".measures")) {
     msg <-
-      paste0("It appears that the measurements have not been converted ",
-             "for the inernal format. See `step_measure_input_long()` ",
-             "and `step_measure_input_wide()` and use these prior to ",
-             step_fn)
+      paste0(
+        "It appears that the measurements have not been converted ",
+        "for the inernal format. See `step_measure_input_long()` ",
+        "and `step_measure_input_wide()` and use these prior to ",
+        step_fn
+      )
     rlang::abort(msg)
   }
 }
diff --git a/R/input_long.R b/R/input_long.R
@@ -7,14 +7,16 @@
 #' @family input/output steps
 #' @inheritParams recipes::step_center
 #' @param ... One or more selector functions to choose which _single_ column
-#' contains the analytical measurements. The selection should be in the order
-#' of the measurement's profile.
+#'   contains the analytical measurements. The selection should be in the order
+#'   of the measurement's profile.
 #' @param location One or more selector functions to choose which _single_
-#' column has the locations of the analytical values.
+#'   column has the locations of the analytical values.
+#' @param pad Whether to pad the measurements to ensure that they all have the
+#'   same number of values. This is useful when there are missing values in the
+#'   measurements.
 #' @param columns A character vector of column names determined by the recipe.
-#' @details
-#' This step is designed for data in a format where there is a column for the
-#' analytical measurement (e.g., absorption, etc.) and another with the
+#' @details This step is designed for data in a format where there is a column
+#' for the analytical measurement (e.g., absorption, etc.) and another with the
 #' location of the value (e.g., wave number, etc.).
 #'
 #' `step_measure_input_long()` will collect those data and put them into a
@@ -48,6 +50,7 @@ step_measure_input_long <-
   function(recipe,
            ...,
            location,
+           pad = FALSE,
            role = "measure",
            trained = FALSE,
            columns = NULL,
@@ -64,21 +67,23 @@ step_measure_input_long <-
         role = role,
         columns = columns,
         location = location,
+        pad = pad,
         skip = skip,
         id = id
       )
     )
   }
 
 step_measure_input_long_new <-
-  function(terms, role, trained, columns, location, na_rm, skip, id) {
+  function(terms, role, trained, columns, location, pad, skip, id) {
     step(
       subclass = "measure_input_long",
       terms = terms,
       role = role,
       trained = trained,
       columns = columns,
       location = location,
+      pad = pad,
       skip = skip,
       id = id
     )
@@ -104,6 +109,7 @@ prep.step_measure_input_long <- function(x, training, info = NULL, ...) {
     trained = TRUE,
     columns = unname(c(value_name, loc_name)),
     location = x$location,
+    pad = x$pad,
     skip = x$skip,
     id = x$id
   )
@@ -125,6 +131,10 @@ bake.step_measure_input_long <- function(object, new_data, ...) {
       tidyr::nest(.by = c(-value), .key = ".measures")
   }
 
+  if (rlang::is_true(object$pad)) {
+    new_data <- pad_measure_dims(new_data)
+  }
+
   check_measure_dims(new_data)
 
   new_data
@@ -144,12 +154,16 @@ print.step_measure_input_long <-
 #' @export
 tidy.step_measure_input_long <- function(x, ...) {
   if (is_trained(x)) {
-    res <- tibble(terms = x$columns[!is.na(x$columns)],
-                  value = na_dbl)
+    res <- tibble(
+      terms = x$columns[!is.na(x$columns)],
+      value = na_dbl
+    )
   } else {
     term_names <- sel2char(x$terms)
-    res <- tibble(terms = term_names,
-                  value = na_dbl)
+    res <- tibble(
+      terms = term_names,
+      value = na_dbl
+    )
   }
   res$id <- x$id
   res
@@ -163,5 +177,3 @@ rename_long_cols <- function(.data, val_chr, loc_chr) {
   }
   dplyr::rename(.data, dplyr::all_of(res))
 }
-
-
diff --git a/R/input_wide.R b/R/input_wide.R
@@ -66,30 +66,30 @@
 #' @export
 
 step_measure_input_wide <-
-    function(recipe,
-             ...,
-             role = "measure",
-             trained = FALSE,
-             columns = NULL,
-             location_values = NULL,
-             skip = FALSE,
-             id = rand_id("measure_input_wide")) {
-      add_step(
-        recipe,
-        step_measure_input_wide_new(
-          terms = enquos(...),
-          trained = trained,
-          role = role,
-          columns = columns,
-          location_values = location_values,
-          skip = skip,
-          id = id
-        )
+  function(recipe,
+           ...,
+           role = "measure",
+           trained = FALSE,
+           columns = NULL,
+           location_values = NULL,
+           skip = FALSE,
+           id = rand_id("measure_input_wide")) {
+    add_step(
+      recipe,
+      step_measure_input_wide_new(
+        terms = enquos(...),
+        trained = trained,
+        role = role,
+        columns = columns,
+        location_values = location_values,
+        skip = skip,
+        id = id
       )
-    }
+    )
+  }
 
 step_measure_input_wide_new <-
-  function(terms, role, trained, columns, location_values, na_rm, skip, id) {
+  function(terms, role, trained, columns, location_values, skip, id) {
     step(
       subclass = "measure_input_wide",
       terms = terms,
@@ -111,8 +111,10 @@ prep.step_measure_input_wide <- function(x, training, info = NULL, ...) {
     num_inputs <- length(col_names)
     num_loc <- length(x$location_values)
     if (num_inputs != num_loc) {
-      msg <- paste0(num_inputs, " columns were selected as inputs but ",
-                    "`location_values` has ", num_loc, " values.")
+      msg <- paste0(
+        num_inputs, " columns were selected as inputs but ",
+        "`location_values` has ", num_loc, " values."
+      )
       rlang::abort(msg)
     }
     # if
@@ -133,7 +135,6 @@ prep.step_measure_input_wide <- function(x, training, info = NULL, ...) {
 
 #' @export
 bake.step_measure_input_wide <- function(object, new_data, ...) {
-
   # TODO check to make sure that the nested tibble has the same number of rows
   # in case the nesting was bad
   wide_to_list(new_data, object$location_values, object$terms)
@@ -151,12 +152,16 @@ print.step_measure_input_wide <-
 #' @export
 tidy.step_measure_input_wide <- function(x, ...) {
   if (is_trained(x)) {
-    res <- tibble(terms = x$columns,
-                  value = na_dbl)
+    res <- tibble(
+      terms = x$columns,
+      value = na_dbl
+    )
   } else {
     term_names <- sel2char(x$terms)
-    res <- tibble(terms = term_names,
-                  value = na_dbl)
+    res <- tibble(
+      terms = term_names,
+      value = na_dbl
+    )
   }
   res$id <- x$id
   res