Modularize str profile preprocessing

duke-malaria-collaboratory · Sep 7, 2023 · 9a2d61d · 9a2d61d
1 parent e575e8b
commit 9a2d61d
Show file tree

Hide file tree

Showing 20 changed files with 381 additions and 228 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -5,6 +5,8 @@ export(calc_allele_freqs)
 export(calc_log10_lrs)
 export(filter_peaks)
 export(identify_matches)
+export(prep_bloodmeal_profiles)
+export(prep_human_profiles)
 export(rm_dups)
 export(rm_twins)
 import(euroformix)
diff --git a/R/bistro.R b/R/bistro.R
@@ -114,39 +114,19 @@ bistro <-
       pop_allele_freqs <- calc_allele_freqs(human_profiles)
     }
 
-    if (rm_twins) {
-      human_profiles <- rm_twins(human_profiles)
-    }
-
     message("Formatting bloodmeal profiles")
-
-    if (is.null(bloodmeal_ids)) {
-      bloodmeal_ids <- unique(bloodmeal_profiles$SampleName)
-    } else {
-      bloodmeal_ids <-
-        subset_ids(bloodmeal_ids, bloodmeal_profiles$SampleName)
-    }
-
-    bloodmeal_profiles <- bloodmeal_profiles |>
-      dplyr::filter(SampleName %in% bloodmeal_ids)
-
-    check_heights(bloodmeal_profiles$Height, peak_thresh)
-
-    bloodmeal_profiles <- bloodmeal_profiles |>
-      rm_dups() |>
-      filter_peaks(peak_thresh)
+    bloodmeal_profiles <- prep_bloodmeal_profiles(
+      bloodmeal_profiles,
+      peak_thresh,
+      bloodmeal_ids
+    )
 
     message("Formatting human profiles")
-
-    if (is.null(human_ids)) {
-      human_ids <- unique(human_profiles$SampleName)
-    } else {
-      human_ids <- subset_ids(human_ids, human_profiles$SampleName)
-    }
-
-    human_profiles <- human_profiles |>
-      dplyr::filter(SampleName %in% human_ids) |>
-      rm_dups()
+    human_profiles <- prep_human_profiles(
+      human_profiles,
+      human_ids,
+      rm_twins
+    )
 
     message("Calculating log10LRs")
 

diff --git a/R/calc_log10_lrs.R b/R/calc_log10_lrs.R
@@ -116,6 +116,11 @@ calc_one_log10_lr <-
 
 #' Calculate log10_lrs for multiple bloodmeal-human pairs
 #'
+#' Note that this function doesn't preprocess the bloodmeal and human profile
+#' data. If you would like to preprocess it in the same way as is performed
+#' internally in the `bistro()` function, you must run
+#' `prep_bloodmeal_profiles()` and `prep_human_profiles()` first.
+#'
 #' @inheritParams bistro
 #'
 #' @return A tibble with the same output as for [bistro()], except there is no

diff --git a/R/identify_matches.R b/R/identify_matches.R
@@ -29,7 +29,11 @@ identify_one_match_set <- function(log10_lrs, bloodmeal_id) {
   )
 
   if (all(is.na(log10_lrs$log10_lr) | is.infinite(log10_lrs$log10_lr))) {
-    matches_thresh <- matches
+    matches_thresh <- matches |>
+      dplyr::mutate(notes = ifelse(is.na(notes),
+        "all log10LRs NA or Inf",
+        paste0("all log10LRs NA or Inf", ";", notes)
+      ))
   } else if (max(log10_lrs$log10_lr[!is.infinite(log10_lrs$log10_lr)]) < 1.5 &&
     is.na(notes)) {
     matches_thresh <- matches |>

diff --git a/R/preprocess_data.R b/R/preprocess_data.R
@@ -0,0 +1,145 @@
+#' Preprocess bloodmeal profiles
+#'
+#' Removes duplicates and peaks below threshold, subsets ids
+#'
+#' @inheritParams bistro
+#'
+#' @return Dataframe with preprocessed bloodmeal profiles
+#' @export
+#' @keywords internal
+prep_bloodmeal_profiles <- function(bloodmeal_profiles,
+                                    peak_thresh,
+                                    bloodmeal_ids = NULL) {
+  if (is.null(bloodmeal_ids)) {
+    bloodmeal_ids <- unique(bloodmeal_profiles$SampleName)
+  } else {
+    bloodmeal_ids <-
+      subset_ids(bloodmeal_ids, bloodmeal_profiles$SampleName)
+  }
+
+  bloodmeal_profiles <- bloodmeal_profiles |>
+    dplyr::filter(SampleName %in% bloodmeal_ids)
+
+  check_heights(bloodmeal_profiles$Height, peak_thresh)
+
+  bloodmeal_profiles <- bloodmeal_profiles |>
+    rm_dups() |>
+    filter_peaks(peak_thresh)
+
+  return(bloodmeal_profiles)
+}
+
+#' Preprocess human profiles
+#'
+#' Removes duplicates and optionally twins, subsets ids
+#'
+#' @inheritParams bistro
+#'
+#' @return Dataframe with preprocessed human profiles
+#' @export
+#' @keywords internal
+prep_human_profiles <- function(human_profiles,
+                                human_ids = NULL,
+                                rm_twins = TRUE) {
+  if (rm_twins) {
+    human_profiles <- rm_twins(human_profiles)
+  }
+
+  if (is.null(human_ids)) {
+    human_ids <- unique(human_profiles$SampleName)
+  } else {
+    human_ids <- subset_ids(human_ids, human_profiles$SampleName)
+  }
+
+  human_profiles <- human_profiles |>
+    dplyr::filter(SampleName %in% human_ids) |>
+    rm_dups()
+
+  return(human_profiles)
+}
+
+#' Remove duplicate rows with warning
+#'
+#' @param df Dataframe from which to remove duplicate rows
+#'
+#' @return Un-duplicated dataframe
+#' @export
+#' @keywords internal
+rm_dups <- function(df) {
+  n_orig <- nrow(df)
+  if (n_orig != dplyr::n_distinct(df)) {
+    df <- df |>
+      dplyr::distinct()
+    warning(paste0(
+      "Detected and removed ",
+      n_orig - nrow(df),
+      " duplicate rows."
+    ))
+  }
+  return(df)
+}
+
+#' Remove identical STR profiles
+#'
+#' @inheritParams bistro
+#'
+#' @return Data frame with twins removed
+#' @export
+#' @keywords internal
+rm_twins <- function(human_profiles) {
+  not_twins <- human_profiles |>
+    dplyr::arrange(Marker, Allele) |>
+    dplyr::group_by(SampleName) |>
+    dplyr::summarize(all_alleles = stringr::str_c(paste0(Marker, Allele),
+      collapse = ";"
+    )) |>
+    dplyr::group_by(all_alleles) |>
+    dplyr::mutate(n = dplyr::n()) |>
+    dplyr::filter(n == 1) |>
+    dplyr::pull(SampleName)
+
+  n_ident_profs <-
+    dplyr::n_distinct(human_profiles$SampleName) - length(not_twins)
+
+  if (n_ident_profs > 0) {
+    message(
+      paste0(
+        "Identified ",
+        n_ident_profs,
+        " people whose profiles appear more than once",
+        " (likely identical twins). These are being removed."
+      )
+    )
+    human_profiles <- human_profiles |>
+      dplyr::filter(SampleName %in% not_twins)
+  }
+
+  return(human_profiles)
+}
+
+#' Subset to ids present in the dataset
+#'
+#' @param ids ids to check if in vec
+#' @param vec vector of ids
+#' @param vec_name name of vector
+#'
+#' @return list of IDs that are present
+#' @keywords internal
+subset_ids <- function(ids, vec, vec_name) {
+  id_absent <- setdiff(ids, vec)
+  if (length(id_absent) > 0) {
+    warning(
+      "Removing ",
+      length(id_absent),
+      " ",
+      vec_name,
+      " not found in the dataset: ",
+      id_absent
+    )
+  }
+  ids <- intersect(ids, vec)
+  if (length(ids) == 0) {
+    stop("None of the provided ", vec_name, " are present in the dataset.")
+  }
+  return(ids)
+}
diff --git a/R/utils.R b/R/utils.R
@@ -30,89 +30,3 @@ utils::globalVariables(
 ignore_unused_imports <- function() {
   codetools::checkUsage
 }
-
-#' Remove duplicate rows with warning
-#'
-#' @param df Dataframe from which to remove duplicate rows
-#'
-#' @return Un-duplicated dataframe
-#' @export
-#' @keywords internal
-rm_dups <- function(df) {
-  n_orig <- nrow(df)
-  if (n_orig != dplyr::n_distinct(df)) {
-    df <- df |>
-      dplyr::distinct()
-    warning(paste0(
-      "Detected and removed ",
-      n_orig - nrow(df),
-      " duplicate rows."
-    ))
-  }
-  return(df)
-}
-
-#' Remove identical STR profiles
-#'
-#' @inheritParams bistro
-#'
-#' @return Data frame with twins removed
-#' @export
-#' @keywords internal
-rm_twins <- function(human_profiles) {
-  not_twins <- human_profiles |>
-    dplyr::arrange(Marker, Allele) |>
-    dplyr::group_by(SampleName) |>
-    dplyr::summarize(all_alleles = stringr::str_c(paste0(Marker, Allele),
-      collapse = ";"
-    )) |>
-    dplyr::group_by(all_alleles) |>
-    dplyr::mutate(n = dplyr::n()) |>
-    dplyr::filter(n == 1) |>
-    dplyr::pull(SampleName)
-
-  n_ident_profs <-
-    dplyr::n_distinct(human_profiles$SampleName) - length(not_twins)
-
-  if (n_ident_profs > 0) {
-    message(
-      paste0(
-        "Identified ",
-        n_ident_profs,
-        " people whose profiles appear more than once",
-        " (likely identical twins). These are being removed."
-      )
-    )
-    human_profiles <- human_profiles |>
-      dplyr::filter(SampleName %in% not_twins)
-  }
-
-  return(human_profiles)
-}
-
-#' Subset to ids present in the dataset
-#'
-#' @param ids ids to check if in vec
-#' @param vec vector of ids
-#' @param vec_name name of vector
-#'
-#' @return list of IDs that are present
-#' @keywords internal
-subset_ids <- function(ids, vec, vec_name) {
-  id_absent <- setdiff(ids, vec)
-  if (length(id_absent) > 0) {
-    warning(
-      "Removing ",
-      length(id_absent),
-      " ",
-      vec_name,
-      " not found in the dataset: ",
-      id_absent
-    )
-  }
-  ids <- intersect(ids, vec)
-  if (length(ids) == 0) {
-    stop("None of the provided ", vec_name, " are present in the dataset.")
-  }
-  return(ids)
-}
diff --git a/man/calc_log10_lrs.Rd b/man/calc_log10_lrs.Rd
diff --git a/man/prep_bloodmeal_profiles.Rd b/man/prep_bloodmeal_profiles.Rd
diff --git a/man/prep_human_profiles.Rd b/man/prep_human_profiles.Rd
diff --git a/man/rm_dups.Rd b/man/rm_dups.Rd