Add option to remove certain markers before calculating LRs

duke-malaria-collaboratory · Sep 7, 2023 · e9c12b6 · e9c12b6
1 parent f6a38e9
commit e9c12b6
Show file tree

Hide file tree

Showing 24 changed files with 246 additions and 99 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -11,5 +11,6 @@ export(match_static_thresh)
 export(prep_bloodmeal_profiles)
 export(prep_human_profiles)
 export(rm_dups)
+export(rm_markers)
 export(rm_twins)
 import(euroformix)
diff --git a/R/bistro.R b/R/bistro.R
@@ -35,6 +35,7 @@
 #' @param rm_twins A boolean indicating whether or not to remove likely twins
 #'   (identical STR profiles) from the human database prior to identifying
 #'   matches. Default: TRUE
+#' @param rm_markers A vector indicating what markers should be removed prior to calculating log10LRs. NULL to include all markers. By default, for the bistro function AMEL is removed as it is not standard to include it in LR calculations.
 #' @param model_degrad A boolean indicating whether or not to model peak
 #'   degradation. Used for `modelDegrad` argument in
 #'   [euroformix::contLikSearch()]. Default: TRUE
@@ -92,6 +93,7 @@ bistro <-
            bloodmeal_ids = NULL,
            human_ids = NULL,
            rm_twins = TRUE,
+           rm_markers = c('AMEL'),
            model_degrad = TRUE,
            model_bw_stutt = FALSE,
            model_fw_stutt = FALSE,
@@ -110,33 +112,46 @@ bistro <-
       bloodmeal_ids,
       human_ids,
       rm_twins,
+      rm_markers,
       model_degrad,
       model_bw_stutt,
       model_fw_stutt,
       difftol,
       threads,
       seed,
-      time_limit
+      time_limit,
+      return_lrs
     )
 
     if (calc_allele_freqs) {
-      pop_allele_freqs <- calc_allele_freqs(human_profiles)
+      pop_allele_freqs <- calc_allele_freqs(human_profiles, rm_markers)
+    }else if(!is.null(rm_markers)){
+      pop_allele_freqs <- pop_allele_freqs |>
+        dplyr::select(-dplyr::matches(paste0('^', toupper(rm_markers), '$')))
     }
 
     message("Formatting bloodmeal profiles")
     bloodmeal_profiles <- prep_bloodmeal_profiles(
       bloodmeal_profiles,
       bloodmeal_ids,
-      peak_thresh
+      peak_thresh,
+      rm_markers
     )
 
     message("Formatting human profiles")
     human_profiles <- prep_human_profiles(
       human_profiles,
       human_ids,
-      rm_twins
+      rm_twins,
+      rm_markers
     )
 
+    bm_markers <- unique(bloodmeal_profiles$Marker[!is.na(bloodmeal_profiles$Marker)])
+    hu_markers <- unique(human_profiles$Marker[!is.na(human_profiles$Marker)])
+    message('Markers being used: ',
+            paste(intersect(bm_markers, hu_markers),
+                  collapse = ', '))
+
     message("Calculating log10LRs")
 
     log10_lrs <-

diff --git a/R/calc_allele_freqs.R b/R/calc_allele_freqs.R
@@ -10,9 +10,15 @@
 #'
 #' @export
 #' @keywords internal
-calc_allele_freqs <- function(human_profiles) {
+calc_allele_freqs <- function(human_profiles, rm_markers = NULL) {
   # check if expected columns are present
   check_colnames(human_profiles, c("SampleName", "Marker", "Allele"))
+  check_ids(rm_markers, "rm_markers")
+
+  if(!is.null(rm_markers)){
+    human_profiles <- human_profiles |>
+      dplyr::filter(!Marker %in% rm_markers)
+  }
 
   # remove duplicates if necessary
   human_profiles <- rm_dups(human_profiles)

diff --git a/R/checks.R b/R/checks.R
@@ -9,31 +9,47 @@ check_bistro_inputs <-
            human_profiles,
            kit,
            peak_thresh,
-           pop_allele_freqs = NULL,
-           calc_allele_freqs = FALSE,
-           bloodmeal_ids = NULL,
-           human_ids = NULL,
-           rm_twins = TRUE,
-           model_degrad = TRUE,
-           model_bw_stutt = FALSE,
-           model_fw_stutt = FALSE,
-           difftol = 1,
-           threads = 4,
-           seed = 1,
-           time_limit = 3) {
+           pop_allele_freqs,
+           calc_allele_freqs,
+           bloodmeal_ids,
+           human_ids,
+           rm_twins,
+           rm_markers,
+           model_degrad,
+           model_bw_stutt,
+           model_fw_stutt,
+           difftol,
+           threads,
+           seed,
+           time_limit,
+           return_lrs) {
     check_colnames(
       bloodmeal_profiles,
       c("SampleName", "Marker", "Allele", "Height")
     )
     check_colnames(human_profiles, c("SampleName", "Marker", "Allele"))
     check_ids(bloodmeal_ids, "bloodmeal_ids")
     check_ids(human_ids, "human_ids")
+    check_ids(rm_markers)
 
     kit_df <- check_kit(kit)
 
-    bm_prof_markers <- toupper(unique(bloodmeal_profiles$Marker))
-    hu_prof_markers <- toupper(unique(human_profiles$Marker))
-    kit_markers <- toupper(unique(kit_df$Marker))
+    bm_prof_markers <- bloodmeal_profiles$Marker |>
+      unique() |>
+      toupper()
+    hu_prof_markers <- human_profiles$Marker |>
+      unique() |>
+      toupper()
+    kit_markers <- kit_df$Marker |>
+      unique() |>
+      toupper()
+
+    if(!is.null(rm_markers)){
+      rm_markers <- toupper(rm_markers)
+      bm_prof_markers <- bm_prof_markers[!bm_prof_markers %in% rm_markers]
+      hu_prof_markers <- hu_prof_markers[!hu_prof_markers %in% rm_markers]
+      kit_markers <- kit_markers[!kit_markers %in% rm_markers]
+    }
 
     check_setdiff_markers(
       bm_prof_markers,
@@ -56,6 +72,7 @@ check_bistro_inputs <-
     check_is_bool(model_degrad, "model_degrad")
     check_is_bool(model_bw_stutt, "model_bw_stutt")
     check_is_bool(model_fw_stutt, "model_fw_stutt")
+    check_is_bool(return_lrs, "return_lrs")
     check_is_numeric(difftol, "difftol", pos = TRUE)
     check_is_numeric(threads, "threads", pos = TRUE)
     check_is_numeric(seed, "seed")

diff --git a/R/match_exact.R b/R/match_exact.R
@@ -21,7 +21,8 @@ match_exact <- function(bloodmeal_profiles,
                         bloodmeal_ids = NULL,
                         human_ids = NULL,
                         peak_thresh = NULL,
-                        rm_twins = TRUE) {
+                        rm_twins = TRUE,
+                        rm_markers = NULL) {
   if (is.null(peak_thresh)) {
     check_colnames(bloodmeal_profiles, c("SampleName", "Marker", "Allele"))
   } else {
@@ -34,17 +35,20 @@ match_exact <- function(bloodmeal_profiles,
   check_ids(bloodmeal_ids, "bloodmeal_ids")
   check_ids(human_ids, "human_ids")
   check_is_bool(rm_twins, "rm_twins")
+  check_ids(rm_markers, "rm_markers")
 
   bloodmeal_profiles <- prep_bloodmeal_profiles(
     bloodmeal_profiles,
     bloodmeal_ids,
-    peak_thresh
+    peak_thresh,
+    rm_markers = NULL
   )
 
   human_profiles <- prep_human_profiles(
     human_profiles,
     human_ids,
-    rm_twins
+    rm_twins,
+    rm_markers = NULL
   )
 
   # human profiles

diff --git a/R/match_similarity.R b/R/match_similarity.R
@@ -30,7 +30,7 @@
 #' match_similarity(bloodmeal_profiles, human_profiles)
 match_similarity <- function(bloodmeal_profiles, human_profiles,
                              bloodmeal_ids = NULL, human_ids = NULL,
-                             peak_thresh = NULL, rm_twins = TRUE,
+                             peak_thresh = NULL, rm_twins = TRUE, rm_markers = NULL,
                              return_similarities = FALSE) {
   if (is.null(peak_thresh)) {
     check_colnames(bloodmeal_profiles, c("SampleName", "Marker", "Allele"))
@@ -44,12 +44,14 @@ match_similarity <- function(bloodmeal_profiles, human_profiles,
   check_ids(bloodmeal_ids, "bloodmeal_ids")
   check_ids(human_ids, "human_ids")
   check_is_bool(rm_twins, "rm_twins")
+  check_ids(rm_markers, "rm_markers")
   check_is_bool(return_similarities, "return_similarities")
 
   bloodmeal_profiles <- prep_bloodmeal_profiles(
     bloodmeal_profiles,
     bloodmeal_ids,
-    peak_thresh
+    peak_thresh,
+    rm_markers = rm_markers
   )
 
   all_bm_ids <- unique(bloodmeal_profiles$SampleName)
@@ -60,12 +62,13 @@ match_similarity <- function(bloodmeal_profiles, human_profiles,
   human_profiles <- prep_human_profiles(
     human_profiles,
     human_ids,
-    rm_twins
+    rm_twins,
+    rm_markers = rm_markers
   ) |>
     tidyr::drop_na()
 
   message("Calculating human-human similarities")
-  hu_similarities <- get_human_similarities(human_profiles)
+  hu_similarities <- get_human_similarity(human_profiles)
 
   max_similarity <- hu_similarities |>
     dplyr::filter(similarity != 1) |> # remove twins
@@ -136,7 +139,7 @@ match_similarity <- function(bloodmeal_profiles, human_profiles,
 #' - hu1: human ID 1
 #' - hu2: human ID 2
 #' - similarity: similarity value (# exact locus matches / # loci)
-get_human_similarities <- function(human_profiles) {
+get_human_similarity <- function(human_profiles) {
   human_profiles <- human_profiles |>
     tidyr::drop_na()
   str_hu1_by_marker <- human_profiles |>

diff --git a/R/preprocess_data.R b/R/preprocess_data.R
@@ -9,7 +9,8 @@
 #' @keywords internal
 prep_bloodmeal_profiles <- function(bloodmeal_profiles,
                                     bloodmeal_ids = NULL,
-                                    peak_thresh = NULL) {
+                                    peak_thresh = NULL,
+                                    rm_markers = c('AMEL')) {
   if (is.null(bloodmeal_ids)) {
     bloodmeal_ids <- unique(bloodmeal_profiles$SampleName)
   } else {
@@ -19,6 +20,7 @@ prep_bloodmeal_profiles <- function(bloodmeal_profiles,
 
   bloodmeal_profiles <- bloodmeal_profiles |>
     dplyr::filter(SampleName %in% bloodmeal_ids) |>
+    rm_markers(rm_markers) |>
     rm_dups()
 
   if (!is.null(peak_thresh)) {
@@ -41,7 +43,8 @@ prep_bloodmeal_profiles <- function(bloodmeal_profiles,
 #' @keywords internal
 prep_human_profiles <- function(human_profiles,
                                 human_ids = NULL,
-                                rm_twins = TRUE) {
+                                rm_twins = TRUE,
+                                rm_markers = c('AMEL')) {
   if (rm_twins) {
     human_profiles <- rm_twins(human_profiles)
   }
@@ -54,11 +57,27 @@ prep_human_profiles <- function(human_profiles,
 
   human_profiles <- human_profiles |>
     dplyr::filter(SampleName %in% human_ids) |>
+    rm_markers(rm_markers) |>
     rm_dups()
 
   return(human_profiles)
 }
 
+#' Remove markers
+#'
+#' @param df Dataframe from which to remove markers
+#'
+#' @return Dataframe without markers
+#' @export
+#' @keywords internal
+rm_markers <- function(profiles, markers){
+  check_colnames(profiles, 'Marker')
+  profiles <- profiles |>
+    dplyr::mutate(Marker = toupper(Marker)) |>
+    dplyr::filter(!Marker %in% toupper(markers))
+  return(profiles)
+}
+
 #' Remove duplicate rows with warning
 #'
 #' @param df Dataframe from which to remove duplicate rows

diff --git a/man/bistro.Rd b/man/bistro.Rd
diff --git a/man/calc_allele_freqs.Rd b/man/calc_allele_freqs.Rd
diff --git a/man/check_bistro_inputs.Rd b/man/check_bistro_inputs.Rd
diff --git a/man/get_human_similarities.Rd → man/get_human_similarity.Rd b/man/get_human_similarities.Rd → man/get_human_similarity.Rd