Skip to content

Commit

Permalink
Modularize str profile preprocessing
Browse files Browse the repository at this point in the history
  • Loading branch information
zenalapp committed Sep 7, 2023
1 parent e575e8b commit 9a2d61d
Show file tree
Hide file tree
Showing 20 changed files with 381 additions and 228 deletions.
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ export(calc_allele_freqs)
export(calc_log10_lrs)
export(filter_peaks)
export(identify_matches)
export(prep_bloodmeal_profiles)
export(prep_human_profiles)
export(rm_dups)
export(rm_twins)
import(euroformix)
40 changes: 10 additions & 30 deletions R/bistro.R
Original file line number Diff line number Diff line change
Expand Up @@ -114,39 +114,19 @@ bistro <-
pop_allele_freqs <- calc_allele_freqs(human_profiles)
}

if (rm_twins) {
human_profiles <- rm_twins(human_profiles)
}

message("Formatting bloodmeal profiles")

if (is.null(bloodmeal_ids)) {
bloodmeal_ids <- unique(bloodmeal_profiles$SampleName)
} else {
bloodmeal_ids <-
subset_ids(bloodmeal_ids, bloodmeal_profiles$SampleName)
}

bloodmeal_profiles <- bloodmeal_profiles |>
dplyr::filter(SampleName %in% bloodmeal_ids)

check_heights(bloodmeal_profiles$Height, peak_thresh)

bloodmeal_profiles <- bloodmeal_profiles |>
rm_dups() |>
filter_peaks(peak_thresh)
bloodmeal_profiles <- prep_bloodmeal_profiles(
bloodmeal_profiles,
peak_thresh,
bloodmeal_ids
)

message("Formatting human profiles")

if (is.null(human_ids)) {
human_ids <- unique(human_profiles$SampleName)
} else {
human_ids <- subset_ids(human_ids, human_profiles$SampleName)
}

human_profiles <- human_profiles |>
dplyr::filter(SampleName %in% human_ids) |>
rm_dups()
human_profiles <- prep_human_profiles(
human_profiles,
human_ids,
rm_twins
)

message("Calculating log10LRs")

Expand Down
5 changes: 5 additions & 0 deletions R/calc_log10_lrs.R
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,11 @@ calc_one_log10_lr <-

#' Calculate log10_lrs for multiple bloodmeal-human pairs
#'
#' Note that this function doesn't preprocess the bloodmeal and human profile
#' data. If you would like to preprocess it in the same way as is performed
#' internally in the `bistro()` function, you must run
#' `prep_bloodmeal_profiles()` and `prep_human_profiles()` first.
#'
#' @inheritParams bistro
#'
#' @return A tibble with the same output as for [bistro()], except there is no
Expand Down
6 changes: 5 additions & 1 deletion R/identify_matches.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,11 @@ identify_one_match_set <- function(log10_lrs, bloodmeal_id) {
)

if (all(is.na(log10_lrs$log10_lr) | is.infinite(log10_lrs$log10_lr))) {
matches_thresh <- matches
matches_thresh <- matches |>
dplyr::mutate(notes = ifelse(is.na(notes),
"all log10LRs NA or Inf",
paste0("all log10LRs NA or Inf", ";", notes)
))
} else if (max(log10_lrs$log10_lr[!is.infinite(log10_lrs$log10_lr)]) < 1.5 &&
is.na(notes)) {
matches_thresh <- matches |>
Expand Down
145 changes: 145 additions & 0 deletions R/preprocess_data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
#' Preprocess bloodmeal profiles
#'
#' Removes duplicates and peaks below threshold, subsets ids
#'
#' @inheritParams bistro
#'
#' @return Dataframe with preprocessed bloodmeal profiles
#' @export
#' @keywords internal
prep_bloodmeal_profiles <- function(bloodmeal_profiles,
peak_thresh,
bloodmeal_ids = NULL) {
if (is.null(bloodmeal_ids)) {
bloodmeal_ids <- unique(bloodmeal_profiles$SampleName)
} else {
bloodmeal_ids <-
subset_ids(bloodmeal_ids, bloodmeal_profiles$SampleName)
}

bloodmeal_profiles <- bloodmeal_profiles |>
dplyr::filter(SampleName %in% bloodmeal_ids)

check_heights(bloodmeal_profiles$Height, peak_thresh)

bloodmeal_profiles <- bloodmeal_profiles |>
rm_dups() |>
filter_peaks(peak_thresh)

return(bloodmeal_profiles)
}

#' Preprocess human profiles
#'
#' Removes duplicates and optionally twins, subsets ids
#'
#' @inheritParams bistro
#'
#' @return Dataframe with preprocessed human profiles
#' @export
#' @keywords internal
prep_human_profiles <- function(human_profiles,
human_ids = NULL,
rm_twins = TRUE) {
if (rm_twins) {
human_profiles <- rm_twins(human_profiles)
}

if (is.null(human_ids)) {
human_ids <- unique(human_profiles$SampleName)
} else {
human_ids <- subset_ids(human_ids, human_profiles$SampleName)
}

human_profiles <- human_profiles |>
dplyr::filter(SampleName %in% human_ids) |>
rm_dups()

return(human_profiles)
}

#' Remove duplicate rows with warning
#'
#' @param df Dataframe from which to remove duplicate rows
#'
#' @return Un-duplicated dataframe
#' @export
#' @keywords internal
rm_dups <- function(df) {
n_orig <- nrow(df)
if (n_orig != dplyr::n_distinct(df)) {
df <- df |>
dplyr::distinct()
warning(paste0(
"Detected and removed ",
n_orig - nrow(df),
" duplicate rows."
))
}
return(df)
}

#' Remove identical STR profiles
#'
#' @inheritParams bistro
#'
#' @return Data frame with twins removed
#' @export
#' @keywords internal
rm_twins <- function(human_profiles) {
not_twins <- human_profiles |>
dplyr::arrange(Marker, Allele) |>
dplyr::group_by(SampleName) |>
dplyr::summarize(all_alleles = stringr::str_c(paste0(Marker, Allele),
collapse = ";"
)) |>
dplyr::group_by(all_alleles) |>
dplyr::mutate(n = dplyr::n()) |>
dplyr::filter(n == 1) |>
dplyr::pull(SampleName)

n_ident_profs <-
dplyr::n_distinct(human_profiles$SampleName) - length(not_twins)

if (n_ident_profs > 0) {
message(
paste0(
"Identified ",
n_ident_profs,
" people whose profiles appear more than once",
" (likely identical twins). These are being removed."
)
)
human_profiles <- human_profiles |>
dplyr::filter(SampleName %in% not_twins)
}

return(human_profiles)
}

#' Subset to ids present in the dataset
#'
#' @param ids ids to check if in vec
#' @param vec vector of ids
#' @param vec_name name of vector
#'
#' @return list of IDs that are present
#' @keywords internal
subset_ids <- function(ids, vec, vec_name) {
id_absent <- setdiff(ids, vec)
if (length(id_absent) > 0) {
warning(
"Removing ",
length(id_absent),
" ",
vec_name,
" not found in the dataset: ",
id_absent
)
}
ids <- intersect(ids, vec)
if (length(ids) == 0) {
stop("None of the provided ", vec_name, " are present in the dataset.")
}
return(ids)
}
86 changes: 0 additions & 86 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,89 +30,3 @@ utils::globalVariables(
ignore_unused_imports <- function() {
codetools::checkUsage
}

#' Remove duplicate rows with warning
#'
#' @param df Dataframe from which to remove duplicate rows
#'
#' @return Un-duplicated dataframe
#' @export
#' @keywords internal
rm_dups <- function(df) {
n_orig <- nrow(df)
if (n_orig != dplyr::n_distinct(df)) {
df <- df |>
dplyr::distinct()
warning(paste0(
"Detected and removed ",
n_orig - nrow(df),
" duplicate rows."
))
}
return(df)
}

#' Remove identical STR profiles
#'
#' @inheritParams bistro
#'
#' @return Data frame with twins removed
#' @export
#' @keywords internal
rm_twins <- function(human_profiles) {
not_twins <- human_profiles |>
dplyr::arrange(Marker, Allele) |>
dplyr::group_by(SampleName) |>
dplyr::summarize(all_alleles = stringr::str_c(paste0(Marker, Allele),
collapse = ";"
)) |>
dplyr::group_by(all_alleles) |>
dplyr::mutate(n = dplyr::n()) |>
dplyr::filter(n == 1) |>
dplyr::pull(SampleName)

n_ident_profs <-
dplyr::n_distinct(human_profiles$SampleName) - length(not_twins)

if (n_ident_profs > 0) {
message(
paste0(
"Identified ",
n_ident_profs,
" people whose profiles appear more than once",
" (likely identical twins). These are being removed."
)
)
human_profiles <- human_profiles |>
dplyr::filter(SampleName %in% not_twins)
}

return(human_profiles)
}

#' Subset to ids present in the dataset
#'
#' @param ids ids to check if in vec
#' @param vec vector of ids
#' @param vec_name name of vector
#'
#' @return list of IDs that are present
#' @keywords internal
subset_ids <- function(ids, vec, vec_name) {
id_absent <- setdiff(ids, vec)
if (length(id_absent) > 0) {
warning(
"Removing ",
length(id_absent),
" ",
vec_name,
" not found in the dataset: ",
id_absent
)
}
ids <- intersect(ids, vec)
if (length(ids) == 0) {
stop("None of the provided ", vec_name, " are present in the dataset.")
}
return(ids)
}
5 changes: 4 additions & 1 deletion man/calc_log10_lrs.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

29 changes: 29 additions & 0 deletions man/prep_bloodmeal_profiles.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

27 changes: 27 additions & 0 deletions man/prep_human_profiles.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/rm_dups.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 9a2d61d

Please sign in to comment.