From 516c75013f916059ace4f89bb4f281930ed1857b Mon Sep 17 00:00:00 2001 From: "Zachary S.L. Foster" Date: Wed, 9 Aug 2023 17:24:16 -0700 Subject: [PATCH] fixed bug when no taxa passed sendsketch quality filters --- bin/pick_assemblies.R | 30 +++++++++++++++----------- bin/sendsketch_filter.R | 24 +++++++++++---------- modules/local/initialclassification.nf | 1 + 3 files changed, 32 insertions(+), 23 deletions(-) diff --git a/bin/pick_assemblies.R b/bin/pick_assemblies.R index 287181f5..1e85ec0c 100755 --- a/bin/pick_assemblies.R +++ b/bin/pick_assemblies.R @@ -5,10 +5,10 @@ min_coverage <- 30 # Parse inputs args <- commandArgs(trailingOnly = TRUE) -# args <- c("/media/fosterz/external_primary/files/projects/work/current/nf-core-plantpathsurveil/work/a5/f4d1a4a852122ea9970733dd75170b/families.txt", -# "/media/fosterz/external_primary/files/projects/work/current/nf-core-plantpathsurveil/work/a5/f4d1a4a852122ea9970733dd75170b/genera.txt", -# "/media/fosterz/external_primary/files/projects/work/current/nf-core-plantpathsurveil/work/a5/f4d1a4a852122ea9970733dd75170b/species.txt", -# "/media/fosterz/external_primary/files/projects/work/current/nf-core-plantpathsurveil/work/c8/3c3003a3c9420f2d5142f83b392c27/merged_assembly_stats.tsv", +# args <- c("/media/fosterz/external_primary/files/projects/work/current/nf-core-plantpathsurveil/work/7e/d237e4239125be2ecc2c30520fcb3c/families.txt", +# "/media/fosterz/external_primary/files/projects/work/current/nf-core-plantpathsurveil/work/7e/d237e4239125be2ecc2c30520fcb3c/genera.txt", +# "/media/fosterz/external_primary/files/projects/work/current/nf-core-plantpathsurveil/work/7e/d237e4239125be2ecc2c30520fcb3c/species.txt", +# "/media/fosterz/external_primary/files/projects/work/current/nf-core-plantpathsurveil/work/0d/03837419adaf53ebcb2464a02453a2/merged_assembly_stats.tsv", # "5", # "result.tsv") names(args) <- c("family", "genus", "species", "stats", "count", "out_path") @@ -45,8 +45,10 @@ sp_stats <- lapply(split(sp_stats, sp_stats$SpeciesName), function(per_sp_data) per_org_data[which.max(per_org_data$ScaffoldN50), ] }) }) -sp_stats <- do.call(rbind, unlist(sp_stats, recursive = FALSE)) -sp_stats <- sp_stats[order(sp_stats$ScaffoldN50, decreasing = TRUE)[1:get_count(sp_stats)], ] +if (length(sp_stats) != 0 ) { + sp_stats <- do.call(rbind, unlist(sp_stats, recursive = FALSE)) + sp_stats <- sp_stats[order(sp_stats$ScaffoldN50, decreasing = TRUE)[1:get_count(sp_stats)], ] +} # Pick representatives for each genus gn_stats <- stats[stats$genus %in% genera, , drop = FALSE] @@ -55,9 +57,11 @@ gn_stats <- lapply(split(gn_stats, gn_stats$genus), function(per_sp_data) { per_org_data[which.max(per_org_data$ScaffoldN50), ] }) }) -gn_stats <- do.call(rbind, unlist(gn_stats, recursive = FALSE)) -gn_stats <- gn_stats[! gn_stats$SpeciesName %in% sp_stats$SpeciesName, ] # Dont include the species already chosen -gn_stats <- gn_stats[order(gn_stats$ScaffoldN50, decreasing = TRUE)[1:get_count(gn_stats)], ] +if (length(gn_stats) != 0 ) { + gn_stats <- do.call(rbind, unlist(gn_stats, recursive = FALSE)) + gn_stats <- gn_stats[! gn_stats$SpeciesName %in% sp_stats$SpeciesName, ] # Dont include the species already chosen + gn_stats <- gn_stats[order(gn_stats$ScaffoldN50, decreasing = TRUE)[1:get_count(gn_stats)], ] +} # Pick representatives for each family fa_stats <- stats[stats$Family %in% families, , drop = FALSE] @@ -66,9 +70,11 @@ fa_stats <- lapply(split(fa_stats, fa_stats$Family), function(per_fm_data) { per_gn_data[which.max(per_gn_data$ScaffoldN50), ] }) }) -fa_stats <- do.call(rbind, unlist(fa_stats, recursive = FALSE)) -fa_stats <- fa_stats[! fa_stats$genus %in% gn_stats$genus, ] # Dont include the genera already chosen -fa_stats <- fa_stats[order(fa_stats$ScaffoldN50, decreasing = TRUE)[1:get_count(fa_stats)], ] +if (length(fa_stats) != 0 ) { + fa_stats <- do.call(rbind, unlist(fa_stats, recursive = FALSE)) + fa_stats <- fa_stats[! fa_stats$genus %in% gn_stats$genus, ] # Dont include the genera already chosen + fa_stats <- fa_stats[order(fa_stats$ScaffoldN50, decreasing = TRUE)[1:get_count(fa_stats)], ] +} # Combine results result <- rbind(sp_stats, gn_stats, fa_stats) diff --git a/bin/sendsketch_filter.R b/bin/sendsketch_filter.R index d760838a..0c4f03cb 100755 --- a/bin/sendsketch_filter.R +++ b/bin/sendsketch_filter.R @@ -1,8 +1,8 @@ #!/usr/bin/env -S Rscript --vanilla # Options -ani_threshold <- 90 -complt_threshold <- 70 +ani_threshold <- c(species = 90, genus = 90, family = 70) # These numbers are total guesses. TODO: find reasonable defaults (issue #11) +complt_threshold <- c(species = 60, genus = 40, family = 30) # These numbers are total guesses. TODO: find reasonable defaults (issue #11) # Parse inputs args = commandArgs(trailingOnly = TRUE) @@ -12,19 +12,21 @@ data <- read.csv(args[1], skip = 2, header = TRUE, sep = '\t') data$ANI <- as.numeric(sub(pattern = "%", replacement = "", fixed = TRUE, data$ANI)) data$Complt <- as.numeric(sub(pattern = "%", replacement = "", fixed = TRUE, data$Complt)) -# Filter table by match quality -data <- data[data$ANI > ani_threshold, ] -data <- data[data$Complt > complt_threshold, ] +# Filter data by threshold and extract passing taxon names +filter_and_extract <- function(table, level, pattern) { + table <- table[table$ANI > ani_threshold[level] & table$Complt > complt_threshold[level], ] + gsub(pattern, "\\1", table$taxonomy) +} # Extract taxon info -data$genus <- gsub(".*g:(.+);s:.*", "\\1", data$taxonomy) -data$family <- gsub(".*f:(.+);g:.*", "\\1", data$taxonomy) -data$species <- gsub(".*;s:([a-zA-Z0-9 .]+);?.*", "\\1", data$taxonomy) +genus <- filter_and_extract(data, "genus", ".*g:(.+);s:.*") +family <- filter_and_extract(data, "family", ".*f:(.+);g:.*") +species <- filter_and_extract(data, "species", ".*;s:([a-zA-Z0-9 .]+);?.*") # Write output -writeLines(unique(data$species), "species.txt") -writeLines(unique(data$genus), "genera.txt") -writeLines(unique(data$family), "families.txt") +writeLines(unique(species), "species.txt") +writeLines(unique(genus), "genera.txt") +writeLines(unique(family), "families.txt") # Save kingdom kingdom <- gsub("sk:(.+);p:.*", "\\1", data$taxonomy[1]) diff --git a/modules/local/initialclassification.nf b/modules/local/initialclassification.nf index 67d41e12..ddc31b32 100644 --- a/modules/local/initialclassification.nf +++ b/modules/local/initialclassification.nf @@ -26,6 +26,7 @@ process INITIALCLASSIFICATION { def prefix = task.ext.prefix ?: "${meta.id}" """ Rscript --vanilla ${projectDir}/bin/sendsketch_filter.R $hits + KINGDOM="\$(cat kingdom.txt)" CLASS="\$(cat classification.txt)"