Skip to content

Commit

Permalink
fixed bug when no taxa passed sendsketch quality filters
Browse files Browse the repository at this point in the history
  • Loading branch information
zachary-foster committed Aug 10, 2023
1 parent a4b26ec commit 516c750
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 23 deletions.
30 changes: 18 additions & 12 deletions bin/pick_assemblies.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ min_coverage <- 30

# Parse inputs
args <- commandArgs(trailingOnly = TRUE)
# args <- c("/media/fosterz/external_primary/files/projects/work/current/nf-core-plantpathsurveil/work/a5/f4d1a4a852122ea9970733dd75170b/families.txt",
# "/media/fosterz/external_primary/files/projects/work/current/nf-core-plantpathsurveil/work/a5/f4d1a4a852122ea9970733dd75170b/genera.txt",
# "/media/fosterz/external_primary/files/projects/work/current/nf-core-plantpathsurveil/work/a5/f4d1a4a852122ea9970733dd75170b/species.txt",
# "/media/fosterz/external_primary/files/projects/work/current/nf-core-plantpathsurveil/work/c8/3c3003a3c9420f2d5142f83b392c27/merged_assembly_stats.tsv",
# args <- c("/media/fosterz/external_primary/files/projects/work/current/nf-core-plantpathsurveil/work/7e/d237e4239125be2ecc2c30520fcb3c/families.txt",
# "/media/fosterz/external_primary/files/projects/work/current/nf-core-plantpathsurveil/work/7e/d237e4239125be2ecc2c30520fcb3c/genera.txt",
# "/media/fosterz/external_primary/files/projects/work/current/nf-core-plantpathsurveil/work/7e/d237e4239125be2ecc2c30520fcb3c/species.txt",
# "/media/fosterz/external_primary/files/projects/work/current/nf-core-plantpathsurveil/work/0d/03837419adaf53ebcb2464a02453a2/merged_assembly_stats.tsv",
# "5",
# "result.tsv")
names(args) <- c("family", "genus", "species", "stats", "count", "out_path")
Expand Down Expand Up @@ -45,8 +45,10 @@ sp_stats <- lapply(split(sp_stats, sp_stats$SpeciesName), function(per_sp_data)
per_org_data[which.max(per_org_data$ScaffoldN50), ]
})
})
sp_stats <- do.call(rbind, unlist(sp_stats, recursive = FALSE))
sp_stats <- sp_stats[order(sp_stats$ScaffoldN50, decreasing = TRUE)[1:get_count(sp_stats)], ]
if (length(sp_stats) != 0 ) {
sp_stats <- do.call(rbind, unlist(sp_stats, recursive = FALSE))
sp_stats <- sp_stats[order(sp_stats$ScaffoldN50, decreasing = TRUE)[1:get_count(sp_stats)], ]
}

# Pick representatives for each genus
gn_stats <- stats[stats$genus %in% genera, , drop = FALSE]
Expand All @@ -55,9 +57,11 @@ gn_stats <- lapply(split(gn_stats, gn_stats$genus), function(per_sp_data) {
per_org_data[which.max(per_org_data$ScaffoldN50), ]
})
})
gn_stats <- do.call(rbind, unlist(gn_stats, recursive = FALSE))
gn_stats <- gn_stats[! gn_stats$SpeciesName %in% sp_stats$SpeciesName, ] # Dont include the species already chosen
gn_stats <- gn_stats[order(gn_stats$ScaffoldN50, decreasing = TRUE)[1:get_count(gn_stats)], ]
if (length(gn_stats) != 0 ) {
gn_stats <- do.call(rbind, unlist(gn_stats, recursive = FALSE))
gn_stats <- gn_stats[! gn_stats$SpeciesName %in% sp_stats$SpeciesName, ] # Dont include the species already chosen
gn_stats <- gn_stats[order(gn_stats$ScaffoldN50, decreasing = TRUE)[1:get_count(gn_stats)], ]
}

# Pick representatives for each family
fa_stats <- stats[stats$Family %in% families, , drop = FALSE]
Expand All @@ -66,9 +70,11 @@ fa_stats <- lapply(split(fa_stats, fa_stats$Family), function(per_fm_data) {
per_gn_data[which.max(per_gn_data$ScaffoldN50), ]
})
})
fa_stats <- do.call(rbind, unlist(fa_stats, recursive = FALSE))
fa_stats <- fa_stats[! fa_stats$genus %in% gn_stats$genus, ] # Dont include the genera already chosen
fa_stats <- fa_stats[order(fa_stats$ScaffoldN50, decreasing = TRUE)[1:get_count(fa_stats)], ]
if (length(fa_stats) != 0 ) {
fa_stats <- do.call(rbind, unlist(fa_stats, recursive = FALSE))
fa_stats <- fa_stats[! fa_stats$genus %in% gn_stats$genus, ] # Dont include the genera already chosen
fa_stats <- fa_stats[order(fa_stats$ScaffoldN50, decreasing = TRUE)[1:get_count(fa_stats)], ]
}

# Combine results
result <- rbind(sp_stats, gn_stats, fa_stats)
Expand Down
24 changes: 13 additions & 11 deletions bin/sendsketch_filter.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#!/usr/bin/env -S Rscript --vanilla

# Options
ani_threshold <- 90
complt_threshold <- 70
ani_threshold <- c(species = 90, genus = 90, family = 70) # These numbers are total guesses. TODO: find reasonable defaults (issue #11)
complt_threshold <- c(species = 60, genus = 40, family = 30) # These numbers are total guesses. TODO: find reasonable defaults (issue #11)

# Parse inputs
args = commandArgs(trailingOnly = TRUE)
Expand All @@ -12,19 +12,21 @@ data <- read.csv(args[1], skip = 2, header = TRUE, sep = '\t')
data$ANI <- as.numeric(sub(pattern = "%", replacement = "", fixed = TRUE, data$ANI))
data$Complt <- as.numeric(sub(pattern = "%", replacement = "", fixed = TRUE, data$Complt))

# Filter table by match quality
data <- data[data$ANI > ani_threshold, ]
data <- data[data$Complt > complt_threshold, ]
# Filter data by threshold and extract passing taxon names
filter_and_extract <- function(table, level, pattern) {
table <- table[table$ANI > ani_threshold[level] & table$Complt > complt_threshold[level], ]
gsub(pattern, "\\1", table$taxonomy)
}

# Extract taxon info
data$genus <- gsub(".*g:(.+);s:.*", "\\1", data$taxonomy)
data$family <- gsub(".*f:(.+);g:.*", "\\1", data$taxonomy)
data$species <- gsub(".*;s:([a-zA-Z0-9 .]+);?.*", "\\1", data$taxonomy)
genus <- filter_and_extract(data, "genus", ".*g:(.+);s:.*")
family <- filter_and_extract(data, "family", ".*f:(.+);g:.*")
species <- filter_and_extract(data, "species", ".*;s:([a-zA-Z0-9 .]+);?.*")

# Write output
writeLines(unique(data$species), "species.txt")
writeLines(unique(data$genus), "genera.txt")
writeLines(unique(data$family), "families.txt")
writeLines(unique(species), "species.txt")
writeLines(unique(genus), "genera.txt")
writeLines(unique(family), "families.txt")

# Save kingdom
kingdom <- gsub("sk:(.+);p:.*", "\\1", data$taxonomy[1])
Expand Down
1 change: 1 addition & 0 deletions modules/local/initialclassification.nf
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ process INITIALCLASSIFICATION {
def prefix = task.ext.prefix ?: "${meta.id}"
"""
Rscript --vanilla ${projectDir}/bin/sendsketch_filter.R $hits
KINGDOM="\$(cat kingdom.txt)"
CLASS="\$(cat classification.txt)"
Expand Down

0 comments on commit 516c750

Please sign in to comment.