refactor in progress; testing

nf-core · Aug 4, 2023 · 4bc90b0 · 4bc90b0
1 parent 3ed9c7e
commit 4bc90b0
Show file tree

Hide file tree

Showing 6 changed files with 57 additions and 34 deletions.
diff --git a/subworkflows/local/align_reads_to_ref.nf b/subworkflows/local/align_reads_to_ref.nf
@@ -14,8 +14,11 @@ workflow ALIGN_READS_TO_REF {
 
     ch_versions = Channel.empty()
 
-    ch_reads     = ch_input.map { [it[0], it[1]] }
-    ch_bwa_index = ch_input.map { [it[0], it[5]] }
+    samp_ref_combo = ch_input
+        .map { [[id: "${it[2].id}_${it[0].id}", ref: it[2], sample: it[0]]] + it } // make composite ID for read/ref combos
+
+    ch_reads     = samp_ref_combo.map { [it[0], it[2]] }
+    ch_bwa_index = samp_ref_combo.map { [it[0], it[6]] }
     BWA_MEM ( ch_reads, ch_bwa_index, false )
     ch_versions = ch_versions.mix(BWA_MEM.out.versions.toSortedList().map{it[0]})
 
@@ -25,8 +28,8 @@ workflow ALIGN_READS_TO_REF {
     PICARD_SORTSAM_1 ( PICARD_ADDORREPLACEREADGROUPS.out.bam, 'coordinate' )
     ch_versions = ch_versions.mix(PICARD_SORTSAM_1.out.versions.toSortedList().map{it[0]})
 
-    ch_reference = ch_input.map { [it[0], it[3]] } // channel: [ val(meta), file(reference) ]
-    ch_ref_index = ch_input.map { [it[0], it[4]] } // channel: [ val(meta), file(ref_index) ]
+    ch_reference = samp_ref_combo.map { [it[0], it[4]] } // channel: [ val(ref_samp_meta), file(reference) ]
+    ch_ref_index = samp_ref_combo.map { [it[0], it[5]] } // channel: [ val(ref_samp_meta), file(ref_index) ]
     picard_input = PICARD_SORTSAM_1.out.bam // join each sample's sorted BAM with its associated reference and reference index
         .join(ch_reference)
         .join(ch_ref_index)
@@ -43,9 +46,16 @@ workflow ALIGN_READS_TO_REF {
     SAMTOOLS_INDEX ( PICARD_SORTSAM_2.out.bam )
     ch_versions = ch_versions.mix(SAMTOOLS_INDEX.out.versions.toSortedList().map{it[0]})
 
+    // Revert combined metas back to separate ones for sample and reference
+    out_bam = PICARD_SORTSAM_2.out.bam        // channel: [ val(ref_samp_meta), [ bam ] ]
+        .map { [it[0].sample, it[0].ref, it[1]] }
+    out_bai = SAMTOOLS_INDEX.out.bai        // channel: [ val(ref_samp_meta), [ bai ] ]
+        .map { [it[0].sample, it[0].ref, it[1]] }                               
+
+
     emit:
-    bam      = PICARD_SORTSAM_2.out.bam        // channel: [ val(meta), [ bam ] ]
-    bai      = SAMTOOLS_INDEX.out.bai   // channel: [ val(meta), [ bai ] ]
-    versions = ch_versions                     // channel: [ versions.yml ]
+    bam      = out_bam        // channel: [ val(meta), val(ref_meta), [ bam ] ]
+    bai      = out_bai        // channel: [ val(meta), val(ref_meta), [ bai ] ]
+    versions = ch_versions    // channel: [ versions.yml ]
 }
 
diff --git a/subworkflows/local/core_genome_phylogeny.nf b/subworkflows/local/core_genome_phylogeny.nf
@@ -11,7 +11,7 @@ include { COREGENOMEPHYLOGENYREPORT } from '../../modules/local/core_gene_phylog
 workflow CORE_GENOME_PHYLOGENY {
 
     take:
-    ch_input // [ val(meta), file(gff), val(ref_meta), file(reference) ]
+    ch_input // [ val(meta), file(gff), val(group_meta) ]
     ch_samplesheet // channel: path
 
     main:

diff --git a/subworkflows/local/genome_assembly.nf b/subworkflows/local/genome_assembly.nf
@@ -7,13 +7,16 @@ include { BAKTA_BAKTA     } from '../../modules/nf-core/bakta/bakta/main'
 workflow GENOME_ASSEMBLY {
 
     take:
-    ch_input // channel: [ val(meta), [fastq_1, fastq_2 ], val(ref_meta), file(reference) ]
+    ch_input // channel: [ val(meta), [fastq_1, fastq_2], val(ref_meta), file(reference), val(group_meta), val(kingdom) ]
 
     main:
 
     ch_versions = Channel.empty()
-    ch_reads = ch_input.map { it[0..1] }
-    ch_ref = ch_input.map { [it[0], it[2], it[3]] }
+    ch_input_filtered = ch_input
+        .filter { it[5] == "Bacteria" }
+    ch_reads = ch_input_filtered
+        .map { it[0..1] }
+        .unique()
 
     FASTP ( ch_reads, [], false, false )
     ch_versions = ch_versions.mix(FASTP.out.versions.first())
@@ -30,8 +33,8 @@ workflow GENOME_ASSEMBLY {
     )
     ch_versions = ch_versions.mix(FILTER_ASSEMBLY.out.versions.first())
 
-    ch_ref_grouped = FILTER_ASSEMBLY.out.filtered
-        .join(ch_ref) 
+    ch_ref_grouped = ch_input_filtered
+        .combine(FILTER_ASSEMBLY.out.filtered)
         .groupTuple(by: 2)
         .map { [it[2], it[3].sort()[0], it[1]] }
     QUAST (

diff --git a/subworkflows/local/variant_calling_analysis.nf b/subworkflows/local/variant_calling_analysis.nf
@@ -14,23 +14,24 @@ workflow VARIANT_CALLING_ANALYSIS {
     ch_reference = input.map { [it[0], it[2], it[3]] }
     ch_versions = Channel.empty()
 
-    MAKE_REFERENCE_INDEX ( ch_reference )
+    MAKE_REFERENCE_INDEX ( ch_reference.unique() )
     ch_versions = ch_versions.mix(MAKE_REFERENCE_INDEX.out.versions)      
 
     ALIGN_READS_TO_REF (
         ch_reads
         .join(ch_reference)
         .join(MAKE_REFERENCE_INDEX.out.samtools_fai)
         .join(MAKE_REFERENCE_INDEX.out.bwa_index)
+        .unique()
     )
     ch_versions = ch_versions.mix(ALIGN_READS_TO_REF.out.versions)       
 
     CALL_VARIANTS (
         ALIGN_READS_TO_REF.out.bam
-        .join(ALIGN_READS_TO_REF.out.bai)
-        .join(input.map { [it[0], it[2], it[3], it[4]] })
-        .join(MAKE_REFERENCE_INDEX.out.samtools_fai)
-        .join(MAKE_REFERENCE_INDEX.out.picard_dict)
+        .join(ALIGN_READS_TO_REF.out.bai, by: 0..1) // [meta, ref_meta, bam, bai]
+        .combine(input.map { [it[0], it[2], it[3], it[4]] }, by: 0..1)  // [meta, ref_meta, bam, bai, ref, group_meta]
+        .combine(MAKE_REFERENCE_INDEX.out.samtools_fai, by: 1)
+        .combine(MAKE_REFERENCE_INDEX.out.picard_dict, by: 1)
     )
     ch_versions = ch_versions.mix(CALL_VARIANTS.out.versions)       
 

diff --git a/test/data/metadata_medium.csv b/test/data/metadata_medium.csv
@@ -1,6 +1,11 @@
-sample,fastq_1,fastq_2,reference,reference_id,lab_id,report_group,lab_name,date_isolated,date_received,year,host,cv_key,nusery,breeder,breeder_long,br_key,notes
-22-299,test/data/reads/22-299_R1.fastq.gz,test/data/reads/22-299_R2.fastq.gz,,,202200055,xan_test;subgroup,"Karen Rane,Univ. MD",3/2/22,3/29/22,2022,Pelargonium x hortorum,CV-1,MD,Dummen,Dummen,Br-1,Karen Rane - originally contacted Melodie | Dummen is original source
-22-310,test/data/reads/22-310_R1.fastq.gz,test/data/reads/22-310_R2.fastq.gz,,,MD-30A,xan_test;subgroup,"Margery Daughtrey, Cornell",3/4/22,4/5/22,2022,P. x hortorum,CV-8,NY,Dummen,Dummen,Br-1,NA
-22-331,test/data/reads/22-331_R1.fastq.gz,test/data/reads/22-331_R2.fastq.gz,test/data/refs/reference-22-331.fna,22_331_assembly,22-00152,xan_test,"John Bonkowski, Purdue",3/21/22,4/21/22,2022,Pelargonium x,CV-20,IN,Syngenta,Syngenta Calliope,Br-2,Samples from Syngenta (Purdue?) housed in same house as Dummen
-pram1,test/data/reads/lane6-s013-indexRPI27-ATTCCT-7612-C7_S13_L006_R1_001.fastq.gz,test/data/reads/lane6-s013-indexRPI27-ATTCCT-7612-C7_S13_L006_R2_001.fastq.gz,test/data/refs/PR-102_v3.1.fasta.gz,PR-102_v3.1,NA,pram_test,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
-pram2,test/data/reads/lane6-s008-indexRPI16-CCGTCC-7612-D1_S8_L006_R1_001.fastq.gz,test/data/reads/lane6-s008-indexRPI16-CCGTCC-7612-D1_S8_L006_R2_001.fastq.gz,,,NA,pram_test,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
+sample,fastq_1,fastq_2,reference,reference_id,report_group,lab_id,lab_name,date_isolated,date_received,year,host,cv_key,nusery,breeder,breeder_long,br_key,notes
+22-299,test/data/reads/22-299_R1.fastq.gz,test/data/reads/22-299_R2.fastq.gz,,,xan_test;subgroup,202200055,"Karen Rane,Univ. MD",3/2/22,3/29/22,2022,Pelargonium x hortorum,CV-1,MD,Dummen,Dummen,Br-1,Karen Rane - originally contacted Melodie | Dummen is original source
+22-300,test/data/reads/22-300_R1.fastq.gz,test/data/reads/22-300_R2.fastq.gz,,,xan_test;subgroup,202200056,"Karen Rane,Univ. MD",3/2/22,3/30/22,2022,Pelargonium x hortorum,CV-2,MD,Dummen,Dummen,Br-1,NA
+22-301,test/data/reads/22-301_R1.fastq.gz,test/data/reads/22-301_R2.fastq.gz,,,xan_test;subgroup,202200057,"Karen Rane,Univ. MD",3/2/22,3/31/22,2022,Pelargonium x hortorum,CV-3,MD,Dummen,Dummen,Br-1,NA
+22-324,test/data/reads/22-324_R1.fastq.gz,test/data/reads/22-324_R2.fastq.gz,,,xan_test;subgroup,MD 5-1-7,"Margery Daughtrey, Cornell",1987,4/19/22,1987,P. x hortorum,NA,NA,NA,NA,NA,NA
+22-329A,test/data/reads/22-329A_R1.fastq.gz,test/data/reads/22-329A_R2.fastq.gz,test/data/refs/reference-22-331.fna,22_331_assembly,xan_test,MD-52B,"Margery Daughtrey, Cornell",3/25/22,4/30/22,2022,P. x hortorum,CV-28,PA,Dummen,Dummen,Br-1,from same plant as 22-329B
+22-329B,test/data/reads/22-329B_R1.fastq.gz,test/data/reads/22-329B_R2.fastq.gz,test/data/refs/reference-22-331.fna,22_331_assembly,xan_test,MD-52A,"Margery Daughtrey, Cornell",3/25/22,4/29/22,2022,P. x hortorum,CV-28,PA,Dummen,Dummen,Br-1,NA
+22-330,test/data/reads/22-330_R1.fastq.gz,test/data/reads/22-330_R2.fastq.gz,test/data/refs/reference-22-331.fna,22_331_assembly,xan_test,MD-53B,"Margery Daughtrey, Cornell",3/25/22,5/1/22,2022,P. x hortorum,CV-29,PA,Syngenta,Syngenta,Br-2,NA
+SRR17286018,test/data/reads/SRR17286018_R1.fastq.gz,test/data/reads/SRR17286018_R2.fastq.gz,test/data/refs/reference-22-331.fna,22_331_assembly,xan_test,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
+pram1,test/data/reads/lane6-s013-indexRPI27-ATTCCT-7612-C7_S13_L006_R1_001.fastq.gz,test/data/reads/lane6-s013-indexRPI27-ATTCCT-7612-C7_S13_L006_R2_001.fastq.gz,test/data/refs/PR-102_v3.1.fasta.gz,PR-102_v3.1,pram_test,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
+pram2,test/data/reads/lane6-s008-indexRPI16-CCGTCC-7612-D1_S8_L006_R1_001.fastq.gz,test/data/reads/lane6-s008-indexRPI16-CCGTCC-7612-D1_S8_L006_R2_001.fastq.gz,test/data/refs/PR-102_v3.1.fasta.gz,PR-102_v3.1,pram_test,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
diff --git a/workflows/plantpathsurveil.nf b/workflows/plantpathsurveil.nf
@@ -45,6 +45,7 @@ include { CORE_GENOME_PHYLOGENY    } from '../subworkflows/local/core_genome_phy
 include { VARIANT_CALLING_ANALYSIS } from '../subworkflows/local/variant_calling_analysis'
 include { DOWNLOAD_REFERENCES      } from '../subworkflows/local/download_references'
 include { ASSIGN_REFERENCES        } from '../subworkflows/local/assign_references'
+include { GENOME_ASSEMBLY          } from '../subworkflows/local/genome_assembly'
 
 
 /*
@@ -78,7 +79,7 @@ workflow PLANTPATHSURVEIL {
     INPUT_CHECK (
         ch_input
     )
-    ch_reads = INPUT_CHECK.out.sample_data
+    ch_reads = INPUT_CHECK.out.sample_data // [val(meta), [file(fastq)], val(ref_meta), file(reference), val(group_meta)]
         .map { it[0..1] }
         .distinct()
     ch_versions = ch_versions.mix(INPUT_CHECK.out.versions)
@@ -117,17 +118,20 @@ workflow PLANTPATHSURVEIL {
     )
 
     // Assemble and annotate bacterial genomes
-    //GENOME_ASSEMBLY (                                                           
-    //    ch_reads                                                                
-    //    .join(ch_reference)                                                     
-    //)                                                                           
-    //ch_versions = ch_versions.mix(GENOME_ASSEMBLY.out.versions)                 
+    GENOME_ASSEMBLY (                                                           
+        ASSIGN_REFERENCES.out.sample_data                           
+            .combine(COARSE_SAMPLE_TAXONOMY.out.kingdom, by: 0)                                                   
+    )                                                                           
+    ch_versions = ch_versions.mix(GENOME_ASSEMBLY.out.versions)                 
 
     // Create core gene phylogeny for bacterial samples
-    //CORE_GENOME_PHYLOGENY (                                                     
-    //    GENOME_ASSEMBLY.out.gff.join(ch_reference),                             
-    //    ch_samplesheet                                                          
-    //)                                                                           
+    gff_and_group = ASSIGN_REFERENCES.out.sample_data  // [val(meta), [file(fastq)], val(ref_meta), file(reference), val(group_meta)]
+        .combine(GENOME_ASSEMBLY.out.gff, by: 0) // [val(meta), [file(fastq)], val(ref_meta), file(reference), val(group_meta), file(gff)]
+        .map { [it[0], it[5], it[4]] } // [ val(meta), file(gff), val(group_meta) ]            
+    CORE_GENOME_PHYLOGENY (                                                     
+        gff_and_group,                             
+        ch_input                                                          
+    )                                                                           
 
     // Read2tree phylogeny for eukaryotes
     //READ2TREE_ANALYSIS (