rnajena · gabriellovate · Apr 30, 2024 · Apr 30, 2024
diff --git a/bin/deduplicate_annotations.py b/bin/deduplicate_annotations.py
diff --git a/bin/helper.py b/bin/helper.py
@@ -5,6 +5,7 @@
 from matplotlib.colors import LogNorm
 import pandas as pd
 
+
 def parse_fasta(fasta_file):
     """Parses a fasta file and returns a dictionary of segment names and sequences.
 
@@ -27,24 +28,25 @@ def parse_fasta(fasta_file):
     with open(fasta_file) as file:  # read genome file
         for line in file:  # parse genome file
             if line.startswith(">"):  # parse header
-            #* if there is a header already, we store the current sequence
-            #* to this header.
+                # * if there is a header already, we store the current sequence
+                # * to this header.
                 if header:
                     fasta_dict[header] = seq
-                    #* then we flush the sequence
+                    # * then we flush the sequence
                     seq = ""
-                #* and parse the new header
+                # * and parse the new header
                 header = line.strip()[1:]
             else:
-                #* if no header is detected, we append the new line
-                #* to the current sequence
+                # * if no header is detected, we append the new line
+                # * to the current sequence
                 seq += line.strip()
-        #* after the last line, we have to store
-        #* the last sequence as well. Since no new line with ">" occurs, we
-        #* do this manually
+        # * after the last line, we have to store
+        # * the last sequence as well. Since no new line with ">" occurs, we
+        # * do this manually
         fasta_dict[header] = seq
     return fasta_dict
 
+
 def make_combination_array(genome_dict, intra_only=False):
     """
     Creates a dictionary of numpy array of all possible genome segment combinations.
@@ -72,21 +74,24 @@ def make_combination_array(genome_dict, intra_only=False):
     # * the iterator to a list. Actually, we also could just put the iterator in the for loop.
     # * should work as well. Is a tad more memory efficient.
     if intra_only:
-        segment_combinations = list(itertools.combinations_with_replacement(segments, 2))
+        segment_combinations = list(
+            itertools.combinations_with_replacement(segments, 2)
+        )
         segment_combinations = [
             segment_combination
             for segment_combination in segment_combinations
             if segment_combination[0] == segment_combination[1]
         ]
     else:
-        segment_combinations = list(itertools.combinations_with_replacement(segments, 2))
+        segment_combinations = list(
+            itertools.combinations_with_replacement(segments, 2)
+        )
         segment_combinations = [
             segment_combination
             for segment_combination in segment_combinations
             if segment_combination[0] != segment_combination[1]
         ]
 
-
     for segment_combination in segment_combinations:
         # for segment_combination in itertools.combinations_with_replacement(segments,2): # * this should work as well
         combination_arrays[segment_combination] = np.zeros(
@@ -98,38 +103,6 @@ def make_combination_array(genome_dict, intra_only=False):
     return combination_arrays
 
 
-def parse_annotation_table(annotation_table):
-    """
-    Parses an annotation table and returns a dictionary of segment names and annotations.
-    The annotation table must have the following columns: id,segment01,start01,end01,segment02,start02,end02
-
-    Parameters
-    ----------
-    annotation_table : str
-        Path to annotation table.
-
-    Returns
-    -------
-    annotation_dict : dict
-        Dictionary of segment names and annotations.
-
-    """
-    annotation_dict = {}
-    with open(annotation_table) as file:
-        for line in file:
-            if not line.startswith("id"):
-                line = line.strip().split("\t")
-                annotation_dict[line[0]] = {
-                    "segment01": line[1],
-                    "start01": int(line[2]),
-                    "end01": int(line[3]),
-                    "segment02": line[4],
-                    "start02": int(line[5]),
-                    "end02": int(line[6]),
-                }
-    return annotation_dict
-
-
 def positive_to_negative_strand_point(genome_dict, segment, position):
     """Transpose a point from the positive to the negative strand.
 
@@ -147,6 +120,7 @@ def positive_to_negative_strand_point(genome_dict, segment, position):
     # Get the position on the negative strand
     return aLen - position + 1
 
+
 def negative_to_positive_strand(genome_dict, aSeq, cai, caj, bSeq, cbi, cbj):
     """Transpose the region of interest from the negative to the positive strand.
 
@@ -178,6 +152,7 @@ def negative_to_positive_strand(genome_dict, aSeq, cai, caj, bSeq, cbi, cbj):
 
     return aSeq, cai_pos, caj_pos, bSeq, cbi_pos, cbj_pos
 
+
 def positive_to_negative_strand(genome_dict, aSeq, cai, caj, bSeq, cbi, cbj):
     """Transpose the region of interest from the positive to the negative strand.
 
@@ -238,12 +213,10 @@ def parse_annotation_table(annotation_table):
             "end02",
         ]
         regions = pd.read_csv(annotation_table, names=header)
-         # add id column
+        # add id column
         regions["id"] = regions.index
     elif annotation_table.lower().endswith(".tsv"):
         regions = pd.read_csv(annotation_table, sep="\t")
     else:
-        raise ValueError(
-            "Annotation table must be either a csv file or a xlsx file."
-        )
-    return regions
+        raise ValueError("Annotation table must be either a csv file or a xlsx file.")
+    return regions
diff --git a/bin/make_circos_files.py b/bin/make_circos_files.py
@@ -448,7 +448,10 @@ def main():
         DESeq2_results = DESeq2_results.rename(columns={"Unnamed: 0": "id"})
 
         # Read annotation table
-        annotation_table = hp.parse_annotation_table(annotation_table)
+        annotation_table = pd.read_csv(annotation_table, header=0, index_col=0, sep="\t")
+
+        # Remove rows in DESeq2 results that are not in the annotation table
+        DESeq2_results = DESeq2_results[DESeq2_results["id"].isin(annotation_table.index)]
 
         # Parse genome segment names and sequences
         genome_dict = hp.parse_fasta(genome_file)

diff --git a/main.nf b/main.nf
@@ -73,6 +73,7 @@ include { fillArrays; mergeArrays } from './modules/handle_arrays.nf'
 include { plotHeatmaps as plotHeatmapsRaw } from './modules/data_visualization.nf'
 include { plotHeatmaps as plotHeatmapsMerged } from './modules/data_visualization.nf'
 include { plotHeatmapsAnnotated } from './modules/data_visualization.nf'
+include { plotHeatmapsAnnotated as plotHeatmapsAnnotatedDedup } from './modules/data_visualization.nf'
 include { annotateArrays; mergeAnnotations } from './modules/annotate_interactions.nf'
 // differential analysis
 include { generateCountTables; mergeCountTables; runDESeq2 } from './modules/differential_analysis.nf'
@@ -214,6 +215,7 @@ workflow {
     )
 
     // Generate count tables
+    annotated_trns_ch.view()
     count_tables_ch = generateCountTables( annotated_trns_ch )
     merged_count_tables_ch = mergeCountTables(
         count_tables_ch
@@ -232,10 +234,17 @@ workflow {
     // Deduplicate annotations
     deduplicate_annotations_input_ch = merged_count_tables_all_ch // group_name, merged_count_table
             .combine( mergeAnnotations.out ) // merged_annotations
-            .map( it -> [ it[0], it[2], it[3] ] ) // group name, count table, annotations
-    deduplicate_annotations_input_ch.view()
+            .map( it -> [ it[0], it[2], it[1] ] ) // group name, count table, annotations
+    deduplicate_annotations_input_ch
     deduplicateAnnotations( deduplicate_annotations_input_ch )
 
+    // Plot deduplicated annotations on the heatmaps
+    dedup_heatmaps_ch = annotated_arrays_ch
+        .map( it -> [ it[0], it[1], it[2], it[3] ] ) // sample name, genome, array, annotations
+        .combine( deduplicateAnnotations.out )
+    dedup_heatmaps_ch.view()
+    plotHeatmapsAnnotatedDedup( dedup_heatmaps_ch )
+
     // Run differential analysis with DESeq2
     samples_input_ch = Channel
             .fromPath( params.comparisons, checkIfExists: true )
@@ -244,6 +253,7 @@ workflow {
             .map( it -> [ it[1], it[0], it[2] ] )
             .combine( merged_count_tables_ch, by: 0 )
             .map( it -> [ it[1], it[2], it[0], it[3] ] )
+            .view()
 
     runDESeq2( samples_input_ch )
 
@@ -271,7 +281,7 @@ workflow {
                         .map( it -> [ it[0], it[2], it[1] ] )
                         .combine( deduplicateAnnotations.out )
     }
-
+    circos_deseq2_ch.view()
     // Create circos tables
     makeCircosTable_deseq2( circos_deseq2_ch )
     makeCircosTable_count_table( circos_count_table_ch )

diff --git a/modules/annotate_interactions.nf b/modules/annotate_interactions.nf
@@ -50,7 +50,7 @@ process deduplicateAnnotations {
     tuple val(group), path(annotation_table), path(count_table)
 
     output:
-    tuple path("deduplicated_annotations/deduplicated_annotation_table.tsv"), path("deduplicated_annotations/deduplicated_count_table.tsv")
+    tuple val(group), path("deduplicated_annotations/annotation_table_deduplicated.tsv"), path("deduplicated_annotations/count_table_deduplicated.tsv")
 
     publishDir "${params.output}/06-annotations" , mode: 'copy'
 

diff --git a/modules/data_visualization.nf b/modules/data_visualization.nf
@@ -50,68 +50,14 @@ process makeCircosTable_count_table {
     label 'RNAswarm_small'
 
     input:
-    tuple val(genome_name), path(genome), path(count_table), path(annotation_table)
+    tuple val(genome_name), path(genome), path(genome_count_table), val(group_name), path(annotation_table), path(global_count_table)
 
     output:
     tuple val(genome_name), path("${genome_name}_circos")
 
     script:
     """
-    make_circos_files.py -c ${count_table} -a ${annotation_table} -g ${genome} -o ${genome_name}_circos
-    """
-}
-
-/*************************************************************************
-* make circos table for count_table results
-*************************************************************************/
-process makeCircosTable_count_table_30 {
-    label 'RNAswarm_small'
-
-    input:
-    tuple val(genome_name), path(genome), path(count_table), path(annotation_table)
-
-    output:
-    tuple val(genome_name), path("${genome_name}_circos_30")
-
-    script:
-    """
-    make_circos_files.py -c ${count_table} -a ${annotation_table} -g ${genome} -o ${genome_name}_circos_30 --number_of_top_hits 30
-    """
-}
-
-/*************************************************************************
-* make circos table for count_table results
-*************************************************************************/
-process makeCircosTable_count_table_40 {
-    label 'RNAswarm_small'
-
-    input:
-    tuple val(genome_name), path(genome), path(count_table), path(annotation_table)
-
-    output:
-    tuple val(genome_name), path("${genome_name}_circos_40")
-
-    script:
-    """
-    make_circos_files.py -c ${count_table} -a ${annotation_table} -g ${genome} -o ${genome_name}_circos_40 --number_of_top_hits 40
-    """
-}
-
-/*************************************************************************
-* make circos table for count_table results
-*************************************************************************/
-process makeCircosTable_count_table_50 {
-    label 'RNAswarm'
-
-    input:
-    tuple val(genome_name), path(genome), path(count_table), path(annotation_table)
-
-    output:
-    tuple val(genome_name), path("${genome_name}_circos_50")
-
-    script:
-    """
-    make_circos_files.py -c ${count_table} -a ${annotation_table} -g ${genome} -o ${genome_name}_circos_50 --number_of_top_hits 50
+    make_circos_files.py -c ${genome_count_table} -a ${annotation_table} -g ${genome} -o ${genome_name}_circos
     """
 }
 
@@ -122,7 +68,7 @@ process makeCircosTable_deseq2 {
     label 'RNAswarm_small'
 
     input:
-    tuple val(genome_name_01), path(genome_01), val(genome_name_02), path(genome_02), path(results_DESeq2), path(annotation_table)
+    tuple val(genome_name_01), path(genome_01), val(genome_name_02), path(genome_02), path(results_DESeq2), val(group_name), path(annotation_table), path(global_count_table)
 
     output:
     tuple val(genome_name_01), val(genome_name_02), path("${genome_name_01}_${genome_name_02}_circos")

diff --git a/modules/differential_analysis.nf b/modules/differential_analysis.nf
@@ -19,6 +19,7 @@ process generateCountTables {
     """
 }
 
+
 /*************************************************************************
 # Merge count tables
 *************************************************************************/