Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Main #31

Merged
merged 1 commit into from
Apr 30, 2024
Merged

Main #31

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file modified bin/deduplicate_annotations.py
100644 → 100755
Empty file.
71 changes: 22 additions & 49 deletions bin/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from matplotlib.colors import LogNorm
import pandas as pd


def parse_fasta(fasta_file):
"""Parses a fasta file and returns a dictionary of segment names and sequences.

Expand All @@ -27,24 +28,25 @@ def parse_fasta(fasta_file):
with open(fasta_file) as file: # read genome file
for line in file: # parse genome file
if line.startswith(">"): # parse header
#* if there is a header already, we store the current sequence
#* to this header.
# * if there is a header already, we store the current sequence
# * to this header.
if header:
fasta_dict[header] = seq
#* then we flush the sequence
# * then we flush the sequence
seq = ""
#* and parse the new header
# * and parse the new header
header = line.strip()[1:]
else:
#* if no header is detected, we append the new line
#* to the current sequence
# * if no header is detected, we append the new line
# * to the current sequence
seq += line.strip()
#* after the last line, we have to store
#* the last sequence as well. Since no new line with ">" occurs, we
#* do this manually
# * after the last line, we have to store
# * the last sequence as well. Since no new line with ">" occurs, we
# * do this manually
fasta_dict[header] = seq
return fasta_dict


def make_combination_array(genome_dict, intra_only=False):
"""
Creates a dictionary of numpy array of all possible genome segment combinations.
Expand Down Expand Up @@ -72,21 +74,24 @@ def make_combination_array(genome_dict, intra_only=False):
# * the iterator to a list. Actually, we also could just put the iterator in the for loop.
# * should work as well. Is a tad more memory efficient.
if intra_only:
segment_combinations = list(itertools.combinations_with_replacement(segments, 2))
segment_combinations = list(
itertools.combinations_with_replacement(segments, 2)
)
segment_combinations = [
segment_combination
for segment_combination in segment_combinations
if segment_combination[0] == segment_combination[1]
]
else:
segment_combinations = list(itertools.combinations_with_replacement(segments, 2))
segment_combinations = list(
itertools.combinations_with_replacement(segments, 2)
)
segment_combinations = [
segment_combination
for segment_combination in segment_combinations
if segment_combination[0] != segment_combination[1]
]


for segment_combination in segment_combinations:
# for segment_combination in itertools.combinations_with_replacement(segments,2): # * this should work as well
combination_arrays[segment_combination] = np.zeros(
Expand All @@ -98,38 +103,6 @@ def make_combination_array(genome_dict, intra_only=False):
return combination_arrays


def parse_annotation_table(annotation_table):
"""
Parses an annotation table and returns a dictionary of segment names and annotations.
The annotation table must have the following columns: id,segment01,start01,end01,segment02,start02,end02

Parameters
----------
annotation_table : str
Path to annotation table.

Returns
-------
annotation_dict : dict
Dictionary of segment names and annotations.

"""
annotation_dict = {}
with open(annotation_table) as file:
for line in file:
if not line.startswith("id"):
line = line.strip().split("\t")
annotation_dict[line[0]] = {
"segment01": line[1],
"start01": int(line[2]),
"end01": int(line[3]),
"segment02": line[4],
"start02": int(line[5]),
"end02": int(line[6]),
}
return annotation_dict


def positive_to_negative_strand_point(genome_dict, segment, position):
"""Transpose a point from the positive to the negative strand.

Expand All @@ -147,6 +120,7 @@ def positive_to_negative_strand_point(genome_dict, segment, position):
# Get the position on the negative strand
return aLen - position + 1


def negative_to_positive_strand(genome_dict, aSeq, cai, caj, bSeq, cbi, cbj):
"""Transpose the region of interest from the negative to the positive strand.

Expand Down Expand Up @@ -178,6 +152,7 @@ def negative_to_positive_strand(genome_dict, aSeq, cai, caj, bSeq, cbi, cbj):

return aSeq, cai_pos, caj_pos, bSeq, cbi_pos, cbj_pos


def positive_to_negative_strand(genome_dict, aSeq, cai, caj, bSeq, cbi, cbj):
"""Transpose the region of interest from the positive to the negative strand.

Expand Down Expand Up @@ -238,12 +213,10 @@ def parse_annotation_table(annotation_table):
"end02",
]
regions = pd.read_csv(annotation_table, names=header)
# add id column
# add id column
regions["id"] = regions.index
elif annotation_table.lower().endswith(".tsv"):
regions = pd.read_csv(annotation_table, sep="\t")
else:
raise ValueError(
"Annotation table must be either a csv file or a xlsx file."
)
return regions
raise ValueError("Annotation table must be either a csv file or a xlsx file.")
return regions
5 changes: 4 additions & 1 deletion bin/make_circos_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,10 @@ def main():
DESeq2_results = DESeq2_results.rename(columns={"Unnamed: 0": "id"})

# Read annotation table
annotation_table = hp.parse_annotation_table(annotation_table)
annotation_table = pd.read_csv(annotation_table, header=0, index_col=0, sep="\t")

# Remove rows in DESeq2 results that are not in the annotation table
DESeq2_results = DESeq2_results[DESeq2_results["id"].isin(annotation_table.index)]

# Parse genome segment names and sequences
genome_dict = hp.parse_fasta(genome_file)
Expand Down
16 changes: 13 additions & 3 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ include { fillArrays; mergeArrays } from './modules/handle_arrays.nf'
include { plotHeatmaps as plotHeatmapsRaw } from './modules/data_visualization.nf'
include { plotHeatmaps as plotHeatmapsMerged } from './modules/data_visualization.nf'
include { plotHeatmapsAnnotated } from './modules/data_visualization.nf'
include { plotHeatmapsAnnotated as plotHeatmapsAnnotatedDedup } from './modules/data_visualization.nf'
include { annotateArrays; mergeAnnotations } from './modules/annotate_interactions.nf'
// differential analysis
include { generateCountTables; mergeCountTables; runDESeq2 } from './modules/differential_analysis.nf'
Expand Down Expand Up @@ -214,6 +215,7 @@ workflow {
)

// Generate count tables
annotated_trns_ch.view()
count_tables_ch = generateCountTables( annotated_trns_ch )
merged_count_tables_ch = mergeCountTables(
count_tables_ch
Expand All @@ -232,10 +234,17 @@ workflow {
// Deduplicate annotations
deduplicate_annotations_input_ch = merged_count_tables_all_ch // group_name, merged_count_table
.combine( mergeAnnotations.out ) // merged_annotations
.map( it -> [ it[0], it[2], it[3] ] ) // group name, count table, annotations
deduplicate_annotations_input_ch.view()
.map( it -> [ it[0], it[2], it[1] ] ) // group name, count table, annotations
deduplicate_annotations_input_ch
deduplicateAnnotations( deduplicate_annotations_input_ch )

// Plot deduplicated annotations on the heatmaps
dedup_heatmaps_ch = annotated_arrays_ch
.map( it -> [ it[0], it[1], it[2], it[3] ] ) // sample name, genome, array, annotations
.combine( deduplicateAnnotations.out )
dedup_heatmaps_ch.view()
plotHeatmapsAnnotatedDedup( dedup_heatmaps_ch )

// Run differential analysis with DESeq2
samples_input_ch = Channel
.fromPath( params.comparisons, checkIfExists: true )
Expand All @@ -244,6 +253,7 @@ workflow {
.map( it -> [ it[1], it[0], it[2] ] )
.combine( merged_count_tables_ch, by: 0 )
.map( it -> [ it[1], it[2], it[0], it[3] ] )
.view()

runDESeq2( samples_input_ch )

Expand Down Expand Up @@ -271,7 +281,7 @@ workflow {
.map( it -> [ it[0], it[2], it[1] ] )
.combine( deduplicateAnnotations.out )
}

circos_deseq2_ch.view()
// Create circos tables
makeCircosTable_deseq2( circos_deseq2_ch )
makeCircosTable_count_table( circos_count_table_ch )
Expand Down
2 changes: 1 addition & 1 deletion modules/annotate_interactions.nf
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ process deduplicateAnnotations {
tuple val(group), path(annotation_table), path(count_table)

output:
tuple path("deduplicated_annotations/deduplicated_annotation_table.tsv"), path("deduplicated_annotations/deduplicated_count_table.tsv")
tuple val(group), path("deduplicated_annotations/annotation_table_deduplicated.tsv"), path("deduplicated_annotations/count_table_deduplicated.tsv")

publishDir "${params.output}/06-annotations" , mode: 'copy'

Expand Down
60 changes: 3 additions & 57 deletions modules/data_visualization.nf
Original file line number Diff line number Diff line change
Expand Up @@ -50,68 +50,14 @@ process makeCircosTable_count_table {
label 'RNAswarm_small'

input:
tuple val(genome_name), path(genome), path(count_table), path(annotation_table)
tuple val(genome_name), path(genome), path(genome_count_table), val(group_name), path(annotation_table), path(global_count_table)

output:
tuple val(genome_name), path("${genome_name}_circos")

script:
"""
make_circos_files.py -c ${count_table} -a ${annotation_table} -g ${genome} -o ${genome_name}_circos
"""
}

/*************************************************************************
* make circos table for count_table results
*************************************************************************/
process makeCircosTable_count_table_30 {
label 'RNAswarm_small'

input:
tuple val(genome_name), path(genome), path(count_table), path(annotation_table)

output:
tuple val(genome_name), path("${genome_name}_circos_30")

script:
"""
make_circos_files.py -c ${count_table} -a ${annotation_table} -g ${genome} -o ${genome_name}_circos_30 --number_of_top_hits 30
"""
}

/*************************************************************************
* make circos table for count_table results
*************************************************************************/
process makeCircosTable_count_table_40 {
label 'RNAswarm_small'

input:
tuple val(genome_name), path(genome), path(count_table), path(annotation_table)

output:
tuple val(genome_name), path("${genome_name}_circos_40")

script:
"""
make_circos_files.py -c ${count_table} -a ${annotation_table} -g ${genome} -o ${genome_name}_circos_40 --number_of_top_hits 40
"""
}

/*************************************************************************
* make circos table for count_table results
*************************************************************************/
process makeCircosTable_count_table_50 {
label 'RNAswarm'

input:
tuple val(genome_name), path(genome), path(count_table), path(annotation_table)

output:
tuple val(genome_name), path("${genome_name}_circos_50")

script:
"""
make_circos_files.py -c ${count_table} -a ${annotation_table} -g ${genome} -o ${genome_name}_circos_50 --number_of_top_hits 50
make_circos_files.py -c ${genome_count_table} -a ${annotation_table} -g ${genome} -o ${genome_name}_circos
"""
}

Expand All @@ -122,7 +68,7 @@ process makeCircosTable_deseq2 {
label 'RNAswarm_small'

input:
tuple val(genome_name_01), path(genome_01), val(genome_name_02), path(genome_02), path(results_DESeq2), path(annotation_table)
tuple val(genome_name_01), path(genome_01), val(genome_name_02), path(genome_02), path(results_DESeq2), val(group_name), path(annotation_table), path(global_count_table)

output:
tuple val(genome_name_01), val(genome_name_02), path("${genome_name_01}_${genome_name_02}_circos")
Expand Down
1 change: 1 addition & 0 deletions modules/differential_analysis.nf
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ process generateCountTables {
"""
}


/*************************************************************************
# Merge count tables
*************************************************************************/
Expand Down