diff --git a/workflow/rules/qc_3prime.smk b/workflow/rules/qc_3prime.smk
index e1d048ef..2d5eac59 100644
--- a/workflow/rules/qc_3prime.smk
+++ b/workflow/rules/qc_3prime.smk
@@ -1,8 +1,7 @@
 rule get_selected_transcripts_aligned_read_bins:
     input:
         aligned_file="results/QC/{sample}-{unit}.aligned.txt",
-        samtools_sort="results/kallisto-bam-sorted/{sample}-{unit}-pseudoalignments.sorted.bam",
-        samtools_index="results/kallisto-bam-sorted/{sample}-{unit}-pseudoalignments.sorted.bam.bai",
+        canonical_ids="resources/canonical_ids.bed",
         read_length="results/stats/max-read-length.json",
     output:
         fwrd_allsamp_hist_fil=temp(
diff --git a/workflow/scripts/get-3prime-max-positions.py b/workflow/scripts/get-3prime-max-positions.py
index e75dbbf5..f5f04949 100644
--- a/workflow/scripts/get-3prime-max-positions.py
+++ b/workflow/scripts/get-3prime-max-positions.py
@@ -11,22 +11,22 @@
 bam_file = pysam.AlignmentFile(snakemake.input["canonical_mapped_bam"], "rb")
 bam_header = bam_file.header.to_dict()
 trans_length_data = pd.DataFrame(bam_header.get("SQ"))
-trans_length_data.rename(columns={"SN": "Transcript_ID"}, inplace=True)
+trans_length_data.rename(columns={"SN": "transcript"}, inplace=True)
 
 # Aligned text file reading
 align_bam_txt = pd.read_csv(
     snakemake.input["canonical_mapped_pos"],
     sep="\t",
-    names=["read_name", "Transcript_ID", "Start", "read", "Quality"],
+    names=["read_name", "transcript", "start", "read", "quality"],
 )
-align_bam_txt["Strand"] = align_bam_txt["Transcript_ID"].str.split("_", 1).str[1]
-align_bam_txt["Transcript"] = align_bam_txt["Transcript_ID"].str.split("_", 1).str[0]
-merge_data = align_bam_txt.merge(trans_length_data, on="Transcript_ID")
+align_bam_txt["strand"] = align_bam_txt["transcript"].str.split("_", 1).str[1]
+align_bam_txt["Transcript"] = align_bam_txt["transcript"].str.split("_", 1).str[0]
+merge_data = align_bam_txt.merge(trans_length_data, on="transcript")
 
 # reads aligned to forward strand
-forward_strand = merge_data.loc[merge_data["Strand"] == "1"]
+forward_strand = merge_data.loc[merge_data["strand"] == "1"]
 forward_strand[sample_name + "_forward_strand"] = (
-    forward_strand["LN"] - forward_strand["Start"]
+    forward_strand["transcript_length"] - forward_strand["start"]
 )
 aligned_reads = forward_strand.loc[
     forward_strand.groupby(["read_name", "read"])[
         sample_name + "_forward_strand"
     ].idxmin()
@@ -35,9 +35,9 @@
 ]
 
 # reads aligned to reverse strand
-reverse_strand = merge_data.loc[merge_data["Strand"] == "-1"]
+reverse_strand = merge_data.loc[merge_data["strand"] == "-1"]
 read_min = reverse_strand.loc[
-    reverse_strand.groupby(["read_name", "read"])["Start"].idxmin()
+    reverse_strand.groupby(["read_name", "read"])["start"].idxmin()
 ]
 aligned_reads = pd.concat([aligned_reads, read_min])
 aligned_reads.to_csv(
diff --git a/workflow/scripts/get-sample-hist-bins.py b/workflow/scripts/get-sample-hist-bins.py
index a852bff7..e5749bb0 100644
--- a/workflow/scripts/get-sample-hist-bins.py
+++ b/workflow/scripts/get-sample-hist-bins.py
@@ -32,40 +32,44 @@
 bam_file = pysam.AlignmentFile(snakemake.input["samtools_sort"], "rb")
 bam_header = bam_file.header.to_dict()
 trans_length_data = pd.DataFrame(bam_header.get("SQ"))
-trans_length_data.rename(columns={"SN": "Transcript_ID"}, inplace=True)
+trans_length_data.rename(columns={"SN": "transcript"}, inplace=True)
+
+# BED file reading
+trans_length_data = pd.read_csv(
+    snakemake.input["canonical_ids"],
+    sep="\t",
+    names=["transcript", "transcript_start", "transcript_length", "strand"],
+).drop(columns = ["transcript_start"])
 
 # Aligned text file reading
 align_bam_txt = pd.read_csv(
     snakemake.input["aligned_file"],
     sep="\t",
-    names=["read_Name", "Transcript_ID", "Start", "reads", "Quality"],
+    names=["read_Name", "transcript", "start", "reads", "quality"],
 )
-align_bam_txt["Strand"] = align_bam_txt["Transcript_ID"].str.split("_", 1).str[1]
-align_bam_txt["Transcript"] = align_bam_txt["Transcript_ID"].str.split("_", 1).str[0]
-
 # Both transcript len and start postion are merged based on same transcript ID
-merge_data = align_bam_txt.merge(trans_length_data, on="Transcript_ID")
+merge_data = align_bam_txt.merge(trans_length_data, on="transcript")
 
 # Forward strand
-forward_strand = merge_data.loc[merge_data["Strand"] == "1"]
+forward_strand = merge_data.loc[merge_data["strand"] == "1"]
 # Each read postion is calcuated
 forward_strand[sample_name + "_forward_strand"] = (
-    forward_strand["LN"] - forward_strand["Start"]
+    forward_strand["transcript_length"] - forward_strand["start"]
 )
 aligned_reads = forward_strand.loc[
     forward_strand.groupby(["read_Name", "reads"])[
         sample_name + "_forward_strand"
     ].idxmin()
 ]
-if aligned_reads["Transcript"].str.contains(transcript_ids).any():
+if aligned_reads["transcript"].str.contains(transcript_ids).any():
     # Get aligned read postion of the given transcript
-    fwrd_filtered_transcript_data = aligned_reads.query("Transcript == @transcript_ids")
+    fwrd_filtered_transcript_data = aligned_reads.query("transcript == @transcript_ids")
     Freq_fwrd_fil, bins_fwrd_fil = np.histogram(
         fwrd_filtered_transcript_data[sample_name + "_forward_strand"],
         bins=read_length,
-        range=[0, max(fwrd_filtered_transcript_data["LN"])],
+        range=[0, max(fwrd_filtered_transcript_data["transcript_length"])],
     )
     hist_fwrd_fil = pd.DataFrame(
         {
@@ -80,7 +84,7 @@
     Freq_fwrd, bins_fwrd = np.histogram(
         aligned_reads[sample_name + "_forward_strand"],
         bins=read_length,
-        range=[0, max(aligned_reads["LN"])],
+        range=[0, max(aligned_reads["transcript_length"])],
     )
     Freq_fwrd_trim, bins_fwrd_trim = np.histogram(
         forward_strand[sample_name + "_forward_strand"],
@@ -113,20 +117,20 @@
 )
 
 # Reverse strand
-reverse_strand = merge_data.loc[merge_data["Strand"] == "-1"]
+reverse_strand = merge_data.loc[merge_data["strand"] == "-1"]
 # Minimum aligned start postion is taken
 read_min = reverse_strand.loc[
-    reverse_strand.groupby(["read_Name", "reads"])["Start"].idxmin()
+    reverse_strand.groupby(["read_Name", "reads"])["start"].idxmin()
 ]
-if read_min["Transcript"].str.contains(transcript_ids).any():
+if read_min["transcript"].str.contains(transcript_ids).any():
     # Get aligned read postion of the given transcript
-    rev_filtered_transcript_data = read_min.query("Transcript == @transcript_ids")
+    rev_filtered_transcript_data = read_min.query("transcript == @transcript_ids")
     Freq_rev_fil, bins_rev_fil = np.histogram(
-        rev_filtered_transcript_data["Start"],
+        rev_filtered_transcript_data["start"],
         bins=read_length,
-        range=[0, max(rev_filtered_transcript_data["LN"])],
+        range=[0, max(rev_filtered_transcript_data["transcript_length"])],
     )
     hist_rev_fil = pd.DataFrame(
         {
@@ -140,10 +144,10 @@
 elif transcript_ids == "all":
     # Values added to corresponding bins
     Freq_rev, bins_rev = np.histogram(
-        read_min["Start"], bins=read_length, range=[0, max(read_min["LN"])]
+        read_min["start"], bins=read_length, range=[0, max(read_min["transcript_length"])]
     )
     Freq_rev_trim, bins_rev_trim = np.histogram(
-        read_min["Start"], bins=read_length, range=[0, 20000]
+        read_min["start"], bins=read_length, range=[0, 20000]
     )
 
     # Dataframe created for bins
diff --git a/workflow/scripts/get_canonical_ids.R b/workflow/scripts/get_canonical_ids.R
index bf1c250e..c7b17096 100644
--- a/workflow/scripts/get_canonical_ids.R
+++ b/workflow/scripts/get_canonical_ids.R
@@ -30,16 +30,14 @@
 canonical_ids <- get_transcripts_ids %>%
   # add columns necessary for valid BED file and sort accordingly, see:
   # https://bedtools.readthedocs.io/en/latest/content/general-usage.html
   add_column(
-    start = 0,
-    name = "",
-    score = ""
+    start = 0
   ) %>%
+  # Here, we hijack the `name` field of the BED format to encode the strand,
+  # as biomart gives the strand as `1` vs `-1`, as opposed to BED's `+` vs. `-`.
   select(
     ensembl_transcript_id_version,
     start,
     transcript_length,
-    name,
-    score,
     strand
   )
diff --git a/workflow/scripts/plot-ind-sample-QC-histogram.py b/workflow/scripts/plot-ind-sample-QC-histogram.py
index 34109364..efcfe485 100644
--- a/workflow/scripts/plot-ind-sample-QC-histogram.py
+++ b/workflow/scripts/plot-ind-sample-QC-histogram.py
@@ -33,32 +33,32 @@
     bam_file = pysam.AlignmentFile('results/ind_transcripts/' + sample_name + '-pseudoalignments.sorted.bam',"rb")
     bam_header = bam_file.header.to_dict()
     get_trans_length = pd.DataFrame(bam_header.get('SQ'))
-    get_trans_length.rename(columns={'SN': 'Transcript_ID'}, inplace=True)
+    get_trans_length.rename(columns={'SN': 'transcript'}, inplace=True)
 
     # Aligned text file reading
     align_bam_txt = pd.read_csv('results/QC/' + sample_name + '.aligned.txt',
-                sep="\t",names=["read_Name","Transcript_ID", "Start","reads", "Quality"])
-    align_bam_txt["Strand"] =align_bam_txt['Transcript_ID'].str.split('_', 1).str[1]
-    align_bam_txt["Transcript"] =align_bam_txt['Transcript_ID'].str.split('_', 1).str[0]
+                sep="\t",names=["read_Name","transcript", "start","reads", "quality"])
+    align_bam_txt["strand"] =align_bam_txt['transcript'].str.split('_', 1).str[1]
+    align_bam_txt["Transcript"] =align_bam_txt['transcript'].str.split('_', 1).str[0]
 
     # merging aligned bam text file based on transcript id from bam file
-    merge_align_trans_data = align_bam_txt.merge(get_trans_length, on='Transcript_ID')
+    merge_align_trans_data = align_bam_txt.merge(get_trans_length, on='transcript')
     filtered_transcript_data = merge_align_trans_data.query("Transcript == @transcript_ids")
 
     # Forward strand
-    if ((filtered_transcript_data['Strand']=='1')).any():
+    if ((filtered_transcript_data['strand']=='1')).any():
         fwrd_flag = 1
-        filtered_transcript_data[sample_name + '_forward_strand'] = filtered_transcript_data['LN'] - filtered_transcript_data['Start']
+        filtered_transcript_data[sample_name + '_forward_strand'] = filtered_transcript_data['transcript_length'] - filtered_transcript_data['start']
         aligned_reads = filtered_transcript_data.loc[filtered_transcript_data.groupby(['read_Name','reads'])[sample_name + '_forward_strand'].idxmin()]
-        Freq_fwrd, bins_fwrd = np.histogram(aligned_reads[sample_name + '_forward_strand'], bins = read_length, range=[0,max(aligned_reads['LN'])])
+        Freq_fwrd, bins_fwrd = np.histogram(aligned_reads[sample_name + '_forward_strand'], bins = read_length, range=[0,max(aligned_reads['transcript_length'])])
         hist_fwrd = pd.DataFrame({'sample_Name': sample_name, 'Freq_forward': Freq_fwrd, 'bins_foward': bins_fwrd[:-1]})
         fwrd_allsamp_hist = pd.concat([fwrd_allsamp_hist, hist_fwrd])
-    elif ((filtered_transcript_data['Strand']=='-1')).any():
+    elif ((filtered_transcript_data['strand']=='-1')).any():
         # reverse strand
         rev_flag = 1
-        reverse_strand = filtered_transcript_data.loc[filtered_transcript_data['Strand'] == '-1']
-        read_min = reverse_strand.loc[reverse_strand.groupby(['read_Name','reads'])['Start'].idxmin()]
-        Freq_rev, bins_rev = np.histogram(read_min['Start'], bins =read_length, range=[0, max(read_min['LN'])])
+        reverse_strand = filtered_transcript_data.loc[filtered_transcript_data['strand'] == '-1']
+        read_min = reverse_strand.loc[reverse_strand.groupby(['read_Name','reads'])['start'].idxmin()]
+        Freq_rev, bins_rev = np.histogram(read_min['start'], bins =read_length, range=[0, max(read_min['transcript_length'])])
         hist_rev = pd.DataFrame({'sample_Name': sample_name, 'Freq_rev': Freq_rev,'bins_rev': bins_rev[:-1]})
         rev_allsamp_hist = pd.concat([rev_allsamp_hist, hist_rev])