Skip to content

Commit

Permalink
fix: use BED file in QC plot data generation, clean up variable names…
Browse files Browse the repository at this point in the history
… in QC plot scripts
  • Loading branch information
dlaehnemann committed Aug 17, 2023
1 parent c71f286 commit abe45d3
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 48 deletions.
3 changes: 1 addition & 2 deletions workflow/rules/qc_3prime.smk
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
rule get_selected_transcripts_aligned_read_bins:
input:
aligned_file="results/QC/{sample}-{unit}.aligned.txt",
samtools_sort="results/kallisto-bam-sorted/{sample}-{unit}-pseudoalignments.sorted.bam",
samtools_index="results/kallisto-bam-sorted/{sample}-{unit}-pseudoalignments.sorted.bam.bai",
canonical_ids="resources/canonical_ids.bed",
read_length="results/stats/max-read-length.json",
output:
fwrd_allsamp_hist_fil=temp(
Expand Down
18 changes: 9 additions & 9 deletions workflow/scripts/get-3prime-max-positions.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,22 +11,22 @@
bam_file = pysam.AlignmentFile(snakemake.input["canonical_mapped_bam"], "rb")
bam_header = bam_file.header.to_dict()
trans_length_data = pd.DataFrame(bam_header.get("SQ"))
trans_length_data.rename(columns={"SN": "Transcript_ID"}, inplace=True)
trans_length_data.rename(columns={"SN": "transcript"}, inplace=True)

# Aligned text file reading
align_bam_txt = pd.read_csv(
snakemake.input["canonical_mapped_pos"],
sep="\t",
names=["read_name", "Transcript_ID", "Start", "read", "Quality"],
names=["read_name", "transcript", "start", "read", "quality"],
)
align_bam_txt["Strand"] = align_bam_txt["Transcript_ID"].str.split("_", 1).str[1]
align_bam_txt["Transcript"] = align_bam_txt["Transcript_ID"].str.split("_", 1).str[0]
merge_data = align_bam_txt.merge(trans_length_data, on="Transcript_ID")
align_bam_txt["strand"] = align_bam_txt["transcript"].str.split("_", 1).str[1]
align_bam_txt["Transcript"] = align_bam_txt["transcript"].str.split("_", 1).str[0]
merge_data = align_bam_txt.merge(trans_length_data, on="transcript")

# reads aligned to forward strand
forward_strand = merge_data.loc[merge_data["Strand"] == "1"]
forward_strand = merge_data.loc[merge_data["strand"] == "1"]
forward_strand[sample_name + "_forward_strand"] = (
forward_strand["LN"] - forward_strand["Start"]
forward_strand["transcript_length"] - forward_strand["start"]
)
aligned_reads = forward_strand.loc[
forward_strand.groupby(["read_name", "read"])[
Expand All @@ -35,9 +35,9 @@
]

# reads aligned to reverse strand
reverse_strand = merge_data.loc[merge_data["Strand"] == "-1"]
reverse_strand = merge_data.loc[merge_data["strand"] == "-1"]
read_min = reverse_strand.loc[
reverse_strand.groupby(["read_name", "read"])["Start"].idxmin()
reverse_strand.groupby(["read_name", "read"])["start"].idxmin()
]
aligned_reads = pd.concat([aligned_reads, read_min])
aligned_reads.to_csv(
Expand Down
44 changes: 24 additions & 20 deletions workflow/scripts/get-sample-hist-bins.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,40 +32,44 @@
bam_file = pysam.AlignmentFile(snakemake.input["samtools_sort"], "rb")
bam_header = bam_file.header.to_dict()
trans_length_data = pd.DataFrame(bam_header.get("SQ"))
trans_length_data.rename(columns={"SN": "Transcript_ID"}, inplace=True)
trans_length_data.rename(columns={"SN": "transcript"}, inplace=True)

# BED file reading
trans_length_data = pd.read_csv(
snakemake.input["canonical_ids"],
sep="\t",
names=["transcript", "transcript_start", "transcript_length", "strand"],
).drop(columns = ["transcript_start"])

# Aligned text file reading
align_bam_txt = pd.read_csv(
snakemake.input["aligned_file"],
sep="\t",
names=["read_Name", "Transcript_ID", "Start", "reads", "Quality"],
names=["read_Name", "transcript", "start", "reads", "quality"],
)
align_bam_txt["Strand"] = align_bam_txt["Transcript_ID"].str.split("_", 1).str[1]
align_bam_txt["Transcript"] = align_bam_txt["Transcript_ID"].str.split("_", 1).str[0]


# Both transcript length and start position are merged based on the same transcript ID
merge_data = align_bam_txt.merge(trans_length_data, on="Transcript_ID")
merge_data = align_bam_txt.merge(trans_length_data, on="transcript")

# Forward strand
forward_strand = merge_data.loc[merge_data["Strand"] == "1"]
forward_strand = merge_data.loc[merge_data["strand"] == "1"]

# Each read position is calculated
forward_strand[sample_name + "_forward_strand"] = (
forward_strand["LN"] - forward_strand["Start"]
forward_strand["transcript_length"] - forward_strand["start"]
)
aligned_reads = forward_strand.loc[
forward_strand.groupby(["read_Name", "reads"])[
sample_name + "_forward_strand"
].idxmin()
]
if aligned_reads["Transcript"].str.contains(transcript_ids).any():
if aligned_reads["transcript"].str.contains(transcript_ids).any():
# Get aligned read position of the given transcript
fwrd_filtered_transcript_data = aligned_reads.query("Transcript == @transcript_ids")
fwrd_filtered_transcript_data = aligned_reads.query("transcript == @transcript_ids")
Freq_fwrd_fil, bins_fwrd_fil = np.histogram(
fwrd_filtered_transcript_data[sample_name + "_forward_strand"],
bins=read_length,
range=[0, max(fwrd_filtered_transcript_data["LN"])],
range=[0, max(fwrd_filtered_transcript_data["transcript_length"])],
)
hist_fwrd_fil = pd.DataFrame(
{
Expand All @@ -80,7 +84,7 @@
Freq_fwrd, bins_fwrd = np.histogram(
aligned_reads[sample_name + "_forward_strand"],
bins=read_length,
range=[0, max(aligned_reads["LN"])],
range=[0, max(aligned_reads["transcript_length"])],
)
Freq_fwrd_trim, bins_fwrd_trim = np.histogram(
forward_strand[sample_name + "_forward_strand"],
Expand Down Expand Up @@ -113,20 +117,20 @@
)

# Reverse strand
reverse_strand = merge_data.loc[merge_data["Strand"] == "-1"]
reverse_strand = merge_data.loc[merge_data["strand"] == "-1"]

# Minimum aligned start position is taken
read_min = reverse_strand.loc[
reverse_strand.groupby(["read_Name", "reads"])["Start"].idxmin()
reverse_strand.groupby(["read_Name", "reads"])["start"].idxmin()
]

if read_min["Transcript"].str.contains(transcript_ids).any():
if read_min["transcript"].str.contains(transcript_ids).any():
# Get aligned read position of the given transcript
rev_filtered_transcript_data = read_min.query("Transcript == @transcript_ids")
rev_filtered_transcript_data = read_min.query("transcript == @transcript_ids")
Freq_rev_fil, bins_rev_fil = np.histogram(
rev_filtered_transcript_data["Start"],
rev_filtered_transcript_data["start"],
bins=read_length,
range=[0, max(rev_filtered_transcript_data["LN"])],
range=[0, max(rev_filtered_transcript_data["transcript_length"])],
)
hist_rev_fil = pd.DataFrame(
{
Expand All @@ -140,10 +144,10 @@
elif transcript_ids == "all":
# Values added to corresponding bins
Freq_rev, bins_rev = np.histogram(
read_min["Start"], bins=read_length, range=[0, max(read_min["LN"])]
read_min["start"], bins=read_length, range=[0, max(read_min["transcript_length"])]
)
Freq_rev_trim, bins_rev_trim = np.histogram(
read_min["Start"], bins=read_length, range=[0, 20000]
read_min["start"], bins=read_length, range=[0, 20000]
)

# Dataframe created for bins
Expand Down
8 changes: 3 additions & 5 deletions workflow/scripts/get_canonical_ids.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,14 @@ canonical_ids <- get_transcripts_ids %>%
# add columns necessary for valid BED file and sort accordingly, see:
# https://bedtools.readthedocs.io/en/latest/content/general-usage.html
add_column(
start = 0,
name = "",
score = ""
start = 0
) %>%
# Here, we hijack the `name` field of the BED format to encode the strand,
# as biomart gives the strand as `1` vs `-1`, as opposed to BED's `+` vs. `-`.
select(
ensembl_transcript_id_version,
start,
transcript_length,
name,
score,
strand
)

Expand Down
24 changes: 12 additions & 12 deletions workflow/scripts/plot-ind-sample-QC-histogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,32 +33,32 @@
bam_file = pysam.AlignmentFile('results/ind_transcripts/' + sample_name + '-pseudoalignments.sorted.bam',"rb")
bam_header = bam_file.header.to_dict()
get_trans_length = pd.DataFrame(bam_header.get('SQ'))
get_trans_length.rename(columns={'SN': 'Transcript_ID'}, inplace=True)
get_trans_length.rename(columns={'SN': 'transcript'}, inplace=True)

# Aligned text file reading
align_bam_txt = pd.read_csv('results/QC/' + sample_name + '.aligned.txt',
sep="\t",names=["read_Name","Transcript_ID", "Start","reads", "Quality"])
align_bam_txt["Strand"] =align_bam_txt['Transcript_ID'].str.split('_', 1).str[1]
align_bam_txt["Transcript"] =align_bam_txt['Transcript_ID'].str.split('_', 1).str[0]
sep="\t",names=["read_Name","transcript", "start","reads", "quality"])
align_bam_txt["strand"] =align_bam_txt['transcript'].str.split('_', 1).str[1]
align_bam_txt["Transcript"] =align_bam_txt['transcript'].str.split('_', 1).str[0]

# merging aligned bam text file based on transcript id from bam file
merge_align_trans_data = align_bam_txt.merge(get_trans_length, on='Transcript_ID')
merge_align_trans_data = align_bam_txt.merge(get_trans_length, on='transcript')
filtered_transcript_data = merge_align_trans_data.query("Transcript == @transcript_ids")

# Forward strand
if ((filtered_transcript_data['Strand']=='1')).any():
if ((filtered_transcript_data['strand']=='1')).any():
fwrd_flag = 1
filtered_transcript_data[sample_name + '_forward_strand'] = filtered_transcript_data['LN'] - filtered_transcript_data['Start']
filtered_transcript_data[sample_name + '_forward_strand'] = filtered_transcript_data['transcript_length'] - filtered_transcript_data['start']
aligned_reads = filtered_transcript_data.loc[filtered_transcript_data.groupby(['read_Name','reads'])[sample_name + '_forward_strand'].idxmin()]
Freq_fwrd, bins_fwrd = np.histogram(aligned_reads[sample_name + '_forward_strand'], bins = read_length, range=[0,max(aligned_reads['LN'])])
Freq_fwrd, bins_fwrd = np.histogram(aligned_reads[sample_name + '_forward_strand'], bins = read_length, range=[0,max(aligned_reads['transcript_length'])])
hist_fwrd = pd.DataFrame({'sample_Name': sample_name, 'Freq_forward': Freq_fwrd, 'bins_foward': bins_fwrd[:-1]})
fwrd_allsamp_hist = pd.concat([fwrd_allsamp_hist, hist_fwrd])
elif ((filtered_transcript_data['Strand']=='-1')).any():
elif ((filtered_transcript_data['strand']=='-1')).any():
# reverse strand
rev_flag = 1
reverse_strand = filtered_transcript_data.loc[filtered_transcript_data['Strand'] == '-1']
read_min = reverse_strand.loc[reverse_strand.groupby(['read_Name','reads'])['Start'].idxmin()]
Freq_rev, bins_rev = np.histogram(read_min['Start'], bins =read_length, range=[0, max(read_min['LN'])])
reverse_strand = filtered_transcript_data.loc[filtered_transcript_data['strand'] == '-1']
read_min = reverse_strand.loc[reverse_strand.groupby(['read_Name','reads'])['start'].idxmin()]
Freq_rev, bins_rev = np.histogram(read_min['start'], bins =read_length, range=[0, max(read_min['transcript_length'])])
hist_rev = pd.DataFrame({'sample_Name': sample_name, 'Freq_rev': Freq_rev,'bins_rev': bins_rev[:-1]})
rev_allsamp_hist = pd.concat([rev_allsamp_hist, hist_rev])

Expand Down

0 comments on commit abe45d3

Please sign in to comment.