Merge pull request #11 from sanger-bentley-group/feature/het_snps
Feature/het snps
blue-moon22 authored Apr 28, 2022
2 parents 5d902a1 + 4bd1d22 commit 484040e
Showing 16 changed files with 320 additions and 105 deletions.
7 changes: 4 additions & 3 deletions README.md
@@ -58,14 +58,15 @@ Change:
- `/path/to/gbs_qc_reports` to the directory location of the generated reports. (Default is the current directory)

## Output
You should get two tab-delimited output reports `qc_report_summary.tab` and `qc_report_complete.tab` in the `--qc_reports_directory` you specified. `qc_report_summary.tab` gives the `lane_id` and PASS/FAIL `status`. `qc_report_summary.tab` gives all the PASS/FAIL status for each QC.
You should get two tab-delimited output reports `qc_report_summary.tab` and `qc_report_complete.tab` in the `--qc_reports_directory` you specified. `qc_report_summary.tab` gives the `lane_id` and PASS/FAIL `status`. `qc_report_complete.tab` gives all the PASS/FAIL status for each QC.
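For illustration only (these lane IDs and statuses are invented), `qc_report_summary.tab` is expected to look something like:

```
lane_id	status
26077_6#118	PASS
26077_6#119	FAIL
```

`qc_report_complete.tab` carries the same lane IDs plus a value and PASS/FAIL column for each individual check (contigs, GC content, genome length, coverage depth and breadth, HET SNPs).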

### Missing Data
If there are empty values in `qc_report_summary.tab` then at least one QC workflow may have failed. You can look in the `qc_report_complete.tab` to find which one.

If there are empty values for:
- `rel_abundance` then these lanes may not have been imported/imported properly with a kraken report.
- `contig_no` then these lanes may not have been assembled/assembled properly
- `contig_no` or `gc_content` then these lanes may not have been assembled/assembled properly
- `HET_SNPs` then these lanes may not have had variants called

If this is the case contact `[email protected]` for help with this.

@@ -80,7 +81,7 @@ If this is the case contact `[email protected]` for help with this.
--genome_len_higher_threshold Genome length/total number of bases < genome_len_higher_threshold to pass. (Default: 2800000)
--cov_depth_threshold Genome depth of coverage > cov_depth_threshold to pass. (Default: 20)
--cov_breadth_threshold Genome breadth of coverage > cov_breadth_threshold to pass. (Default: 70)
--percentage_het_snps_threshold Percentage of HET SNPs (of total SNPs) < percentage_het_snps_threshold to pass. (Default: 15)
--het_snps_threshold Number of HET SNPs <= het_snps_threshold to pass. (Default: 20)
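As a usage sketch (assuming the pipeline is launched with `nextflow run main.nf` as in the run instructions above; the paths are placeholders), the new threshold can be overridden on the command line:

```bash
nextflow run main.nf \
  --lanes /path/to/lanes.txt \
  --qc_reports_directory /path/to/gbs_qc_reports \
  --het_snps_threshold 20
```

Lanes with more than 20 filtered HET SNPs would then be marked FAIL for that check.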

## The methods
The methods used for finding relative abundance from Kraken, coverage breadth, coverage depth and percentage HET SNPs out of total SNPs are described [here](http://mediawiki.internal.sanger.ac.uk/index.php/Pathogen_Informatics_QC_Pipeline) (Sanger access only).
165 changes: 165 additions & 0 deletions bin/filtervcf_v4.py
@@ -0,0 +1,165 @@
#! /usr/bin/env python2.7


## modified for not using the MGE removal step
## changed the threshold for SNP from 20 to 5

import sys

in1 = sys.argv[1] ##raw vcf file
out1 = sys.argv[2] ##output prefix
#mge = sys.argv[3] ##"MGE_staph.bed file
#cov = sys.argv[3] ##coverage file
min_dist = sys.argv[3] ##the distance parameter to compare snp at distance
#ref = sys.argv[4] ##reference file


d1 = [base.strip() for base in open(in1).readlines() if not base.startswith("#") and "INDEL" not in base]
#mge_coord = [base.strip() for base in open(mge).readlines()]

def checkPV4(x):
    l2 = []
    for each in x:
        if each.startswith("PV4="):
            l2.append(each.split("=")[1])
    return l2

def checkdp4(x):
    l1 = []
    for char in x:
        if char.startswith("DP="):
            l1.append(char.split("=")[1])
        elif char.startswith("DP4="):
            l1.append(char.split("=")[1])
        elif char.startswith("MQ="):
            l1.append(char.split("=")[1])
    return l1

#def mge_filt(x):
# """list of the variants called at each position"""
#
# vmge = {}
# for each in x:
# c1 = each.split("\t")
#
# for char in mge_coord:
# d1 = char.split("\t")
# if int(c1[1]) >= int(d1[0]) and int(c1[1]) <= int(d1[1]):
# vmge[c1[1]] = each
#
# var1 = {base.split("\t")[1]:base for base in x}
# mge_rmd = set(var1) - set(vmge)
# mge_filt = [var1[base] for base in list(mge_rmd)]
# return mge_filt

def dist_filter(z):
    j = 0
    k = 0
    others = []
    clust = {}
    a1 = []
    z1 = sorted(z)

    for i in range(len(z1)-1):
        if z1[i+1] - z1[i] < int(min_dist):
            a1 += [z1[i], z1[i+1]]
            clust["c"+str(j)] = a1
        else:
            others += [z1[i], z1[i+1]]
            j += 1
            a1 = []

    lclust = []
    for each in clust:
        lclust += clust[each]

    x1 = set(others)
    x2 = set(lclust)

    fsnps = sorted(list( x1 - x2 ))

    return fsnps

Hetvar = []
Homvar = []
failed = []

for each in d1:

    a1 = each.split("\t")
    a2 = a1[7].split(";")
    a3 = checkdp4(a2)
    a4 = a1[9].split(":")[-2]
    a5 = checkPV4(a2)
    dp4 = a3[1].split(",")
    if float(a1[5]) >50 and int(a3[0]) >5 and int(a3[-1]) >30 and int(dp4[2]) >2 and int(dp4[3]) > 2:
        p1 = [a1[0], a1[1], a1[3],a1[4],a1[5], a3[0],a3[-1]]+dp4
        l1 = int(dp4[2]) / (float(dp4[2]) + float(dp4[0]))
        r1 = int(dp4[3]) / (float(dp4[3]) + float(dp4[1]))
        if len(a5) >0:
            PV4 = a5[0].split(",")
            if float(PV4[0]) >0.001 and float(PV4[2]) > 0.001 and float(PV4[3]) > 0.001:
                if l1 < 0.90 and r1 < 0.90:
                    t1 = [a1[0], a1[1], a1[3],a1[4],a1[5], a3[0],a3[-1]]+dp4 + ["HET"]
                    Hetvar.append("\t".join(t1))
                if l1 > 0.80 and r1 > 0.80:
                    t3 = [a1[0], a1[1], a1[3], a1[4], a1[5], a3[0], a3[-1]] + dp4 + ["HOMO"]
                    Homvar.append("\t".join(t3))
            else:
                t5 = [a1[0], a1[1], a1[3], a1[4], a1[5], a3[0], a3[-1]] + dp4 +["-"]
                failed.append("\t".join(t5))
        else:
            if l1 < 0.90 and r1 < 0.90:
                t1 = [a1[0], a1[1], a1[3],a1[4],a1[5], a3[0],a3[-1]]+dp4 + ["HET"]
                Hetvar.append("\t".join(t1))
            if l1 > 0.80 and r1 > 0.80:
                t3 = [a1[0], a1[1], a1[3], a1[4], a1[5], a3[0], a3[-1]] + dp4 + ["HOMO"]
                Homvar.append("\t".join(t3))

    else:
        t2 = [a1[0], a1[1], a1[3],a1[4],a1[5], a3[0],a3[-1]] + dp4 + ["-"]
        failed.append("\t".join(t2))


#vfilt1 = mge_filt(fvar)

homfilt = {int(base.split("\t")[1]):base for base in Homvar}
hetfilt = {int(base.split("\t")[1]):base for base in Hetvar}
vdistHom = dist_filter(homfilt.keys())
vdistHet = dist_filter(hetfilt.keys())

homfiltered = [homfilt[int(base)] for base in vdistHom]
hetfiltered = [hetfilt[int(base)] for base in vdistHet]



#c1 = open(cov).read().strip().replace(" ","")
n1 = len(hetfiltered)
n2 = len(homfiltered)

#refSeq = ""
#for line in open(ref).readlines():
# if not line.startswith(">"):
# refSeq += line.strip()

try:
    st1 = format(float(n1)/n2, ".2f")

except ZeroDivisionError as err:
    print(err)
    st1 = 0

finally:
    h1 = ["\t".join("Het_SNPs Total_SNPs Proportion".split(" "))]
    h2 = ["\t".join([str(n1), str(n2), str(st1)])]
    stats = h1+h2

header = "Chrom Pos Ref Alt BaseQ Depth MapQ RF RR AF AR TYPE"
#open(out1+"_failed.vcf","w+").writelines("\n".join(["\t".join(header.split(" "))]+sorted(failed, key = lambda x: int(x.split("\t")[1]))))
open(out1+"_"+min_dist+"stats", "w+").writelines("\n".join(stats))
open(out1+"_"+min_dist+"fhom.vcf","w+").writelines("\n".join(["\t".join(header.split(" "))]+sorted(homfiltered, key = lambda x: int(x.split("\t")[1]))))
open(out1+"_"+min_dist+"fhet.vcf","w+").writelines("\n".join(["\t".join(header.split(" "))]+sorted(hetfiltered, key = lambda x: int(x.split("\t")[1]))))


##for hom snp#'TYPE!="snp" || FORMAT/DP<20 || DP4[2]+DP4[3]<4 || DP4[2]<2 || DP4[3]<2 || DP4[2]/(DP4[2]+DP4[0])<0.75 || DP4[3]/(DP4[3]+DP4[1])<0.75 || QUAL<50 || INFO/MQ<30'
##for het snp#~/Programs/bcftools-1.3.1/bcftools filter -e 'TYPE!="snp" || FORMAT/DP<20 || DP4[2]<2 || DP4[3]<2 || DP4[2]/(DP4[2]+DP4[0])>0.90 || DP4[3]/(DP4[3]+DP4[1])>0.90 ||DP4[0]<2||DP4[1]<2|| QUAL<50 ||SP <20|| INFO/MQ<30' -o het2_filtered.vcf Test10A1L5/samtools.vcf
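A hedged invocation sketch for this script, read off the `sys.argv` handling above (the VCF name, output prefix and distance value below are made up):

```bash
# positional args: raw VCF, output prefix, minimum distance between neighbouring SNPs
python2.7 filtervcf_v4.py 26077_6#118.vcf 26077_6#118 50

# expected outputs for prefix "26077_6#118" and distance "50":
#   26077_6#118_50stats     Het_SNPs, Total_SNPs and Proportion summary line
#   26077_6#118_50fhom.vcf  distance-filtered homozygous SNPs
#   26077_6#118_50fhet.vcf  distance-filtered heterozygous SNPs
```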
40 changes: 22 additions & 18 deletions bin/get_percentage_het_snps.py → bin/get_het_snps.py
@@ -1,6 +1,7 @@
#!/usr/bin/env python3
import argparse
import sys
import os
import re
import pandas as pd

@@ -9,30 +10,33 @@
from lib.get_headers import read_header_json
from lib.get_items import get_items

def get_percentage_het_snps(qc_stats):
def get_het_snps(data_dir):

    lane_ids_perc_het_snps = defaultdict(lambda: None)
    lane_ids_het_snps = defaultdict(lambda: None)

    with open(qc_stats, 'r') as stats:
        next(stats)
        for line in stats:
            lane_ids_perc_het_snps[line.split(',')[2]] = float(line.split(',')[26])
    for file in os.listdir(data_dir):
        if file.endswith("stats"):
            with open(f'{data_dir}/{file}', 'r') as stats:
                next(stats)
                for line in stats:
                    lane_id = file.rsplit('_', 1)[0]
                    lane_ids_het_snps[lane_id] = int(line.split('\t')[0])

    return lane_ids_perc_het_snps
    return lane_ids_het_snps


def write_qc_status(lane_ids, lane_ids_perc_het_snps, threshold, headers, output_file):
def write_qc_status(lane_ids, lane_ids_het_snps, threshold, headers, output_file):

    headers.insert(0, 'lane_id')
    df = pd.DataFrame(columns=headers, index = [0])

    for ind, lane_id in enumerate(lane_ids):
        perc_het_snps = lane_ids_perc_het_snps[lane_id]
        het_snps = lane_ids_het_snps[lane_id]
        df.loc[ind, headers[0]] = lane_id
        df.loc[ind, headers[1]] = perc_het_snps
        df.loc[ind, headers[1]] = het_snps

        if perc_het_snps is not None:
            if perc_het_snps < threshold:
        if het_snps is not None:
            if het_snps <= threshold:
                df.loc[ind, headers[2]] = "PASS"
            else:
                df.loc[ind, headers[2]] = "FAIL"
@@ -41,13 +45,13 @@ def write_qc_status(lane_ids, lane_ids_perc_het_snps, threshold, headers, output


def get_arguments():
    parser = argparse.ArgumentParser(description="Get percentage HET SNPs (of total SNPs) from Pathfind QC stats.")
    parser = argparse.ArgumentParser(description="Get number of HET SNPs from Pathfind snp.")
    parser.add_argument("-l", "--lane_ids", required=True, type=str,
                        help="Text file of lane ids.")
    parser.add_argument("-q", "--qc_stats", required=True, type=str,
                        help="Pathfind QC stats file.")
    parser.add_argument("-d", "--data_dir", required=True, type=str,
                        help="Data directory.")
    parser.add_argument("-t", "--threshold", required=True, type=int,
                        help="QC PASS/FAIL threshold for percentage HET SNPs i.e. if the percentage HET SNPs is LESS THAN threshold, then PASS, otherwise FAIL.")
                        help="QC PASS/FAIL threshold for HET SNPs i.e. if HET SNPs is LESS THAN OR EQUAL TO threshold, then PASS, otherwise FAIL.")
    parser.add_argument("-j", "--headers", required=True, type=str,
                        help="JSON of headers for QC report.")
    parser.add_argument("-o", "--output_file", required=True, type=str,
@@ -64,10 +68,10 @@ def main(args):
    header_dict = read_header_json(args.headers)

    # Get number of contigs
    lane_ids_perc_het_snps = get_percentage_het_snps(args.qc_stats)
    lane_ids_het_snps = get_het_snps(args.data_dir)

    # Write lane id, number of contigs and PASS/FAIL status in tab file
    write_qc_status(lane_ids, lane_ids_perc_het_snps, args.threshold, header_dict["percentage_het_snps"], args.output_file)
    write_qc_status(lane_ids, lane_ids_het_snps, args.threshold, header_dict["het_snps"], args.output_file)


if __name__ == '__main__':
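A minimal sketch of running the renamed script outside Nextflow, assuming the data directory holds per-lane stats files named like `26077_6#118_50stats` (as written by `filtervcf_v4.py`; the lane ID is recovered by splitting the filename on its last underscore) and using the default threshold from the README; file names here are placeholders:

```bash
get_het_snps.py \
  --lane_ids lanes.txt \
  --data_dir . \
  --threshold 20 \
  --headers headers.json \
  --output_file het_snps.tab
```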
6 changes: 3 additions & 3 deletions headers.json
@@ -23,8 +23,8 @@
"cov_breadth",
"cov_breadth_status"
],
"percentage_het_snps": [
"%_HET_SNPs",
"%_HET_SNPs_status"
"het_snps": [
"HET_SNPs",
"HET_SNPs_status"
]
}
11 changes: 7 additions & 4 deletions main.nf
@@ -16,7 +16,8 @@ include {genome_length} from './modules/genome_length.nf'
include {get_qc_stats_from_pf} from './modules/get_qc_stats_from_pf.nf'
include {depth_of_coverage} from './modules/depth_of_coverage.nf'
include {breadth_of_coverage} from './modules/breadth_of_coverage.nf'
include {percentage_HET_SNPs} from './modules/percentage_HET_SNPs.nf'
include {get_proportion_HET_SNPs} from './modules/get_proportion_HET_SNPs.nf'
include {HET_SNPs} from './modules/HET_SNPs.nf'

// Workflow for reads QC
workflow reads_qc {
@@ -38,6 +39,7 @@ workflow assemblies_qc {
take:
file_dest_ch
qc_stats_ch
het_stats_ch
headers_ch
lanes_ch

@@ -47,14 +49,14 @@
genome_length(file_dest_ch, headers_ch, lanes_ch)
depth_of_coverage(qc_stats_ch, headers_ch, lanes_ch)
breadth_of_coverage(qc_stats_ch, headers_ch, lanes_ch)
percentage_HET_SNPs(qc_stats_ch, headers_ch, lanes_ch)
HET_SNPs(het_stats_ch, headers_ch, lanes_ch)

number_of_contigs.out
.combine(contig_gc_content.out)
.combine(genome_length.out)
.combine(depth_of_coverage.out)
.combine(breadth_of_coverage.out)
.combine(percentage_HET_SNPs.out)
.combine(HET_SNPs.out)
.set { qc_report }

emit:
@@ -71,6 +73,7 @@ workflow {
lanes_ch = Channel.fromPath( params.lanes, checkIfExists: true )
get_file_destinations(lanes_ch)
get_qc_stats_from_pf(lanes_ch)
get_proportion_HET_SNPs(lanes_ch)

} else {
println("Error: You must specify a text file of lanes as --lanes <file with list of lanes>")
@@ -93,7 +96,7 @@ workflow {
reads_qc(get_file_destinations.out, headers_ch, lanes_ch)

// Run assembly QC
assemblies_qc(get_file_destinations.out, get_qc_stats_from_pf.out, headers_ch, lanes_ch)
assemblies_qc(get_file_destinations.out, get_qc_stats_from_pf.out, get_proportion_HET_SNPs.out, headers_ch, lanes_ch)

// Collate QC reports
collate_qc_data(reads_qc.out.qc_report, assemblies_qc.out.qc_report)
16 changes: 9 additions & 7 deletions modules/percentage_HET_SNPs.nf → modules/HET_SNPs.nf
@@ -1,7 +1,7 @@
process percentage_HET_SNPs {
process HET_SNPs {

input:
path qc_stats
path het_stats
path headers
path lanes_file

@@ -10,16 +10,18 @@ process percentage_HET_SNPs {

script:
python_version = params.python_version
percentage_het_snps_threshold = params.percentage_het_snps_threshold
output_file = "percentage_het_snps.tab"
het_snps_threshold = params.het_snps_threshold
output_file = "het_snps.tab"

"""
tar -xf ${het_stats}
module load ISG/python/${python_version}
get_percentage_het_snps.py \
get_het_snps.py \
--lane_ids ${lanes_file} \
--qc_stats ${qc_stats} \
--threshold ${percentage_het_snps_threshold} \
--data_dir \$(pwd) \
--threshold ${het_snps_threshold} \
--headers ${headers} \
--output_file ${output_file}
"""
4 changes: 2 additions & 2 deletions modules/collate_qc_data.nf
@@ -2,7 +2,7 @@ process collate_qc_data {

input:
path read_qc_report
tuple file(number_of_contigs), file(contig_gc_content), file(genome_length), file(depth_of_coverage), file(breadth_of_coverage), file(percentage_het_snps)
tuple file(number_of_contigs), file(contig_gc_content), file(genome_length), file(depth_of_coverage), file(breadth_of_coverage), file(het_snps)

output:
path("qc_report_complete.tab"), emit: complete
@@ -15,7 +15,7 @@
module load ISG/python/${python_version}
collate_qc_data.py \
--qc_reports ${read_qc_report} ${number_of_contigs} ${contig_gc_content} ${genome_length} ${depth_of_coverage} ${breadth_of_coverage} ${percentage_het_snps} \
--qc_reports ${read_qc_report} ${number_of_contigs} ${contig_gc_content} ${genome_length} ${depth_of_coverage} ${breadth_of_coverage} ${het_snps} \
--output_prefix "qc_report"
"""
}