Merge pull request #11 from sanger-bentley-group/feature/het_snps
Feature/het snps
blue-moon22 authored Apr 28, 2022
2 parents 5d902a1 + 4bd1d22 commit 484040e
Showing 16 changed files with 320 additions and 105 deletions.
7 changes: 4 additions & 3 deletions README.md
@@ -58,14 +58,15 @@ Change:
- `/path/to/gbs_qc_reports` to the directory location of the generated reports. (Default is the current directory)

## Output
You should get two tab-delimited output reports `qc_report_summary.tab` and `qc_report_complete.tab` in the `--qc_reports_directory` you specified. `qc_report_summary.tab` gives the `lane_id` and PASS/FAIL `status`. `qc_report_summary.tab` gives all the PASS/FAIL status for each QC.
You should get two tab-delimited output reports `qc_report_summary.tab` and `qc_report_complete.tab` in the `--qc_reports_directory` you specified. `qc_report_summary.tab` gives the `lane_id` and PASS/FAIL `status`. `qc_report_complete.tab` gives all the PASS/FAIL status for each QC.
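For illustration only (these lane IDs and statuses are invented), `qc_report_summary.tab` is expected to look something like:

```
lane_id	status
26077_6#118	PASS
26077_6#119	FAIL
```

`qc_report_complete.tab` carries the same lane IDs plus a value and PASS/FAIL column for each individual check (contigs, GC content, genome length, coverage depth and breadth, HET SNPs).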

### Missing Data
If there are empty values in `qc_report_summary.tab` then at least one QC workflow may have failed. You can look in the `qc_report_complete.tab` to find which one.

If there are empty values for:
- `rel_abundance` then these lanes may not have been imported/imported properly with a kraken report.
- `contig_no` then these lanes may not have been assembled/assembled properly
- `contig_no` or `gc_content` then these lanes may not have been assembled/assembled properly
- `HET_SNPs` then these lanes may not have had variants called

If this is the case contact `[email protected]` for help with this.

@@ -80,7 +81,7 @@ If this is the case contact `[email protected]` for help with this.
--genome_len_higher_threshold Genome length/total number of bases < genome_len_higher_threshold to pass. (Default: 2800000)
--cov_depth_threshold Genome depth of coverage > cov_depth_threshold to pass. (Default: 20)
--cov_breadth_threshold Genome breadth of coverage > cov_breadth_threshold to pass. (Default: 70)
--percentage_het_snps_threshold Percentage of HET SNPs (of total SNPs) < percentage_het_snps_threshold to pass. (Default: 15)
--het_snps_threshold Number of HET SNPs <= het_snps_threshold to pass. (Default: 20)
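As a usage sketch (assuming the pipeline is launched with `nextflow run main.nf` as in the run instructions above; the paths are placeholders), the new threshold can be overridden on the command line:

```bash
nextflow run main.nf \
  --lanes /path/to/lanes.txt \
  --qc_reports_directory /path/to/gbs_qc_reports \
  --het_snps_threshold 20
```

Lanes with more than 20 filtered HET SNPs would then be marked FAIL for that check.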

## The methods
The methods used for finding relative abundance from Kraken, coverage breadth, coverage depth and percentage HET SNPs out of total SNPs are described [here](http://mediawiki.internal.sanger.ac.uk/index.php/Pathogen_Informatics_QC_Pipeline) (Sanger access only).
165 changes: 165 additions & 0 deletions bin/filtervcf_v4.py
@@ -0,0 +1,165 @@
#! /usr/bin/env python2.7


## modified for not using the MGE removal step
## changed the threshold for SNP from 20 to 5

import sys

in1 = sys.argv[1] ##raw vcf file
out1 = sys.argv[2] ##output prefix
#mge = sys.argv[3] ##"MGE_staph.bed file
#cov = sys.argv[3] ##coverage file
min_dist = sys.argv[3] ##the distance parameter to compare snp at distance
#ref = sys.argv[4] ##reference file


d1 = [base.strip() for base in open(in1).readlines() if not base.startswith("#") and "INDEL" not in base]
#mge_coord = [base.strip() for base in open(mge).readlines()]

def checkPV4(x):
    l2 = []
    for each in x:
        if each.startswith("PV4="):
            l2.append(each.split("=")[1])
    return l2

def checkdp4(x):
    l1 = []
    for char in x:
        if char.startswith("DP="):
            l1.append(char.split("=")[1])
        elif char.startswith("DP4="):
            l1.append(char.split("=")[1])
        elif char.startswith("MQ="):
            l1.append(char.split("=")[1])
    return l1

#def mge_filt(x):
# """list of the variants called at each position"""
#
# vmge = {}
# for each in x:
# c1 = each.split("\t")
#
# for char in mge_coord:
# d1 = char.split("\t")
# if int(c1[1]) >= int(d1[0]) and int(c1[1]) <= int(d1[1]):
# vmge[c1[1]] = each
#
# var1 = {base.split("\t")[1]:base for base in x}
# mge_rmd = set(var1) - set(vmge)
# mge_filt = [var1[base] for base in list(mge_rmd)]
# return mge_filt

def dist_filter(z):
    j = 0
    k = 0
    others = []
    clust = {}
    a1 = []
    z1 = sorted(z)

    for i in range(len(z1)-1):
        if z1[i+1] - z1[i] < int(min_dist):
            a1 += [z1[i], z1[i+1]]
            clust["c"+str(j)] = a1
        else:
            others += [z1[i], z1[i+1]]
            j += 1
            a1 = []

    lclust = []
    for each in clust:
        lclust += clust[each]

    x1 = set(others)
    x2 = set(lclust)

    fsnps = sorted(list( x1 - x2 ))

    return fsnps

Hetvar = []
Homvar = []
failed = []

for each in d1:

    a1 = each.split("\t")
    a2 = a1[7].split(";")
    a3 = checkdp4(a2)
    a4 = a1[9].split(":")[-2]
    a5 = checkPV4(a2)
    dp4 = a3[1].split(",")
    if float(a1[5]) >50 and int(a3[0]) >5 and int(a3[-1]) >30 and int(dp4[2]) >2 and int(dp4[3]) > 2:
        p1 = [a1[0], a1[1], a1[3],a1[4],a1[5], a3[0],a3[-1]]+dp4
        l1 = int(dp4[2]) / (float(dp4[2]) + float(dp4[0]))
        r1 = int(dp4[3]) / (float(dp4[3]) + float(dp4[1]))
        if len(a5) >0:
            PV4 = a5[0].split(",")
            if float(PV4[0]) >0.001 and float(PV4[2]) > 0.001 and float(PV4[3]) > 0.001:
                if l1 < 0.90 and r1 < 0.90:
                    t1 = [a1[0], a1[1], a1[3],a1[4],a1[5], a3[0],a3[-1]]+dp4 + ["HET"]
                    Hetvar.append("\t".join(t1))
                if l1 > 0.80 and r1 > 0.80:
                    t3 = [a1[0], a1[1], a1[3], a1[4], a1[5], a3[0], a3[-1]] + dp4 + ["HOMO"]
                    Homvar.append("\t".join(t3))
            else:
                t5 = [a1[0], a1[1], a1[3], a1[4], a1[5], a3[0], a3[-1]] + dp4 +["-"]
                failed.append("\t".join(t5))
        else:
            if l1 < 0.90 and r1 < 0.90:
                t1 = [a1[0], a1[1], a1[3],a1[4],a1[5], a3[0],a3[-1]]+dp4 + ["HET"]
                Hetvar.append("\t".join(t1))
            if l1 > 0.80 and r1 > 0.80:
                t3 = [a1[0], a1[1], a1[3], a1[4], a1[5], a3[0], a3[-1]] + dp4 + ["HOMO"]
                Homvar.append("\t".join(t3))

    else:
        t2 = [a1[0], a1[1], a1[3],a1[4],a1[5], a3[0],a3[-1]] + dp4 + ["-"]
        failed.append("\t".join(t2))


#vfilt1 = mge_filt(fvar)

homfilt = {int(base.split("\t")[1]):base for base in Homvar}
hetfilt = {int(base.split("\t")[1]):base for base in Hetvar}
vdistHom = dist_filter(homfilt.keys())
vdistHet = dist_filter(hetfilt.keys())

homfiltered = [homfilt[int(base)] for base in vdistHom]
hetfiltered = [hetfilt[int(base)] for base in vdistHet]



#c1 = open(cov).read().strip().replace(" ","")
n1 = len(hetfiltered)
n2 = len(homfiltered)

#refSeq = ""
#for line in open(ref).readlines():
# if not line.startswith(">"):
# refSeq += line.strip()

try:
    st1 = format(float(n1)/n2, ".2f")

except ZeroDivisionError as err:
    print(err)
    st1 = 0

finally:
    h1 = ["\t".join("Het_SNPs Total_SNPs Proportion".split(" "))]
    h2 = ["\t".join([str(n1), str(n2), str(st1)])]
    stats = h1+h2

header = "Chrom Pos Ref Alt BaseQ Depth MapQ RF RR AF AR TYPE"
#open(out1+"_failed.vcf","w+").writelines("\n".join(["\t".join(header.split(" "))]+sorted(failed, key = lambda x: int(x.split("\t")[1]))))
open(out1+"_"+min_dist+"stats", "w+").writelines("\n".join(stats))
open(out1+"_"+min_dist+"fhom.vcf","w+").writelines("\n".join(["\t".join(header.split(" "))]+sorted(homfiltered, key = lambda x: int(x.split("\t")[1]))))
open(out1+"_"+min_dist+"fhet.vcf","w+").writelines("\n".join(["\t".join(header.split(" "))]+sorted(hetfiltered, key = lambda x: int(x.split("\t")[1]))))


##for hom snp#'TYPE!="snp" || FORMAT/DP<20 || DP4[2]+DP4[3]<4 || DP4[2]<2 || DP4[3]<2 || DP4[2]/(DP4[2]+DP4[0])<0.75 || DP4[3]/(DP4[3]+DP4[1])<0.75 || QUAL<50 || INFO/MQ<30'
##for het snp#~/Programs/bcftools-1.3.1/bcftools filter -e 'TYPE!="snp" || FORMAT/DP<20 || DP4[2]<2 || DP4[3]<2 || DP4[2]/(DP4[2]+DP4[0])>0.90 || DP4[3]/(DP4[3]+DP4[1])>0.90 ||DP4[0]<2||DP4[1]<2|| QUAL<50 ||SP <20|| INFO/MQ<30' -o het2_filtered.vcf Test10A1L5/samtools.vcf
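A hedged invocation sketch for this script, read off the `sys.argv` handling above (the VCF name, output prefix and distance value below are made up):

```bash
# positional args: raw VCF, output prefix, minimum distance between neighbouring SNPs
python2.7 filtervcf_v4.py 26077_6#118.vcf 26077_6#118 50

# expected outputs for prefix "26077_6#118" and distance "50":
#   26077_6#118_50stats     Het_SNPs, Total_SNPs and Proportion summary line
#   26077_6#118_50fhom.vcf  distance-filtered homozygous SNPs
#   26077_6#118_50fhet.vcf  distance-filtered heterozygous SNPs
```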
40 changes: 22 additions & 18 deletions bin/get_percentage_het_snps.py → bin/get_het_snps.py
@@ -1,6 +1,7 @@
#!/usr/bin/env python3
import argparse
import sys
import os
import re
import pandas as pd

@@ -9,30 +10,33 @@
from lib.get_headers import read_header_json
from lib.get_items import get_items

def get_percentage_het_snps(qc_stats):
def get_het_snps(data_dir):

    lane_ids_perc_het_snps = defaultdict(lambda: None)
    lane_ids_het_snps = defaultdict(lambda: None)

    with open(qc_stats, 'r') as stats:
        next(stats)
        for line in stats:
            lane_ids_perc_het_snps[line.split(',')[2]] = float(line.split(',')[26])
    for file in os.listdir(data_dir):
        if file.endswith("stats"):
            with open(f'{data_dir}/{file}', 'r') as stats:
                next(stats)
                for line in stats:
                    lane_id = file.rsplit('_', 1)[0]
                    lane_ids_het_snps[lane_id] = int(line.split('\t')[0])

    return lane_ids_perc_het_snps
    return lane_ids_het_snps


def write_qc_status(lane_ids, lane_ids_perc_het_snps, threshold, headers, output_file):
def write_qc_status(lane_ids, lane_ids_het_snps, threshold, headers, output_file):

    headers.insert(0, 'lane_id')
    df = pd.DataFrame(columns=headers, index = [0])

    for ind, lane_id in enumerate(lane_ids):
        perc_het_snps = lane_ids_perc_het_snps[lane_id]
        het_snps = lane_ids_het_snps[lane_id]
        df.loc[ind, headers[0]] = lane_id
        df.loc[ind, headers[1]] = perc_het_snps
        df.loc[ind, headers[1]] = het_snps

        if perc_het_snps is not None:
            if perc_het_snps < threshold:
        if het_snps is not None:
            if het_snps <= threshold:
                df.loc[ind, headers[2]] = "PASS"
            else:
                df.loc[ind, headers[2]] = "FAIL"
@@ -41,13 +45,13 @@ def write_qc_status(lane_ids, lane_ids_perc_het_snps, threshold, headers, output


def get_arguments():
    parser = argparse.ArgumentParser(description="Get percentage HET SNPs (of total SNPs) from Pathfind QC stats.")
    parser = argparse.ArgumentParser(description="Get number of HET SNPs from Pathfind snp.")
    parser.add_argument("-l", "--lane_ids", required=True, type=str,
                        help="Text file of lane ids.")
    parser.add_argument("-q", "--qc_stats", required=True, type=str,
                        help="Pathfind QC stats file.")
    parser.add_argument("-d", "--data_dir", required=True, type=str,
                        help="Data directory.")
    parser.add_argument("-t", "--threshold", required=True, type=int,
                        help="QC PASS/FAIL threshold for percentage HET SNPs i.e. if the percentage HET SNPs is LESS THAN threshold, then PASS, otherwise FAIL.")
                        help="QC PASS/FAIL threshold for HET SNPs i.e. if HET SNPs is LESS THAN OR EQUAL TO threshold, then PASS, otherwise FAIL.")
    parser.add_argument("-j", "--headers", required=True, type=str,
                        help="JSON of headers for QC report.")
    parser.add_argument("-o", "--output_file", required=True, type=str,
@@ -64,10 +68,10 @@ def main(args):
    header_dict = read_header_json(args.headers)

    # Get number of contigs
    lane_ids_perc_het_snps = get_percentage_het_snps(args.qc_stats)
    lane_ids_het_snps = get_het_snps(args.data_dir)

    # Write lane id, number of contigs and PASS/FAIL status in tab file
    write_qc_status(lane_ids, lane_ids_perc_het_snps, args.threshold, header_dict["percentage_het_snps"], args.output_file)
    write_qc_status(lane_ids, lane_ids_het_snps, args.threshold, header_dict["het_snps"], args.output_file)


if __name__ == '__main__':
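A minimal sketch of running the renamed script outside Nextflow, assuming the data directory holds per-lane stats files named like `26077_6#118_50stats` (as written by `filtervcf_v4.py`; the lane ID is recovered by splitting the filename on its last underscore) and using the default threshold from the README; file names here are placeholders:

```bash
get_het_snps.py \
  --lane_ids lanes.txt \
  --data_dir . \
  --threshold 20 \
  --headers headers.json \
  --output_file het_snps.tab
```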
6 changes: 3 additions & 3 deletions headers.json
@@ -23,8 +23,8 @@
"cov_breadth",
"cov_breadth_status"
],
"percentage_het_snps": [
"%_HET_SNPs",
"%_HET_SNPs_status"
"het_snps": [
"HET_SNPs",
"HET_SNPs_status"
]
}
11 changes: 7 additions & 4 deletions main.nf
@@ -16,7 +16,8 @@ include {genome_length} from './modules/genome_length.nf'
include {get_qc_stats_from_pf} from './modules/get_qc_stats_from_pf.nf'
include {depth_of_coverage} from './modules/depth_of_coverage.nf'
include {breadth_of_coverage} from './modules/breadth_of_coverage.nf'
include {percentage_HET_SNPs} from './modules/percentage_HET_SNPs.nf'
include {get_proportion_HET_SNPs} from './modules/get_proportion_HET_SNPs.nf'
include {HET_SNPs} from './modules/HET_SNPs.nf'

// Workflow for reads QC
workflow reads_qc {
@@ -38,6 +39,7 @@ workflow assemblies_qc {
take:
file_dest_ch
qc_stats_ch
het_stats_ch
headers_ch
lanes_ch

@@ -47,14 +49,14 @@
genome_length(file_dest_ch, headers_ch, lanes_ch)
depth_of_coverage(qc_stats_ch, headers_ch, lanes_ch)
breadth_of_coverage(qc_stats_ch, headers_ch, lanes_ch)
percentage_HET_SNPs(qc_stats_ch, headers_ch, lanes_ch)
HET_SNPs(het_stats_ch, headers_ch, lanes_ch)

number_of_contigs.out
.combine(contig_gc_content.out)
.combine(genome_length.out)
.combine(depth_of_coverage.out)
.combine(breadth_of_coverage.out)
.combine(percentage_HET_SNPs.out)
.combine(HET_SNPs.out)
.set { qc_report }

emit:
@@ -71,6 +73,7 @@ workflow {
lanes_ch = Channel.fromPath( params.lanes, checkIfExists: true )
get_file_destinations(lanes_ch)
get_qc_stats_from_pf(lanes_ch)
get_proportion_HET_SNPs(lanes_ch)

} else {
println("Error: You must specify a text file of lanes as --lanes <file with list of lanes>")
@@ -93,7 +96,7 @@ workflow {
reads_qc(get_file_destinations.out, headers_ch, lanes_ch)

// Run assembly QC
assemblies_qc(get_file_destinations.out, get_qc_stats_from_pf.out, headers_ch, lanes_ch)
assemblies_qc(get_file_destinations.out, get_qc_stats_from_pf.out, get_proportion_HET_SNPs.out, headers_ch, lanes_ch)

// Collate QC reports
collate_qc_data(reads_qc.out.qc_report, assemblies_qc.out.qc_report)
16 changes: 9 additions & 7 deletions modules/percentage_HET_SNPs.nf → modules/HET_SNPs.nf
@@ -1,7 +1,7 @@
process percentage_HET_SNPs {
process HET_SNPs {

input:
path qc_stats
path het_stats
path headers
path lanes_file

@@ -10,16 +10,18 @@ process percentage_HET_SNPs {

script:
python_version = params.python_version
percentage_het_snps_threshold = params.percentage_het_snps_threshold
output_file = "percentage_het_snps.tab"
het_snps_threshold = params.het_snps_threshold
output_file = "het_snps.tab"

"""
tar -xf ${het_stats}
module load ISG/python/${python_version}
get_percentage_het_snps.py \
get_het_snps.py \
--lane_ids ${lanes_file} \
--qc_stats ${qc_stats} \
--threshold ${percentage_het_snps_threshold} \
--data_dir \$(pwd) \
--threshold ${het_snps_threshold} \
--headers ${headers} \
--output_file ${output_file}
"""
4 changes: 2 additions & 2 deletions modules/collate_qc_data.nf
@@ -2,7 +2,7 @@ process collate_qc_data {

input:
path read_qc_report
tuple file(number_of_contigs), file(contig_gc_content), file(genome_length), file(depth_of_coverage), file(breadth_of_coverage), file(percentage_het_snps)
tuple file(number_of_contigs), file(contig_gc_content), file(genome_length), file(depth_of_coverage), file(breadth_of_coverage), file(het_snps)

output:
path("qc_report_complete.tab"), emit: complete
@@ -15,7 +15,7 @@
module load ISG/python/${python_version}
collate_qc_data.py \
--qc_reports ${read_qc_report} ${number_of_contigs} ${contig_gc_content} ${genome_length} ${depth_of_coverage} ${breadth_of_coverage} ${percentage_het_snps} \
--qc_reports ${read_qc_report} ${number_of_contigs} ${contig_gc_content} ${genome_length} ${depth_of_coverage} ${breadth_of_coverage} ${het_snps} \
--output_prefix "qc_report"
"""
}