From c1921ac769aba0aa434d3b168e05eae4163ec895 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Wed, 27 Sep 2023 13:09:46 -0400
Subject: [PATCH] Update vcf format and work on PFB

---
 include/fasta_query.h     |   3 +
 python/cnv_plots.py       | 380 ++++++++++++++++++--------------------
 python/pfb_from_gnomad.py |  39 ++++
 python/train_model.py     |  22 +--
 python/utils.py           |  34 +++-
 src/fasta_query.cpp       |  32 ++++
 src/sv_data.cpp           |  62 ++++++-
 7 files changed, 354 insertions(+), 218 deletions(-)
 create mode 100644 python/pfb_from_gnomad.py

diff --git a/include/fasta_query.h b/include/fasta_query.h
index 5ed70c62..0b237c44 100644
--- a/include/fasta_query.h
+++ b/include/fasta_query.h
@@ -17,6 +17,9 @@ class FASTAQuery {
         int setFilepath(std::string fasta_filepath);
         std::string getFilepath();
         std::string query(std::string chr, int pos_start, int pos_end);
+
+        // Get the chromosome contig lengths in VCF header format
+        std::string getContigHeader();
 };
 
 #endif // FASTA_QUERY_H
diff --git a/python/cnv_plots.py b/python/cnv_plots.py
index 9456d707..90b2fad4 100644
--- a/python/cnv_plots.py
+++ b/python/cnv_plots.py
@@ -1,4 +1,4 @@
-# Plot the copy number variants and their log2_ratio, BAF values.
+"""Plot the copy number variants and their log2_ratio, BAF values."""
 
 import os
 import sys
@@ -7,7 +7,7 @@
 from plotly.subplots import make_subplots
 import pandas as pd
 
-from .utils import parse_region
+from .utils import parse_region, get_info_field_column, get_info_field_value
 
 # Set up logging.
 log.basicConfig(
@@ -18,31 +18,6 @@
     ]
 )
 
-def get_info_field_value(info_field, field_name):
-    """
-    Get the value of a field in the INFO field of a VCF file.
-
-    Args:
-        info_field (str): The INFO field.
-        field_name (str): The name of the field to get the value of.
-
-    Returns:
-        str: The value of the field.
-    """
-
-    # Split the INFO field into its parts.
-    info_field_parts = info_field.split(";")
-
-    # Get the field value.
-    field_value = ""
-    for info_field_part in info_field_parts:
-        if info_field_part.startswith("{}=".format(field_name)):
-            field_value = info_field_part.split("=")[1]
-            break
-
-    # Return the field value.
-    return field_value
-
 def run(vcf_file, cnv_data_file, output_path, region):
     """
     Saves a plot of the CNVs and their log2 ratio and B-allele frequency
@@ -66,9 +41,9 @@ def run(vcf_file, cnv_data_file, output_path, region):
     chromosome, start_position, end_position = parse_region(region)
 
     if start_position is not None and end_position is not None:
-        log.info(f"Plotting CNVs in region %s:%d-%d.", chromosome, start_position, end_position)
+        log.info("Plotting CNVs in region %s:%d-%d.", chromosome, start_position, end_position)
     else:
-        log.info(f"Plotting CNVs in region %s.", chromosome)
+        log.info("Plotting CNVs in region %s.", chromosome)
 
     # Set up the CNV type string list for the 6 CNV states.
     cnv_types = ["NAN", "DEL", "DEL", "NEUT", "NEUT", "DUP", "DUP"]
@@ -95,9 +70,9 @@ def run(vcf_file, cnv_data_file, output_path, region):
 
     # Create an output html file where we will append the CNV plots.
     if start_position is not None and end_position is not None:
-        html_filename = "cnv_plots_{}_{}_{}.html".format(chromosome, start_position, end_position)
+        html_filename = f"cnv_plots_{chromosome}_{start_position}_{end_position}.html"
     else:
-        html_filename = "cnv_plots_{}.html".format(chromosome)
+        html_filename = f"cnv_plots_{chromosome}.html"
     
     # Create the output directory if it doesn't exist.
     if not os.path.exists(output_path):
@@ -108,177 +83,176 @@ def run(vcf_file, cnv_data_file, output_path, region):
     if os.path.exists(output_html_filepath):
         os.remove(output_html_filepath)
 
-    output_html_file = open(output_html_filepath, "w", encoding="utf-8")
-
-    #output_html_file = open(os.path.join(output_path, "cnv_plots.html"), "w")
-
-    # Loop through the VCF data and plot each CNV (DEL or DUP) along with log2
-    # ratio and BAF values for the SNPs in the CNV.
-    cnv_count = 0
-    for _, row in vcf_data.iterrows():
-
-        # Get the INFO field.
-        info_field = row[7]
-
-        # Get the SVTYPE field value.
-        svtype = get_info_field_value(info_field, "SVTYPE")
-
-        # Analyze the CNV if it is a DEL or DUP.
-        if svtype in ("DEL", "DUP"):
-
-            # Get the read depth (DP) value.
-            read_depth = int(get_info_field_value(info_field, "DP"))
-
-            # Get the start and end positions.
-            start_position = int(row[1])
-            end_position = int(get_info_field_value(info_field, "END"))
-
-            # Get the chromosome.
-            chromosome = row[0]
-
-            # Get the length of the CNV.
-            cnv_length = end_position - start_position + 1
-
-            # Get the plot range.
-            plot_start_position = start_position - (plot_range * cnv_length)
-            plot_end_position = end_position + (plot_range * cnv_length)
-
-            # Get the CNV state, log2 ratio, and BAF values for all SNPs in the
-            # plot range.
-            sv_data = cnv_data[(cnv_data["position"] >= plot_start_position) & (cnv_data["position"] <= plot_end_position)]
-            
-            # Get the marker colors for the state sequence.
-            marker_colors = []
-            for state in sv_data["cnv_state"]:
-                if state in [1, 2]:
-                    marker_colors.append("red")
-                elif state in [3, 4]:
-                    marker_colors.append("black")
-                elif state in [5, 6]:
-                    marker_colors.append("blue")
-
-            # Get the hover text for the state sequence markers.
-            hover_text = []
-            for _, row in sv_data.iterrows():
-                hover_text.append(f"TYPE: {cnv_types[row['cnv_state']]}<br>CHR: {row['chromosome']}<br>POS: {row['position']}<br>L2R: {row['log2_ratio']}<br>BAF: {row['b_allele_freq']}")
-
-            # Create the log2 ratio trace.
-            log2_ratio_trace = plotly.graph_objs.Scatter(
-                x = sv_data["position"],
-                y = sv_data["log2_ratio"],
-                mode = "markers+lines",
-                name = r'Log<sub>2</sub> Ratio',
-                text = hover_text,
-                hoverinfo = "text",
-                marker = dict(
-                    color = marker_colors,
-                    size = 10,
-                ),
-                line = dict(
-                    color = "black",
-                    width = 0
-                ),
-                showlegend = False
-            )
-
-            # Create the B-allele frequency trace.
-            baf_trace = plotly.graph_objs.Scatter(
-                x = sv_data["position"],
-                y = sv_data["b_allele_freq"],
-                mode = "markers+lines",
-                name = "B-Allele Frequency",
-                text = hover_text,
-                marker = dict(
-                    color = marker_colors,
-                    size = 10,
-                ),
-                line = dict(
-                    color = "black",
-                    width = 0
-                ),
-                showlegend = False
-            )
-
-            # Create a subplot for the CNV plot and the BAF plot.
-            fig = make_subplots(
-                rows=2,
-                cols=1,
-                shared_xaxes=True,
-                vertical_spacing=0.05,
-                subplot_titles=(r"SNP Log<sub>2</sub> Ratio", "SNP B-Allele Frequency")
-            )
-
-            # Add the traces to the figure.
-            fig.append_trace(log2_ratio_trace, 1, 1)
-            fig.append_trace(baf_trace, 2, 1)
-
-            # Set the x-axis title.
-            fig.update_xaxes(
-                title_text = "Chromosome Position",
-                row = 2,
-                col = 1
-            )
-
-            # Set the y-axis titles.
-            fig.update_yaxes(
-                title_text = r"Log<sub>2</sub> Ratio",
-                row = 1,
-                col = 1
-            )
-
-            fig.update_yaxes(
-                title_text = "B-Allele Frequency",
-                row = 2,
-                col = 1
-            )
-
-            # Set the figure title.
-            fig.update_layout(
-                title_text = f"CNV {svtype} {chromosome}:{start_position}-{end_position} (Read Depth: {read_depth})"
-            )
-
-            # Create a shaded rectangle for the CNV, layering it below the CNV
-            # trace and labeling it with the CNV type.
-            fig.add_vrect(
-                x0 = start_position,
-                x1 = end_position,
-                fillcolor = "Black",
-                layer = "below",
-                line_width = 0,
-                opacity = 0.1,
-                annotation_text = svtype,
-                annotation_position = "top left",
-                annotation_font_size = 20,
-                annotation_font_color = "black"
-            )
-
-            # Add vertical lines at the start and end positions of the CNV.
-            fig.add_vline(
-                x = start_position,
-                line_width = 2,
-                line_color = "black",
-                layer = "below"
-            )
-
-            fig.add_vline(
-                x = end_position,
-                line_width = 2,
-                line_color = "black",
-                layer = "below"
-            )
-
-            # Append the figure to the output html file.
-            output_html_file.write(fig.to_html(full_html=False, include_plotlyjs="cdn"))
-            log.info("Plotted CNV %s %s:%d-%d.", svtype, chromosome, start_position, end_position)
-
-            # Increment the CNV count.
-            cnv_count += 1
-
-            # Break if the maximum number of CNVs has been reached.
-            if cnv_count == max_cnvs:
-                break
-
-    # Close the output html file.
-    output_html_file.close()
+    with open(output_html_filepath, "w", encoding="utf-8") as output_html_file:
+
+        # Get the INFO field column by searching for the first column that
+        # contains SVTYPE=.
+        info_column = get_info_field_column(vcf_data)
+
+        # Loop through the VCF data and plot each CNV (DEL or DUP) along with log2
+        # ratio and BAF values for the SNPs in the CNV.
+        cnv_count = 0
+        for _, sv_data in vcf_data.iterrows():
+
+            # Get the INFO field
+            info_field = sv_data[info_column]
+
+            # Get the SVTYPE field value.
+            svtype = get_info_field_value(info_field, "SVTYPE")
+
+            # Analyze the CNV if it is a DEL or DUP.
+            if svtype in ("DEL", "DUP"):
+
+                # Get the read depth (DP) value.
+                read_depth = int(get_info_field_value(info_field, "DP"))
+
+                # Get the start and end positions.
+                start_position = int(sv_data[1])
+                end_position = int(get_info_field_value(info_field, "END"))
+
+                # Get the chromosome.
+                chromosome = sv_data[0]
+
+                # Get the length of the CNV.
+                cnv_length = end_position - start_position + 1
+
+                # Get the plot range.
+                plot_start_position = start_position - (plot_range * cnv_length)
+                plot_end_position = end_position + (plot_range * cnv_length)
+
+                # Get the CNV state, log2 ratio, and BAF values for all SNPs in the
+                # plot range.
+                sv_data = cnv_data[(cnv_data["position"] >= plot_start_position) & (cnv_data["position"] <= plot_end_position)]
+                
+                # Get the marker colors for the state sequence.
+                marker_colors = []
+                for state in sv_data["cnv_state"]:
+                    if state in [1, 2]:
+                        marker_colors.append("red")
+                    elif state in [3, 4]:
+                        marker_colors.append("black")
+                    elif state in [5, 6]:
+                        marker_colors.append("blue")
+
+                # Get the hover text for the state sequence markers.
+                hover_text = []
+                for _, row in sv_data.iterrows():
+                    hover_text.append(f"TYPE: {cnv_types[row['cnv_state']]}<br>CHR: {row['chromosome']}<br>POS: {row['position']}<br>L2R: {row['log2_ratio']}<br>BAF: {row['b_allele_freq']}")
+
+                # Create the log2 ratio trace.
+                log2_ratio_trace = plotly.graph_objs.Scatter(
+                    x = sv_data["position"],
+                    y = sv_data["log2_ratio"],
+                    mode = "markers+lines",
+                    name = r'Log<sub>2</sub> Ratio',
+                    text = hover_text,
+                    hoverinfo = "text",
+                    marker = dict(
+                        color = marker_colors,
+                        size = 10,
+                    ),
+                    line = dict(
+                        color = "black",
+                        width = 0
+                    ),
+                    showlegend = False
+                )
+
+                # Create the B-allele frequency trace.
+                baf_trace = plotly.graph_objs.Scatter(
+                    x = sv_data["position"],
+                    y = sv_data["b_allele_freq"],
+                    mode = "markers+lines",
+                    name = "B-Allele Frequency",
+                    text = hover_text,
+                    marker = dict(
+                        color = marker_colors,
+                        size = 10,
+                    ),
+                    line = dict(
+                        color = "black",
+                        width = 0
+                    ),
+                    showlegend = False
+                )
+
+                # Create a subplot for the CNV plot and the BAF plot.
+                fig = make_subplots(
+                    rows=2,
+                    cols=1,
+                    shared_xaxes=True,
+                    vertical_spacing=0.05,
+                    subplot_titles=(r"SNP Log<sub>2</sub> Ratio", "SNP B-Allele Frequency")
+                )
+
+                # Add the traces to the figure.
+                fig.append_trace(log2_ratio_trace, 1, 1)
+                fig.append_trace(baf_trace, 2, 1)
+
+                # Set the x-axis title.
+                fig.update_xaxes(
+                    title_text = "Chromosome Position",
+                    row = 2,
+                    col = 1
+                )
+
+                # Set the y-axis titles.
+                fig.update_yaxes(
+                    title_text = r"Log<sub>2</sub> Ratio",
+                    row = 1,
+                    col = 1
+                )
+
+                fig.update_yaxes(
+                    title_text = "B-Allele Frequency",
+                    row = 2,
+                    col = 1
+                )
+
+                # Set the figure title.
+                fig.update_layout(
+                    title_text = f"CNV {svtype} {chromosome}:{start_position}-{end_position} (Read Depth: {read_depth})"
+                )
+
+                # Create a shaded rectangle for the CNV, layering it below the CNV
+                # trace and labeling it with the CNV type.
+                fig.add_vrect(
+                    x0 = start_position,
+                    x1 = end_position,
+                    fillcolor = "Black",
+                    layer = "below",
+                    line_width = 0,
+                    opacity = 0.1,
+                    annotation_text = svtype,
+                    annotation_position = "top left",
+                    annotation_font_size = 20,
+                    annotation_font_color = "black"
+                )
+
+                # Add vertical lines at the start and end positions of the CNV.
+                fig.add_vline(
+                    x = start_position,
+                    line_width = 2,
+                    line_color = "black",
+                    layer = "below"
+                )
+
+                fig.add_vline(
+                    x = end_position,
+                    line_width = 2,
+                    line_color = "black",
+                    layer = "below"
+                )
+
+                # Append the figure to the output html file.
+                output_html_file.write(fig.to_html(full_html=False, include_plotlyjs="cdn"))
+                log.info("Plotted CNV %s %s:%d-%d.", svtype, chromosome, start_position, end_position)
+
+                # Increment the CNV count.
+                cnv_count += 1
+
+                # Break if the maximum number of CNVs has been reached.
+                if cnv_count == max_cnvs:
+                    break
     
     log.info("Saved CNV plots to %s.", output_html_filepath)
diff --git a/python/pfb_from_gnomad.py b/python/pfb_from_gnomad.py
new file mode 100644
index 00000000..c77b659e
--- /dev/null
+++ b/python/pfb_from_gnomad.py
@@ -0,0 +1,39 @@
+"""Run benchmarking on the ContextSV output VCF file."""
+import os
+import sys
+import logging as log
+import pandas as pd
+
+from .utils import get_info_field_column, get_info_field_value
+
+
+def save_snp_af_to_tsv(input_vcf, output_tsv):
+    """
+    Get the SNP Filtered Allele Frequency (FAF) from the input VCF file and save
+    to a TSV file.
+    This is the population B-allele frequency (PFB) of the SNP.
+    Any allele frequency less than 0.01 (1%) is considered a very rare variant
+    and is ignored in this analysis.
+    (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9160216/)
+    """
+    # Read the input VCF file.
+    vcf_df = pd.read_csv(input_vcf, sep="\t", comment="#", header=None)
+
+    # Get the column index of the INFO field.
+
+    # Return the SNP FAF.
+    return snp_faf
+
+
+# Run the program.
+if __name__ == "__main__":
+
+    # Get the GNOMAD VCF file of benchmarking variants.
+    benchmark_vcf = sys.argv[1]
+    
+    # Get the output TSV file.
+    output_tsv = sys.argv[2]
+
+    # Run the program.
+    run_cnv_benchmarking(benchmark_vcf, output_tsv)
+    
\ No newline at end of file
diff --git a/python/train_model.py b/python/train_model.py
index fe0d7b72..e5408313 100644
--- a/python/train_model.py
+++ b/python/train_model.py
@@ -5,16 +5,15 @@
 import os
 import sys
 import joblib
-import sklearn as sk
 from sklearn.linear_model import LogisticRegression
 import pandas as pd
-import os
 
-# Train the model.
-def train(tp_filepath, fp_filepath):
+
+def train(true_positives_filepath, false_positives_filepath):
+    """Train the binary classification model."""
     # Read in the true positive and false positive data.
-    tp_data = pd.read_csv(tp_filepath, sep="\t")
-    fp_data = pd.read_csv(fp_filepath, sep="\t")
+    tp_data = pd.read_csv(true_positives_filepath, sep="\t")
+    fp_data = pd.read_csv(false_positives_filepath, sep="\t")
 
     # Get the training data.
     tp_data["label"] = 1
@@ -33,12 +32,13 @@ def train(tp_filepath, fp_filepath):
     return model
 
 # Run the program.
-def run(tp_filepath, fp_filepath, output_dir):
+def run(true_positives_filepath, false_positives_filepath, output_directory):
+    """Run the program."""
     # Train the model.
-    model = train(tp_filepath, fp_filepath)
+    model = train(true_positives_filepath, false_positives_filepath)
 
     # Save the model
-    model_path = os.path.join(output_dir, "model.pkl")
+    model_path = os.path.join(output_directory, "model.pkl")
     joblib.dump(model, model_path)
 
     # Print the model.
@@ -47,8 +47,8 @@ def run(tp_filepath, fp_filepath, output_dir):
     # Return the model.
     return model
 
-# Score the structural variants.
 def score(model, cnv_data):
+    """Score the structural variants."""
     # Get the features.
     features = cnv_data[["lrr", "baf"]]
 
@@ -58,7 +58,7 @@ def score(model, cnv_data):
     # Return the scores.
     return scores
 
-# If this is the main program, run it.
+
 if __name__ == '__main__':
     # Get the command line arguments.
     tp_filepath = sys.argv[1]
diff --git a/python/utils.py b/python/utils.py
index 3a494883..9176dc9f 100644
--- a/python/utils.py
+++ b/python/utils.py
@@ -1,7 +1,7 @@
-# Utility functions for genome data analysis.
+"""Utility functions for genome data analysis."""
 
 def parse_region(region):
-    # Parse a region string into its chromosome and start and end positions.
+    """Parse a region string into its chromosome and start and end positions."""
     region_parts = region.split(":")
     chromosome = str(region_parts[0])
 
@@ -12,3 +12,33 @@ def parse_region(region):
         start_position, end_position = None, None
 
     return chromosome, start_position, end_position
+
+def get_info_field_column(vcf_data):
+    """Return the column index of the INFO field in a VCF file."""
+    index = vcf_data.apply(lambda col: col.astype(str).str.contains("SVTYPE=").any(), axis=0).idxmax()
+    return index
+
+def get_info_field_value(info_field, field_name):
+    """
+    Get the value of a field in the INFO field of a VCF file.
+
+    Args:
+        info_field (str): The INFO field.
+        field_name (str): The name of the field to get the value of.
+
+    Returns:
+        str: The value of the field.
+    """
+
+    # Split the INFO field into its parts.
+    info_field_parts = info_field.split(";")
+
+    # Get the field value.
+    field_value = ""
+    for info_field_part in info_field_parts:
+        if info_field_part.startswith("{}=".format(field_name)):
+            field_value = info_field_part.split("=")[1]
+            break
+
+    # Return the field value.
+    return field_value
diff --git a/src/fasta_query.cpp b/src/fasta_query.cpp
index 000885e7..58fb8913 100644
--- a/src/fasta_query.cpp
+++ b/src/fasta_query.cpp
@@ -7,6 +7,8 @@
 #include <string>
 #include <fstream>
 #include <sstream>
+#include <vector>
+#include <algorithm>
 /// @endcond
 
 
@@ -134,3 +136,33 @@ std::string FASTAQuery::query(std::string chr, int pos_start, int pos_end)
 
     return subsequence;
 }
+
+// Function to get the chromosome contig lengths in VCF header format
+std::string FASTAQuery::getContigHeader()
+{
+    std::string contig_header = "";
+
+    // Sort the chromosomes
+    std::vector<std::string> chromosomes;
+    for (auto const& chr_seq : this->chr_to_seq)
+    {
+        chromosomes.push_back(chr_seq.first);
+    }
+    std::sort(chromosomes.begin(), chromosomes.end());
+
+    // Iterate over the chromosomes and add them to the contig header
+    for (auto const& chr : chromosomes)
+    {
+        // Add the contig header line
+        contig_header += "##contig=<ID=" + chr + ",length=" + std::to_string(this->chr_to_seq[chr].length()) + ">\n";
+    }
+    // {
+    //     std::string chr = chr_seq.first;
+    //     std::string seq = chr_seq.second;
+
+    //     // Add the contig header line
+    //     contig_header += "##contig=<ID=" + chr + ",length=" + std::to_string(seq.length()) + ">\n";
+    // }
+
+    return contig_header;
+}
\ No newline at end of file
diff --git a/src/sv_data.cpp b/src/sv_data.cpp
index cb3e06ed..6977ded8 100644
--- a/src/sv_data.cpp
+++ b/src/sv_data.cpp
@@ -69,14 +69,54 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir)
         exit(1);
     }
 
+    // Set the sample name
+    std::string sample_name = "SAMPLE";
+
     // Write the header
     output_stream << "##fileformat=VCFv4.2" << std::endl;
+
+    // Get the date
+    time_t rawtime;
+    struct tm * timeinfo;
+    char buffer[80];
+    time(&rawtime);
+    timeinfo = localtime(&rawtime);
+
+    // Write the date
+    strftime(buffer, sizeof(buffer), "%Y%m%d", timeinfo);
+
+    // Write the date and source
+    output_stream << "##fileDate=" << buffer << std::endl;
+    output_stream << "##source=ContextSV" << std::endl;
+
+    // Write the reference genome
+    std::string ref_genome_filepath = ref_genome.getFilepath();
+    output_stream << "##reference=" << ref_genome_filepath << std::endl;
+
+    // Write the contig headers
+    output_stream << ref_genome.getContigHeader();
+
+    // Write the INFO lines
     output_stream << "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">" << std::endl;
     output_stream << "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">" << std::endl;
     output_stream << "##INFO=<ID=SVLEN,Number=1,Type=Integer,Description=\"Difference in length between REF and ALT alleles\">" << std::endl;
     output_stream << "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total read depth at the locus\">" << std::endl;
     output_stream << "##INFO=<ID=SVMETHOD,Number=1,Type=String,Description=\"Method used to call the structural variant\">" << std::endl;
-    output_stream << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO" << std::endl;
+    //output_stream << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO" << std::endl;
+
+    // Write the FILTER lines
+    output_stream << "##FILTER=<ID=PASS,Description=\"All filters passed\">" << std::endl;
+    output_stream << "##FILTER=<ID=LowQual,Description=\"Low quality\">" << std::endl;
+
+    // Write the FORMAT lines
+    // Genotype
+    output_stream << "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">" << std::endl;
+
+    // Read depth
+    output_stream << "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read depth\">" << std::endl;
+
+    // Add the header line
+    output_stream << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" << sample_name << std::endl;
 
     // Iterate over the SV calls
     std::cout << "Saving SV calls to " << output_vcf << "..." << std::endl;
@@ -89,6 +129,11 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir)
         int sv_type = info.first;
         int depth = info.second;
 
+        // If the SV type is unknown, skip it
+        if (sv_type == -1) {
+            continue;
+        }
+
         // Get the CHROM, POS, END, and ALT
         std::string chr = std::get<0>(candidate);
         int pos = std::get<1>(candidate);
@@ -116,8 +161,21 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir)
             alt_allele = "<" + sv_type_str + ">";
         }
 
+        // Set the genotype as unspecified for now (Haven't distinguished b/w homozygous, heterozygous)
+        std::string genotype = "./.";
+
+        // Create the INFO string
+        std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + ";SVLEN=" + std::to_string(sv_len) + ";DP=" + std::to_string(depth) + ";SVMETHOD=" + sv_method;
+
+        // Create the FORMAT string
+        std::string format_str = "GT:DP";
+
+        // Create the sample string
+        std::string sample_str = genotype + ":" + std::to_string(depth);
+
         // Write the SV call to the file
-        output_stream << chr << "\t" << pos << "\t" << "." << "\t" << ref_allele.substr(0, 10) << "\t" << alt_allele << "\t" << "." << "\t" << "." << "\t" << "END=" << end << ";SVTYPE=" << sv_type_str << ";SVLEN=" << sv_len << ";DP=" << depth << ";SVMETHOD=" << sv_method << std::endl;
+        output_stream << chr << "\t" << pos << "\t" << "." << "\t" << ref_allele.substr(0, 10) << "\t" << alt_allele << "\t" << "." << "\t" << "." << "\t" << info_str << "\t" << format_str << "\t" << sample_str << std::endl;
+        //output_stream << chr << "\t" << pos << "\t" << "." << "\t" << ref_allele.substr(0, 10) << "\t" << alt_allele << "\t" << "." << "\t" << "." << "\t" << "END=" << end << ";SVTYPE=" << sv_type_str << ";SVLEN=" << sv_len << ";DP=" << depth << ";SVMETHOD=" << sv_method << std::endl;
         //output_stream << chr << "\t" << pos << "\t" << "." << "\t" << ref_allele.substr(0, 10) << "\t" << alt_allele << "\t" << "." << "\t" << "." << "\t" << "SVTYPE=" << sv_type_str << ";SVLEN=" << sv_len << ";DP=" << depth << ";SVMETHOD=" << sv_method << std::endl;
         //output_stream << chr << "\t" << pos << "\t" << "." << "\t" << ref_allele.substr(0, 10) << "\t" << alt_allele << "\t" << "." << "\t" << "." << "\t" << "SVTYPE=" << sv_type_str << ";SVLEN=" << sv_len << ";DP=" << depth << std::endl;
     }