From c1921ac769aba0aa434d3b168e05eae4163ec895 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Wed, 27 Sep 2023 13:09:46 -0400 Subject: [PATCH] Update vcf format and work on PFB --- include/fasta_query.h | 3 + python/cnv_plots.py | 380 ++++++++++++++++++-------------------- python/pfb_from_gnomad.py | 39 ++++ python/train_model.py | 22 +-- python/utils.py | 34 +++- src/fasta_query.cpp | 32 ++++ src/sv_data.cpp | 62 ++++++- 7 files changed, 354 insertions(+), 218 deletions(-) create mode 100644 python/pfb_from_gnomad.py diff --git a/include/fasta_query.h b/include/fasta_query.h index 5ed70c62..0b237c44 100644 --- a/include/fasta_query.h +++ b/include/fasta_query.h @@ -17,6 +17,9 @@ class FASTAQuery { int setFilepath(std::string fasta_filepath); std::string getFilepath(); std::string query(std::string chr, int pos_start, int pos_end); + + // Get the chromosome contig lengths in VCF header format + std::string getContigHeader(); }; #endif // FASTA_QUERY_H diff --git a/python/cnv_plots.py b/python/cnv_plots.py index 9456d707..90b2fad4 100644 --- a/python/cnv_plots.py +++ b/python/cnv_plots.py @@ -1,4 +1,4 @@ -# Plot the copy number variants and their log2_ratio, BAF values. +"""Plot the copy number variants and their log2_ratio, BAF values.""" import os import sys @@ -7,7 +7,7 @@ from plotly.subplots import make_subplots import pandas as pd -from .utils import parse_region +from .utils import parse_region, get_info_field_column, get_info_field_value # Set up logging. log.basicConfig( @@ -18,31 +18,6 @@ ] ) -def get_info_field_value(info_field, field_name): - """ - Get the value of a field in the INFO field of a VCF file. - - Args: - info_field (str): The INFO field. - field_name (str): The name of the field to get the value of. - - Returns: - str: The value of the field. - """ - - # Split the INFO field into its parts. - info_field_parts = info_field.split(";") - - # Get the field value. - field_value = "" - for info_field_part in info_field_parts: - if info_field_part.startswith("{}=".format(field_name)): - field_value = info_field_part.split("=")[1] - break - - # Return the field value. - return field_value - def run(vcf_file, cnv_data_file, output_path, region): """ Saves a plot of the CNVs and their log2 ratio and B-allele frequency @@ -66,9 +41,9 @@ def run(vcf_file, cnv_data_file, output_path, region): chromosome, start_position, end_position = parse_region(region) if start_position is not None and end_position is not None: - log.info(f"Plotting CNVs in region %s:%d-%d.", chromosome, start_position, end_position) + log.info("Plotting CNVs in region %s:%d-%d.", chromosome, start_position, end_position) else: - log.info(f"Plotting CNVs in region %s.", chromosome) + log.info("Plotting CNVs in region %s.", chromosome) # Set up the CNV type string list for the 6 CNV states. cnv_types = ["NAN", "DEL", "DEL", "NEUT", "NEUT", "DUP", "DUP"] @@ -95,9 +70,9 @@ def run(vcf_file, cnv_data_file, output_path, region): # Create an output html file where we will append the CNV plots. if start_position is not None and end_position is not None: - html_filename = "cnv_plots_{}_{}_{}.html".format(chromosome, start_position, end_position) + html_filename = f"cnv_plots_{chromosome}_{start_position}_{end_position}.html" else: - html_filename = "cnv_plots_{}.html".format(chromosome) + html_filename = f"cnv_plots_{chromosome}.html" # Create the output directory if it doesn't exist. if not os.path.exists(output_path): @@ -108,177 +83,176 @@ def run(vcf_file, cnv_data_file, output_path, region): if os.path.exists(output_html_filepath): os.remove(output_html_filepath) - output_html_file = open(output_html_filepath, "w", encoding="utf-8") - - #output_html_file = open(os.path.join(output_path, "cnv_plots.html"), "w") - - # Loop through the VCF data and plot each CNV (DEL or DUP) along with log2 - # ratio and BAF values for the SNPs in the CNV. - cnv_count = 0 - for _, row in vcf_data.iterrows(): - - # Get the INFO field. - info_field = row[7] - - # Get the SVTYPE field value. - svtype = get_info_field_value(info_field, "SVTYPE") - - # Analyze the CNV if it is a DEL or DUP. - if svtype in ("DEL", "DUP"): - - # Get the read depth (DP) value. - read_depth = int(get_info_field_value(info_field, "DP")) - - # Get the start and end positions. - start_position = int(row[1]) - end_position = int(get_info_field_value(info_field, "END")) - - # Get the chromosome. - chromosome = row[0] - - # Get the length of the CNV. - cnv_length = end_position - start_position + 1 - - # Get the plot range. - plot_start_position = start_position - (plot_range * cnv_length) - plot_end_position = end_position + (plot_range * cnv_length) - - # Get the CNV state, log2 ratio, and BAF values for all SNPs in the - # plot range. - sv_data = cnv_data[(cnv_data["position"] >= plot_start_position) & (cnv_data["position"] <= plot_end_position)] - - # Get the marker colors for the state sequence. - marker_colors = [] - for state in sv_data["cnv_state"]: - if state in [1, 2]: - marker_colors.append("red") - elif state in [3, 4]: - marker_colors.append("black") - elif state in [5, 6]: - marker_colors.append("blue") - - # Get the hover text for the state sequence markers. - hover_text = [] - for _, row in sv_data.iterrows(): - hover_text.append(f"TYPE: {cnv_types[row['cnv_state']]}
CHR: {row['chromosome']}
POS: {row['position']}
L2R: {row['log2_ratio']}
BAF: {row['b_allele_freq']}") - - # Create the log2 ratio trace. - log2_ratio_trace = plotly.graph_objs.Scatter( - x = sv_data["position"], - y = sv_data["log2_ratio"], - mode = "markers+lines", - name = r'Log2 Ratio', - text = hover_text, - hoverinfo = "text", - marker = dict( - color = marker_colors, - size = 10, - ), - line = dict( - color = "black", - width = 0 - ), - showlegend = False - ) - - # Create the B-allele frequency trace. - baf_trace = plotly.graph_objs.Scatter( - x = sv_data["position"], - y = sv_data["b_allele_freq"], - mode = "markers+lines", - name = "B-Allele Frequency", - text = hover_text, - marker = dict( - color = marker_colors, - size = 10, - ), - line = dict( - color = "black", - width = 0 - ), - showlegend = False - ) - - # Create a subplot for the CNV plot and the BAF plot. - fig = make_subplots( - rows=2, - cols=1, - shared_xaxes=True, - vertical_spacing=0.05, - subplot_titles=(r"SNP Log2 Ratio", "SNP B-Allele Frequency") - ) - - # Add the traces to the figure. - fig.append_trace(log2_ratio_trace, 1, 1) - fig.append_trace(baf_trace, 2, 1) - - # Set the x-axis title. - fig.update_xaxes( - title_text = "Chromosome Position", - row = 2, - col = 1 - ) - - # Set the y-axis titles. - fig.update_yaxes( - title_text = r"Log2 Ratio", - row = 1, - col = 1 - ) - - fig.update_yaxes( - title_text = "B-Allele Frequency", - row = 2, - col = 1 - ) - - # Set the figure title. - fig.update_layout( - title_text = f"CNV {svtype} {chromosome}:{start_position}-{end_position} (Read Depth: {read_depth})" - ) - - # Create a shaded rectangle for the CNV, layering it below the CNV - # trace and labeling it with the CNV type. - fig.add_vrect( - x0 = start_position, - x1 = end_position, - fillcolor = "Black", - layer = "below", - line_width = 0, - opacity = 0.1, - annotation_text = svtype, - annotation_position = "top left", - annotation_font_size = 20, - annotation_font_color = "black" - ) - - # Add vertical lines at the start and end positions of the CNV. - fig.add_vline( - x = start_position, - line_width = 2, - line_color = "black", - layer = "below" - ) - - fig.add_vline( - x = end_position, - line_width = 2, - line_color = "black", - layer = "below" - ) - - # Append the figure to the output html file. - output_html_file.write(fig.to_html(full_html=False, include_plotlyjs="cdn")) - log.info("Plotted CNV %s %s:%d-%d.", svtype, chromosome, start_position, end_position) - - # Increment the CNV count. - cnv_count += 1 - - # Break if the maximum number of CNVs has been reached. - if cnv_count == max_cnvs: - break - - # Close the output html file. - output_html_file.close() + with open(output_html_filepath, "w", encoding="utf-8") as output_html_file: + + # Get the INFO field column by searching for the first column that + # contains SVTYPE=. + info_column = get_info_field_column(vcf_data) + + # Loop through the VCF data and plot each CNV (DEL or DUP) along with log2 + # ratio and BAF values for the SNPs in the CNV. + cnv_count = 0 + for _, sv_data in vcf_data.iterrows(): + + # Get the INFO field + info_field = sv_data[info_column] + + # Get the SVTYPE field value. + svtype = get_info_field_value(info_field, "SVTYPE") + + # Analyze the CNV if it is a DEL or DUP. + if svtype in ("DEL", "DUP"): + + # Get the read depth (DP) value. + read_depth = int(get_info_field_value(info_field, "DP")) + + # Get the start and end positions. + start_position = int(sv_data[1]) + end_position = int(get_info_field_value(info_field, "END")) + + # Get the chromosome. + chromosome = sv_data[0] + + # Get the length of the CNV. + cnv_length = end_position - start_position + 1 + + # Get the plot range. + plot_start_position = start_position - (plot_range * cnv_length) + plot_end_position = end_position + (plot_range * cnv_length) + + # Get the CNV state, log2 ratio, and BAF values for all SNPs in the + # plot range. + sv_data = cnv_data[(cnv_data["position"] >= plot_start_position) & (cnv_data["position"] <= plot_end_position)] + + # Get the marker colors for the state sequence. + marker_colors = [] + for state in sv_data["cnv_state"]: + if state in [1, 2]: + marker_colors.append("red") + elif state in [3, 4]: + marker_colors.append("black") + elif state in [5, 6]: + marker_colors.append("blue") + + # Get the hover text for the state sequence markers. + hover_text = [] + for _, row in sv_data.iterrows(): + hover_text.append(f"TYPE: {cnv_types[row['cnv_state']]}
CHR: {row['chromosome']}
POS: {row['position']}
L2R: {row['log2_ratio']}
BAF: {row['b_allele_freq']}") + + # Create the log2 ratio trace. + log2_ratio_trace = plotly.graph_objs.Scatter( + x = sv_data["position"], + y = sv_data["log2_ratio"], + mode = "markers+lines", + name = r'Log2 Ratio', + text = hover_text, + hoverinfo = "text", + marker = dict( + color = marker_colors, + size = 10, + ), + line = dict( + color = "black", + width = 0 + ), + showlegend = False + ) + + # Create the B-allele frequency trace. + baf_trace = plotly.graph_objs.Scatter( + x = sv_data["position"], + y = sv_data["b_allele_freq"], + mode = "markers+lines", + name = "B-Allele Frequency", + text = hover_text, + marker = dict( + color = marker_colors, + size = 10, + ), + line = dict( + color = "black", + width = 0 + ), + showlegend = False + ) + + # Create a subplot for the CNV plot and the BAF plot. + fig = make_subplots( + rows=2, + cols=1, + shared_xaxes=True, + vertical_spacing=0.05, + subplot_titles=(r"SNP Log2 Ratio", "SNP B-Allele Frequency") + ) + + # Add the traces to the figure. + fig.append_trace(log2_ratio_trace, 1, 1) + fig.append_trace(baf_trace, 2, 1) + + # Set the x-axis title. + fig.update_xaxes( + title_text = "Chromosome Position", + row = 2, + col = 1 + ) + + # Set the y-axis titles. + fig.update_yaxes( + title_text = r"Log2 Ratio", + row = 1, + col = 1 + ) + + fig.update_yaxes( + title_text = "B-Allele Frequency", + row = 2, + col = 1 + ) + + # Set the figure title. + fig.update_layout( + title_text = f"CNV {svtype} {chromosome}:{start_position}-{end_position} (Read Depth: {read_depth})" + ) + + # Create a shaded rectangle for the CNV, layering it below the CNV + # trace and labeling it with the CNV type. + fig.add_vrect( + x0 = start_position, + x1 = end_position, + fillcolor = "Black", + layer = "below", + line_width = 0, + opacity = 0.1, + annotation_text = svtype, + annotation_position = "top left", + annotation_font_size = 20, + annotation_font_color = "black" + ) + + # Add vertical lines at the start and end positions of the CNV. + fig.add_vline( + x = start_position, + line_width = 2, + line_color = "black", + layer = "below" + ) + + fig.add_vline( + x = end_position, + line_width = 2, + line_color = "black", + layer = "below" + ) + + # Append the figure to the output html file. + output_html_file.write(fig.to_html(full_html=False, include_plotlyjs="cdn")) + log.info("Plotted CNV %s %s:%d-%d.", svtype, chromosome, start_position, end_position) + + # Increment the CNV count. + cnv_count += 1 + + # Break if the maximum number of CNVs has been reached. + if cnv_count == max_cnvs: + break log.info("Saved CNV plots to %s.", output_html_filepath) diff --git a/python/pfb_from_gnomad.py b/python/pfb_from_gnomad.py new file mode 100644 index 00000000..c77b659e --- /dev/null +++ b/python/pfb_from_gnomad.py @@ -0,0 +1,39 @@ +"""Run benchmarking on the ContextSV output VCF file.""" +import os +import sys +import logging as log +import pandas as pd + +from .utils import get_info_field_column, get_info_field_value + + +def save_snp_af_to_tsv(input_vcf, output_tsv): + """ + Get the SNP Filtered Allele Frequency (FAF) from the input VCF file and save + to a TSV file. + This is the population B-allele frequency (PFB) of the SNP. + Any allele frequency less than 0.01 (1%) is considered a very rare variant + and is ignored in this analysis. + (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9160216/) + """ + # Read the input VCF file. + vcf_df = pd.read_csv(input_vcf, sep="\t", comment="#", header=None) + + # Get the column index of the INFO field. + + # Return the SNP FAF. + return snp_faf + + +# Run the program. +if __name__ == "__main__": + + # Get the GNOMAD VCF file of benchmarking variants. + benchmark_vcf = sys.argv[1] + + # Get the output TSV file. + output_tsv = sys.argv[2] + + # Run the program. + run_cnv_benchmarking(benchmark_vcf, output_tsv) + \ No newline at end of file diff --git a/python/train_model.py b/python/train_model.py index fe0d7b72..e5408313 100644 --- a/python/train_model.py +++ b/python/train_model.py @@ -5,16 +5,15 @@ import os import sys import joblib -import sklearn as sk from sklearn.linear_model import LogisticRegression import pandas as pd -import os -# Train the model. -def train(tp_filepath, fp_filepath): + +def train(true_positives_filepath, false_positives_filepath): + """Train the binary classification model.""" # Read in the true positive and false positive data. - tp_data = pd.read_csv(tp_filepath, sep="\t") - fp_data = pd.read_csv(fp_filepath, sep="\t") + tp_data = pd.read_csv(true_positives_filepath, sep="\t") + fp_data = pd.read_csv(false_positives_filepath, sep="\t") # Get the training data. tp_data["label"] = 1 @@ -33,12 +32,13 @@ def train(tp_filepath, fp_filepath): return model # Run the program. -def run(tp_filepath, fp_filepath, output_dir): +def run(true_positives_filepath, false_positives_filepath, output_directory): + """Run the program.""" # Train the model. - model = train(tp_filepath, fp_filepath) + model = train(true_positives_filepath, false_positives_filepath) # Save the model - model_path = os.path.join(output_dir, "model.pkl") + model_path = os.path.join(output_directory, "model.pkl") joblib.dump(model, model_path) # Print the model. @@ -47,8 +47,8 @@ def run(tp_filepath, fp_filepath, output_dir): # Return the model. return model -# Score the structural variants. def score(model, cnv_data): + """Score the structural variants.""" # Get the features. features = cnv_data[["lrr", "baf"]] @@ -58,7 +58,7 @@ def score(model, cnv_data): # Return the scores. return scores -# If this is the main program, run it. + if __name__ == '__main__': # Get the command line arguments. tp_filepath = sys.argv[1] diff --git a/python/utils.py b/python/utils.py index 3a494883..9176dc9f 100644 --- a/python/utils.py +++ b/python/utils.py @@ -1,7 +1,7 @@ -# Utility functions for genome data analysis. +"""Utility functions for genome data analysis.""" def parse_region(region): - # Parse a region string into its chromosome and start and end positions. + """Parse a region string into its chromosome and start and end positions.""" region_parts = region.split(":") chromosome = str(region_parts[0]) @@ -12,3 +12,33 @@ def parse_region(region): start_position, end_position = None, None return chromosome, start_position, end_position + +def get_info_field_column(vcf_data): + """Return the column index of the INFO field in a VCF file.""" + index = vcf_data.apply(lambda col: col.astype(str).str.contains("SVTYPE=").any(), axis=0).idxmax() + return index + +def get_info_field_value(info_field, field_name): + """ + Get the value of a field in the INFO field of a VCF file. + + Args: + info_field (str): The INFO field. + field_name (str): The name of the field to get the value of. + + Returns: + str: The value of the field. + """ + + # Split the INFO field into its parts. + info_field_parts = info_field.split(";") + + # Get the field value. + field_value = "" + for info_field_part in info_field_parts: + if info_field_part.startswith("{}=".format(field_name)): + field_value = info_field_part.split("=")[1] + break + + # Return the field value. + return field_value diff --git a/src/fasta_query.cpp b/src/fasta_query.cpp index 000885e7..58fb8913 100644 --- a/src/fasta_query.cpp +++ b/src/fasta_query.cpp @@ -7,6 +7,8 @@ #include #include #include +#include +#include /// @endcond @@ -134,3 +136,33 @@ std::string FASTAQuery::query(std::string chr, int pos_start, int pos_end) return subsequence; } + +// Function to get the chromosome contig lengths in VCF header format +std::string FASTAQuery::getContigHeader() +{ + std::string contig_header = ""; + + // Sort the chromosomes + std::vector chromosomes; + for (auto const& chr_seq : this->chr_to_seq) + { + chromosomes.push_back(chr_seq.first); + } + std::sort(chromosomes.begin(), chromosomes.end()); + + // Iterate over the chromosomes and add them to the contig header + for (auto const& chr : chromosomes) + { + // Add the contig header line + contig_header += "##contig=\n"; + } + // { + // std::string chr = chr_seq.first; + // std::string seq = chr_seq.second; + + // // Add the contig header line + // contig_header += "##contig=\n"; + // } + + return contig_header; +} \ No newline at end of file diff --git a/src/sv_data.cpp b/src/sv_data.cpp index cb3e06ed..6977ded8 100644 --- a/src/sv_data.cpp +++ b/src/sv_data.cpp @@ -69,14 +69,54 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir) exit(1); } + // Set the sample name + std::string sample_name = "SAMPLE"; + // Write the header output_stream << "##fileformat=VCFv4.2" << std::endl; + + // Get the date + time_t rawtime; + struct tm * timeinfo; + char buffer[80]; + time(&rawtime); + timeinfo = localtime(&rawtime); + + // Write the date + strftime(buffer, sizeof(buffer), "%Y%m%d", timeinfo); + + // Write the date and source + output_stream << "##fileDate=" << buffer << std::endl; + output_stream << "##source=ContextSV" << std::endl; + + // Write the reference genome + std::string ref_genome_filepath = ref_genome.getFilepath(); + output_stream << "##reference=" << ref_genome_filepath << std::endl; + + // Write the contig headers + output_stream << ref_genome.getContigHeader(); + + // Write the INFO lines output_stream << "##INFO=" << std::endl; output_stream << "##INFO=" << std::endl; output_stream << "##INFO=" << std::endl; output_stream << "##INFO=" << std::endl; output_stream << "##INFO=" << std::endl; - output_stream << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO" << std::endl; + //output_stream << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO" << std::endl; + + // Write the FILTER lines + output_stream << "##FILTER=" << std::endl; + output_stream << "##FILTER=" << std::endl; + + // Write the FORMAT lines + // Genotype + output_stream << "##FORMAT=" << std::endl; + + // Read depth + output_stream << "##FORMAT=" << std::endl; + + // Add the header line + output_stream << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" << sample_name << std::endl; // Iterate over the SV calls std::cout << "Saving SV calls to " << output_vcf << "..." << std::endl; @@ -89,6 +129,11 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir) int sv_type = info.first; int depth = info.second; + // If the SV type is unknown, skip it + if (sv_type == -1) { + continue; + } + // Get the CHROM, POS, END, and ALT std::string chr = std::get<0>(candidate); int pos = std::get<1>(candidate); @@ -116,8 +161,21 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir) alt_allele = "<" + sv_type_str + ">"; } + // Set the genotype as unspecified for now (Haven't distinguished b/w homozygous, heterozygous) + std::string genotype = "./."; + + // Create the INFO string + std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + ";SVLEN=" + std::to_string(sv_len) + ";DP=" + std::to_string(depth) + ";SVMETHOD=" + sv_method; + + // Create the FORMAT string + std::string format_str = "GT:DP"; + + // Create the sample string + std::string sample_str = genotype + ":" + std::to_string(depth); + // Write the SV call to the file - output_stream << chr << "\t" << pos << "\t" << "." << "\t" << ref_allele.substr(0, 10) << "\t" << alt_allele << "\t" << "." << "\t" << "." << "\t" << "END=" << end << ";SVTYPE=" << sv_type_str << ";SVLEN=" << sv_len << ";DP=" << depth << ";SVMETHOD=" << sv_method << std::endl; + output_stream << chr << "\t" << pos << "\t" << "." << "\t" << ref_allele.substr(0, 10) << "\t" << alt_allele << "\t" << "." << "\t" << "." << "\t" << info_str << "\t" << format_str << "\t" << sample_str << std::endl; + //output_stream << chr << "\t" << pos << "\t" << "." << "\t" << ref_allele.substr(0, 10) << "\t" << alt_allele << "\t" << "." << "\t" << "." << "\t" << "END=" << end << ";SVTYPE=" << sv_type_str << ";SVLEN=" << sv_len << ";DP=" << depth << ";SVMETHOD=" << sv_method << std::endl; //output_stream << chr << "\t" << pos << "\t" << "." << "\t" << ref_allele.substr(0, 10) << "\t" << alt_allele << "\t" << "." << "\t" << "." << "\t" << "SVTYPE=" << sv_type_str << ";SVLEN=" << sv_len << ";DP=" << depth << ";SVMETHOD=" << sv_method << std::endl; //output_stream << chr << "\t" << pos << "\t" << "." << "\t" << ref_allele.substr(0, 10) << "\t" << alt_allele << "\t" << "." << "\t" << "." << "\t" << "SVTYPE=" << sv_type_str << ";SVLEN=" << sv_len << ";DP=" << depth << std::endl; }