From b4c2115ab3fdbc8d8548b12b45c0e6cf5969cb9a Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Thu, 24 Oct 2024 11:42:29 -0400 Subject: [PATCH] first working pass at using noodles to read bam and report counts --- gtars/Cargo.toml | 2 +- gtars/src/uniwig/mod.rs | 62 +++++++++++++-------- gtars/src/uniwig/reading.rs | 61 ++++++++++++++++++--- gtars/tests/test.rs | 104 ++++++++++++++++++------------------ 4 files changed, 147 insertions(+), 82 deletions(-) diff --git a/gtars/Cargo.toml b/gtars/Cargo.toml index f9f2831..ad0cbc6 100644 --- a/gtars/Cargo.toml +++ b/gtars/Cargo.toml @@ -20,7 +20,7 @@ ndarray-npy = "0.8.1" ndarray = "0.15.6" tempfile = "3.10.1" byteorder = "1.5.0" -noodles = { version = "0.83.0", features = ["bam"] } +noodles = { version = "0.83.0", features = ["bam", "sam"] } bstr = "1.10.0" rayon = "1.10.0" indicatif = "0.17.8" diff --git a/gtars/src/uniwig/mod.rs b/gtars/src/uniwig/mod.rs index d103573..5ba8d57 100644 --- a/gtars/src/uniwig/mod.rs +++ b/gtars/src/uniwig/mod.rs @@ -8,9 +8,7 @@ use std::error::Error; use std::io::{BufWriter, Write}; use crate::uniwig::counting::{core_counts, start_end_counts}; -use crate::uniwig::reading::{ - read_bam_header, read_bed_vec, read_chromosome_sizes, read_narrow_peak_vec, -}; +use crate::uniwig::reading::{get_seq_reads_bam, read_bam_header, read_bed_vec, read_chromosome_sizes, read_narrow_peak_vec}; use crate::uniwig::writing::{ write_bw_files, write_combined_files, write_to_bed_graph_file, write_to_npy_file, write_to_wig_file, @@ -49,6 +47,7 @@ impl FromStr for FileType { } // Chromosome representation for Bed File Inputs +#[derive(Debug)] pub struct Chromosome { pub chrom: String, pub starts: Vec<(i32, i32)>, @@ -187,7 +186,7 @@ pub fn uniwig_main( read_bed_vec(filepath) } } - Ok(FileType::BAM) => read_bam_header(filepath), + Ok(FileType::BAM) => read_bam_header(filepath), //TODO Also check for associated .bai file and if it does not exist create one. _ => read_bed_vec(filepath), }; @@ -226,8 +225,16 @@ pub fn uniwig_main( final_chromosomes .par_iter() .for_each(|chromosome: &Chromosome| { - // Need these for setting wiggle header + bar.inc(1); + match ft { + Ok(FileType::BAM) => { + let mut chromosome = chromosome.clone(); // empty vectors, so cloning is not a big deal. + get_seq_reads_bam(&mut chromosome, filepath) + }, + _ => {}, + }; + let primary_start = chromosome.starts[0].clone(); let primary_end = chromosome.ends[0].clone(); @@ -252,12 +259,12 @@ pub fn uniwig_main( smoothsize, stepsize, ), - Ok(FileType::BAM) => smooth_fixed_start_end_wiggle_bam( - &chromosome.starts, - current_chrom_size, - smoothsize, - stepsize, - ), + // Ok(FileType::BAM) => smooth_fixed_start_end_wiggle_bam( + // &chromosome.starts, + // current_chrom_size, + // smoothsize, + // stepsize, + // ), _ => start_end_counts( &chromosome.starts, current_chrom_size, @@ -346,12 +353,12 @@ pub fn uniwig_main( smoothsize, stepsize, ), - Ok(FileType::BAM) => smooth_fixed_start_end_wiggle_bam( - &chromosome.ends, - current_chrom_size, - smoothsize, - stepsize, - ), + // Ok(FileType::BAM) => smooth_fixed_start_end_wiggle_bam( + // &chromosome.ends, + // current_chrom_size, + // smoothsize, + // stepsize, + // ), _ => start_end_counts( &chromosome.ends, current_chrom_size, @@ -438,12 +445,12 @@ pub fn uniwig_main( current_chrom_size, stepsize, ), - Ok(FileType::BAM) => fixed_core_wiggle_bam( - &chromosome.starts, - &chromosome.ends, - current_chrom_size, - stepsize, - ), + // Ok(FileType::BAM) => fixed_core_wiggle_bam( + // &chromosome.starts, + // &chromosome.ends, + // current_chrom_size, + // stepsize, + // ), _ => core_counts( &chromosome.starts, &chromosome.ends, @@ -530,6 +537,13 @@ pub fn uniwig_main( }); bar.finish(); + + + + + + + let vec_strings = vec!["start", "core", "end"]; let bar = ProgressBar::new(vec_strings.len() as u64); @@ -560,6 +574,8 @@ pub fn uniwig_main( Ok(()) } + + fn fixed_core_wiggle_bam( _p0: &Vec<(i32, i32)>, _p1: &Vec<(i32, i32)>, diff --git a/gtars/src/uniwig/reading.rs b/gtars/src/uniwig/reading.rs index 0f500af..677a924 100644 --- a/gtars/src/uniwig/reading.rs +++ b/gtars/src/uniwig/reading.rs @@ -4,10 +4,15 @@ use flate2::read::GzDecoder; use noodles::bam; use std::error::Error; use std::fs::File; +use std::io; use std::io::{BufRead, BufReader, Read}; use std::ops::Deref; use std::path::Path; +use noodles::sam::alignment::Record; + +const UNMAPPED: &str = "*"; + /// Reads combined bed file from a given path. /// Returns Vec of Chromosome struct pub fn read_bed_vec(combinedbedpath: &str) -> Vec { @@ -261,8 +266,6 @@ pub fn read_chromosome_sizes( } pub fn read_bam_header(filepath: &str) -> Vec { - // BAM and SAM format specification https://samtools.github.io/hts-specs/SAMv1.pdf - println!("READ BAM HEADER PLACE HOLDER"); let mut reader = bam::io::reader::Builder.build_from_path(filepath).unwrap(); let header = reader.read_header(); @@ -280,16 +283,60 @@ pub fn read_bam_header(filepath: &str) -> Vec { for ref_key in references { let chrom_name_vec = ref_key.0.deref().clone(); let chrom_name = String::from_utf8((*chrom_name_vec).to_owned()).unwrap(); - - //For later - // use bstr::BString; - // - // let s = BString::from("Hello, world!"); chromosome.chrom = chrom_name; chromosome.starts.push((0, 0)); //default values for now, less important for bam chromosome.ends.push((0, 0)); //default values for now, less important for bam chromosome_vec.push(chromosome.clone()); } + // + // for c in &chromosome_vec{ + // println!("chromsome= {:?}", c); + // } chromosome_vec } + +pub fn get_seq_reads_bam(chromosome: &mut Chromosome, filepath: &str) { + // read bam seq info into the current Chromosome + + // TODO this function requires there to be an associated .bai file in the same directory as the .bam file + // And the error message if it does not exist is not very helpful. + + let src = String::from(filepath); + let raw_region = String::from(chromosome.chrom.clone()); + //let raw_region = String::from("chr1"); + + let mut reader = bam::io::indexed_reader::Builder::default().build_from_path(src).unwrap(); + let header = reader.read_header().unwrap(); + + let records: Box>> = if raw_region == UNMAPPED { + reader.query_unmapped().map(Box::new).unwrap() + } else { + let region = raw_region.parse().unwrap(); + reader.query(&header, ®ion).map(Box::new).unwrap() + }; + + // remove the placeholder (0,0 )) + chromosome.starts.remove(0); + chromosome.ends.remove(0); + let default_score = 1; + + for result in records { + let record = result.unwrap(); + let flags = record.flags(); + //TODO Determine position shift via what flags are set + let start_position = record.alignment_start().unwrap().unwrap(); + let start = start_position.get(); + let end_position = record.alignment_end().unwrap().unwrap(); + let end = end_position.get(); + chromosome.starts.push((start as i32, default_score)); + chromosome.ends.push((end as i32, default_score)); + + } + + chromosome.starts.sort_unstable_by(|a, b| a.0.cmp(&b.0)); + chromosome.ends.sort_unstable_by(|a, b| a.0.cmp(&b.0)); + + + println!("Finished reading seq for chrom: {}",chromosome.chrom.clone() ); +} diff --git a/gtars/tests/test.rs b/gtars/tests/test.rs index 0b555fc..21d855d 100644 --- a/gtars/tests/test.rs +++ b/gtars/tests/test.rs @@ -19,10 +19,10 @@ fn path_to_sorted_small_bed_file() -> &'static str { "tests/data/test_sorted_small.bed" } -// #[fixture] -// fn path_to_small_bam_file() -> &'static str { -// "tests/data/test1_sort_dedup.bam" -// } +#[fixture] +fn path_to_small_bam_file() -> &'static str { + "/home/drc/Downloads/bam files for rust test/test1_sort_dedup.bam" //todo change back to relative to test folder +} #[fixture] fn path_to_chrom_sizes_file() -> &'static str { @@ -73,7 +73,7 @@ mod tests { use gtars::uniwig::counting::{core_counts, start_end_counts}; use gtars::uniwig::reading::{ - parse_bed_file, read_bed_vec, read_chromosome_sizes, read_narrow_peak_vec, + parse_bed_file, read_bed_vec, read_chromosome_sizes, read_narrow_peak_vec,read_bam_header, }; use gtars::uniwig::writing::write_bw_files; @@ -334,53 +334,55 @@ mod tests { assert_eq!(num_chromosomes, 5); } - // #[rstest] - // fn test_read_bam_header(path_to_small_bam_file: &str) { - // let chromosomes: Vec = read_bam_header(path_to_small_bam_file); - // let num_chromosomes = chromosomes.len(); - // println!("Number of chroms: {}", num_chromosomes); - // assert_eq!(num_chromosomes, 195); - // } + #[rstest] + fn test_read_bam_header(path_to_small_bam_file: &str) { + let chromosomes: Vec = read_bam_header(path_to_small_bam_file); + let num_chromosomes = chromosomes.len(); + println!("Number of chroms: {}", num_chromosomes); + assert_eq!(num_chromosomes, 195); + } + + #[rstest] + fn test_process_bam(path_to_small_bam_file: &str) -> Result<(), Box<(dyn std::error::Error + 'static)>> { + let path_to_crate = env!("CARGO_MANIFEST_DIR"); + let chromsizerefpath: String = format!("{}{}", path_to_crate, "/tests/hg38.chrom.sizes"); + let chromsizerefpath = chromsizerefpath.as_str(); + let combinedbedpath = path_to_small_bam_file; + + let tempdir = tempfile::tempdir().unwrap(); + let path = PathBuf::from(&tempdir.path()); + + // For some reason, you cannot chain .as_string() to .unwrap() and must create a new line. + //let bwfileheader_path = path.into_os_string().into_string().unwrap(); + //let bwfileheader = bwfileheader_path.as_str(); + let bwfileheader = "/home/drc/Downloads/baminput_bwoutput_test_rust/"; //todo change back to non local example + + + let smoothsize: i32 = 1; + let output_type = "wig"; + let filetype = "bam"; + let num_threads = 6; + let score = false; + let stepsize = 1; + let zoom = 0; + + uniwig_main( + smoothsize, + combinedbedpath, + chromsizerefpath, + bwfileheader, + output_type, + filetype, + num_threads, + score, + stepsize, + zoom, + ) + .expect("Uniwig main failed!"); + + Ok(()) + } - // #[rstest] - // fn test_run_uniwig_main_bam_input_wig_output( - // path_to_small_bam_file: &str, - // path_to_chrom_sizes_file: &str, - // ) -> Result<(), Box<(dyn std::error::Error + 'static)>> { - // // This test uses a chrom sizes file and a bam file and will take a long time to run. - // // only run this during dev/troubleshooting, comment out for normal test suite checks - // //let path_to_crate = env!("CARGO_MANIFEST_DIR"); - // - // //let tempbedpath = format!("{}{}", path_to_crate, "/tests/data/test5.bed"); - // let combinedbedpath = path_to_small_bam_file; - // - // let chromsizerefpath = path_to_chrom_sizes_file; - // - // let tempdir = tempfile::tempdir().unwrap(); - // let path = PathBuf::from(&tempdir.path()); - // - // // For some reason, you cannot chain .as_string() to .unwrap() and must create a new line. - // let bwfileheader_path = path.into_os_string().into_string().unwrap(); - // let bwfileheader = bwfileheader_path.as_str(); - // - // let smoothsize: i32 = 5; - // let output_type = "wig"; - // let filetype = "bam"; - // let num_threads =6; - // - // uniwig_main( - // smoothsize, - // combinedbedpath, - // chromsizerefpath, - // bwfileheader, - // output_type, - // filetype, - // num_threads, - // ) - // .expect("Uniwig main failed!"); - // - // Ok(()) - // } #[rstest] fn test_run_uniwig_main_wig_type() -> Result<(), Box<(dyn std::error::Error + 'static)>> {