Skip to content

Commit

Permalink
first working pass at using noodles to read bam and report counts
Browse files Browse the repository at this point in the history
  • Loading branch information
donaldcampbelljr committed Oct 24, 2024
1 parent 6c9c811 commit b4c2115
Show file tree
Hide file tree
Showing 4 changed files with 147 additions and 82 deletions.
2 changes: 1 addition & 1 deletion gtars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ ndarray-npy = "0.8.1"
ndarray = "0.15.6"
tempfile = "3.10.1"
byteorder = "1.5.0"
noodles = { version = "0.83.0", features = ["bam"] }
noodles = { version = "0.83.0", features = ["bam", "sam"] }
bstr = "1.10.0"
rayon = "1.10.0"
indicatif = "0.17.8"
Expand Down
62 changes: 39 additions & 23 deletions gtars/src/uniwig/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,7 @@ use std::error::Error;
use std::io::{BufWriter, Write};

use crate::uniwig::counting::{core_counts, start_end_counts};
use crate::uniwig::reading::{
read_bam_header, read_bed_vec, read_chromosome_sizes, read_narrow_peak_vec,
};
use crate::uniwig::reading::{get_seq_reads_bam, read_bam_header, read_bed_vec, read_chromosome_sizes, read_narrow_peak_vec};
use crate::uniwig::writing::{
write_bw_files, write_combined_files, write_to_bed_graph_file, write_to_npy_file,
write_to_wig_file,
Expand Down Expand Up @@ -49,6 +47,7 @@ impl FromStr for FileType {
}

// Chromosome representation for Bed File Inputs
#[derive(Debug)]
pub struct Chromosome {
pub chrom: String,
pub starts: Vec<(i32, i32)>,
Expand Down Expand Up @@ -187,7 +186,7 @@ pub fn uniwig_main(
read_bed_vec(filepath)
}
}
Ok(FileType::BAM) => read_bam_header(filepath),
Ok(FileType::BAM) => read_bam_header(filepath), //TODO Also check for associated .bai file and if it does not exist create one.
_ => read_bed_vec(filepath),
};

Expand Down Expand Up @@ -226,8 +225,16 @@ pub fn uniwig_main(
final_chromosomes
.par_iter()
.for_each(|chromosome: &Chromosome| {
// Need these for setting wiggle header

bar.inc(1);
match ft {
Ok(FileType::BAM) => {
let mut chromosome = chromosome.clone(); // empty vectors, so cloning is not a big deal.
get_seq_reads_bam(&mut chromosome, filepath)
},
_ => {},
};

let primary_start = chromosome.starts[0].clone();
let primary_end = chromosome.ends[0].clone();

Expand All @@ -252,12 +259,12 @@ pub fn uniwig_main(
smoothsize,
stepsize,
),
Ok(FileType::BAM) => smooth_fixed_start_end_wiggle_bam(
&chromosome.starts,
current_chrom_size,
smoothsize,
stepsize,
),
// Ok(FileType::BAM) => smooth_fixed_start_end_wiggle_bam(
// &chromosome.starts,
// current_chrom_size,
// smoothsize,
// stepsize,
// ),
_ => start_end_counts(
&chromosome.starts,
current_chrom_size,
Expand Down Expand Up @@ -346,12 +353,12 @@ pub fn uniwig_main(
smoothsize,
stepsize,
),
Ok(FileType::BAM) => smooth_fixed_start_end_wiggle_bam(
&chromosome.ends,
current_chrom_size,
smoothsize,
stepsize,
),
// Ok(FileType::BAM) => smooth_fixed_start_end_wiggle_bam(
// &chromosome.ends,
// current_chrom_size,
// smoothsize,
// stepsize,
// ),
_ => start_end_counts(
&chromosome.ends,
current_chrom_size,
Expand Down Expand Up @@ -438,12 +445,12 @@ pub fn uniwig_main(
current_chrom_size,
stepsize,
),
Ok(FileType::BAM) => fixed_core_wiggle_bam(
&chromosome.starts,
&chromosome.ends,
current_chrom_size,
stepsize,
),
// Ok(FileType::BAM) => fixed_core_wiggle_bam(
// &chromosome.starts,
// &chromosome.ends,
// current_chrom_size,
// stepsize,
// ),
_ => core_counts(
&chromosome.starts,
&chromosome.ends,
Expand Down Expand Up @@ -530,6 +537,13 @@ pub fn uniwig_main(
});

bar.finish();







let vec_strings = vec!["start", "core", "end"];

let bar = ProgressBar::new(vec_strings.len() as u64);
Expand Down Expand Up @@ -560,6 +574,8 @@ pub fn uniwig_main(
Ok(())
}



fn fixed_core_wiggle_bam(
_p0: &Vec<(i32, i32)>,
_p1: &Vec<(i32, i32)>,
Expand Down
61 changes: 54 additions & 7 deletions gtars/src/uniwig/reading.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,15 @@ use flate2::read::GzDecoder;
use noodles::bam;
use std::error::Error;
use std::fs::File;
use std::io;
use std::io::{BufRead, BufReader, Read};
use std::ops::Deref;
use std::path::Path;

use noodles::sam::alignment::Record;

const UNMAPPED: &str = "*";

/// Reads combined bed file from a given path.
/// Returns Vec of Chromosome struct
pub fn read_bed_vec(combinedbedpath: &str) -> Vec<Chromosome> {
Expand Down Expand Up @@ -261,8 +266,6 @@ pub fn read_chromosome_sizes(
}

pub fn read_bam_header(filepath: &str) -> Vec<Chromosome> {
// BAM and SAM format specification https://samtools.github.io/hts-specs/SAMv1.pdf
println!("READ BAM HEADER PLACE HOLDER");

let mut reader = bam::io::reader::Builder.build_from_path(filepath).unwrap();
let header = reader.read_header();
Expand All @@ -280,16 +283,60 @@ pub fn read_bam_header(filepath: &str) -> Vec<Chromosome> {
for ref_key in references {
let chrom_name_vec = ref_key.0.deref().clone();
let chrom_name = String::from_utf8((*chrom_name_vec).to_owned()).unwrap();

//For later
// use bstr::BString;
//
// let s = BString::from("Hello, world!");
chromosome.chrom = chrom_name;
chromosome.starts.push((0, 0)); //default values for now, less important for bam
chromosome.ends.push((0, 0)); //default values for now, less important for bam
chromosome_vec.push(chromosome.clone());
}
//
// for c in &chromosome_vec{
// println!("chromsome= {:?}", c);
// }

chromosome_vec
}

pub fn get_seq_reads_bam(chromosome: &mut Chromosome, filepath: &str) {
// read bam seq info into the current Chromosome

// TODO this function requires there to be an associated .bai file in the same directory as the .bam file
// And the error message if it does not exist is not very helpful.

let src = String::from(filepath);
let raw_region = String::from(chromosome.chrom.clone());
//let raw_region = String::from("chr1");

let mut reader = bam::io::indexed_reader::Builder::default().build_from_path(src).unwrap();
let header = reader.read_header().unwrap();

let records: Box<dyn Iterator<Item = io::Result<bam::Record>>> = if raw_region == UNMAPPED {
reader.query_unmapped().map(Box::new).unwrap()
} else {
let region = raw_region.parse().unwrap();
reader.query(&header, &region).map(Box::new).unwrap()
};

// remove the placeholder (0,0 ))
chromosome.starts.remove(0);
chromosome.ends.remove(0);
let default_score = 1;

for result in records {
let record = result.unwrap();
let flags = record.flags();
//TODO Determine position shift via what flags are set
let start_position = record.alignment_start().unwrap().unwrap();
let start = start_position.get();
let end_position = record.alignment_end().unwrap().unwrap();
let end = end_position.get();
chromosome.starts.push((start as i32, default_score));
chromosome.ends.push((end as i32, default_score));

}

chromosome.starts.sort_unstable_by(|a, b| a.0.cmp(&b.0));
chromosome.ends.sort_unstable_by(|a, b| a.0.cmp(&b.0));


println!("Finished reading seq for chrom: {}",chromosome.chrom.clone() );
}
104 changes: 53 additions & 51 deletions gtars/tests/test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@ fn path_to_sorted_small_bed_file() -> &'static str {
"tests/data/test_sorted_small.bed"
}

// #[fixture]
// fn path_to_small_bam_file() -> &'static str {
// "tests/data/test1_sort_dedup.bam"
// }
#[fixture]
fn path_to_small_bam_file() -> &'static str {
"/home/drc/Downloads/bam files for rust test/test1_sort_dedup.bam" //todo change back to relative to test folder
}

#[fixture]
fn path_to_chrom_sizes_file() -> &'static str {
Expand Down Expand Up @@ -73,7 +73,7 @@ mod tests {

use gtars::uniwig::counting::{core_counts, start_end_counts};
use gtars::uniwig::reading::{
parse_bed_file, read_bed_vec, read_chromosome_sizes, read_narrow_peak_vec,
parse_bed_file, read_bed_vec, read_chromosome_sizes, read_narrow_peak_vec,read_bam_header,
};

use gtars::uniwig::writing::write_bw_files;
Expand Down Expand Up @@ -334,53 +334,55 @@ mod tests {
assert_eq!(num_chromosomes, 5);
}

// #[rstest]
// fn test_read_bam_header(path_to_small_bam_file: &str) {
// let chromosomes: Vec<Chromosome> = read_bam_header(path_to_small_bam_file);
// let num_chromosomes = chromosomes.len();
// println!("Number of chroms: {}", num_chromosomes);
// assert_eq!(num_chromosomes, 195);
// }
#[rstest]
fn test_read_bam_header(path_to_small_bam_file: &str) {
let chromosomes: Vec<Chromosome> = read_bam_header(path_to_small_bam_file);
let num_chromosomes = chromosomes.len();
println!("Number of chroms: {}", num_chromosomes);
assert_eq!(num_chromosomes, 195);
}

#[rstest]
fn test_process_bam(path_to_small_bam_file: &str) -> Result<(), Box<(dyn std::error::Error + 'static)>> {
let path_to_crate = env!("CARGO_MANIFEST_DIR");
let chromsizerefpath: String = format!("{}{}", path_to_crate, "/tests/hg38.chrom.sizes");
let chromsizerefpath = chromsizerefpath.as_str();
let combinedbedpath = path_to_small_bam_file;

let tempdir = tempfile::tempdir().unwrap();
let path = PathBuf::from(&tempdir.path());

// For some reason, you cannot chain .as_string() to .unwrap() and must create a new line.
//let bwfileheader_path = path.into_os_string().into_string().unwrap();
//let bwfileheader = bwfileheader_path.as_str();
let bwfileheader = "/home/drc/Downloads/baminput_bwoutput_test_rust/"; //todo change back to non local example


let smoothsize: i32 = 1;
let output_type = "wig";
let filetype = "bam";
let num_threads = 6;
let score = false;
let stepsize = 1;
let zoom = 0;

uniwig_main(
smoothsize,
combinedbedpath,
chromsizerefpath,
bwfileheader,
output_type,
filetype,
num_threads,
score,
stepsize,
zoom,
)
.expect("Uniwig main failed!");

Ok(())
}

// #[rstest]
// fn test_run_uniwig_main_bam_input_wig_output(
// path_to_small_bam_file: &str,
// path_to_chrom_sizes_file: &str,
// ) -> Result<(), Box<(dyn std::error::Error + 'static)>> {
// // This test uses a chrom sizes file and a bam file and will take a long time to run.
// // only run this during dev/troubleshooting, comment out for normal test suite checks
// //let path_to_crate = env!("CARGO_MANIFEST_DIR");
//
// //let tempbedpath = format!("{}{}", path_to_crate, "/tests/data/test5.bed");
// let combinedbedpath = path_to_small_bam_file;
//
// let chromsizerefpath = path_to_chrom_sizes_file;
//
// let tempdir = tempfile::tempdir().unwrap();
// let path = PathBuf::from(&tempdir.path());
//
// // For some reason, you cannot chain .as_string() to .unwrap() and must create a new line.
// let bwfileheader_path = path.into_os_string().into_string().unwrap();
// let bwfileheader = bwfileheader_path.as_str();
//
// let smoothsize: i32 = 5;
// let output_type = "wig";
// let filetype = "bam";
// let num_threads =6;
//
// uniwig_main(
// smoothsize,
// combinedbedpath,
// chromsizerefpath,
// bwfileheader,
// output_type,
// filetype,
// num_threads,
// )
// .expect("Uniwig main failed!");
//
// Ok(())
// }

#[rstest]
fn test_run_uniwig_main_wig_type() -> Result<(), Box<(dyn std::error::Error + 'static)>> {
Expand Down

0 comments on commit b4c2115

Please sign in to comment.