Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add fragment file region scoring matrix creation #38

Open
wants to merge 30 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
7e3fc5b
create fragment file globber
nleroy917 Oct 15, 2024
649e98b
create more models/structs
nleroy917 Oct 15, 2024
99abd44
build trees
nleroy917 Oct 15, 2024
dddd778
implement the find-overlaps function for the consensus set
nleroy917 Oct 15, 2024
cc6b8a3
fix tests, set up scoring
nleroy917 Oct 15, 2024
7b3b01d
create fragment abstraction
nleroy917 Oct 15, 2024
cb48827
finish the loop
nleroy917 Oct 15, 2024
bc73511
safegaurd count matrix scoring
nleroy917 Oct 15, 2024
7be2399
create an iterator over the CountMatrix
nleroy917 Oct 15, 2024
2489572
write to file
nleroy917 Oct 15, 2024
87e7f6e
add whitelist
nleroy917 Oct 16, 2024
9bc1ac7
complete the CLI
nleroy917 Oct 16, 2024
f04e3a3
actualyl add teh cli
nleroy917 Oct 16, 2024
6d1e299
use file + barcode whitelist
nleroy917 Oct 16, 2024
575e96e
better error
nleroy917 Oct 16, 2024
7ab95c6
error handling
nleroy917 Oct 16, 2024
aff0042
count total files
nleroy917 Oct 16, 2024
a6a1a02
add better logging
nleroy917 Oct 16, 2024
653e011
dont chec file
nleroy917 Oct 16, 2024
015170b
ignore whitelist
nleroy917 Oct 16, 2024
eb048d7
combine the Some(olaps)
nleroy917 Oct 16, 2024
551c2a1
average total olaps
nleroy917 Oct 16, 2024
8199594
work on generics
nleroy917 Oct 16, 2024
443829c
overlap counting
nleroy917 Oct 16, 2024
025d1f7
implement two modes, and properly count
nleroy917 Oct 16, 2024
52d3b2d
test data
nleroy917 Oct 17, 2024
32d1ac6
test data
nleroy917 Oct 17, 2024
61cec8f
gzip the region scoring test fragments
nleroy917 Oct 17, 2024
f763b77
work on unit tests
nleroy917 Oct 17, 2024
0a11bd8
Merge branch 'dev' into region_scoring
nleroy917 Oct 22, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions gtars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ noodles = { version = "0.83.0", features = ["bam"] }
bstr = "1.10.0"
rayon = "1.10.0"
indicatif = "0.17.8"
glob = "0.3.1"


[dev-dependencies]
Expand Down
52 changes: 52 additions & 0 deletions gtars/src/common/models/fragments.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
use std::str::FromStr;

use anyhow::Result;

use crate::common::models::Region;

#[allow(unused)]
pub struct Fragment {
pub chr: String,
pub start: u32,
pub end: u32,
pub barcode: String,
pub read_support: u32,
}

impl FromStr for Fragment {
type Err = anyhow::Error;

fn from_str(s: &str) -> Result<Self> {
let parts: Vec<&str> = s.split_whitespace().collect();
// dont check file integrity right now
// if parts.len() != 6 {
// anyhow::bail!(
// "Error parsing fragment file line: {}. Is your fragment file malformed? Found {} parts.",
// s,
// parts.len()
// )
// }

let start = parts[1].parse::<u32>()?;
let end = parts[2].parse::<u32>()?;
let read_support = parts[4].parse::<u32>()?;

Ok(Fragment {
chr: parts[0].to_string(),
start,
end,
barcode: parts[3].to_string(),
read_support,
})
}
}

impl From<Fragment> for Region {
fn from(val: Fragment) -> Self {
Region {
chr: val.chr,
start: val.start,
end: val.end,
}
}
}
2 changes: 2 additions & 0 deletions gtars/src/common/models/mod.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
pub mod fragments;
pub mod region;
pub mod region_set;
pub mod tokenized_region;
pub mod tokenized_regionset;
pub mod universe;

// re-export for cleaner imports
pub use self::fragments::Fragment;
pub use self::region::Region;
pub use self::region_set::RegionSet;
pub use self::tokenized_region::TokenizedRegion;
Expand Down
6 changes: 3 additions & 3 deletions gtars/src/fragsplit/split.rs
Original file line number Diff line number Diff line change
Expand Up @@ -159,17 +159,17 @@ mod tests {

#[fixture]
fn barcode_cluster_map_file() -> &'static str {
"tests/data/scatlas_leiden.csv"
"tests/data/barcode_cluster_map.tsv"
}

#[fixture]
fn path_to_fragment_files() -> &'static str {
"tests/data/fragments-test"
"tests/data/fragments/fragsplit"
}

#[fixture]
fn path_to_output() -> &'static str {
"tests/data/out-test"
"tests/data/out"
}

#[fixture]
Expand Down
1 change: 1 addition & 0 deletions gtars/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,5 +38,6 @@ pub mod common;
pub mod fragsplit;
pub mod igd;
pub mod io;
pub mod scoring;
pub mod tokenizers;
pub mod uniwig;
5 changes: 5 additions & 0 deletions gtars/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use clap::Command;
// go through the library crate to get the interfaces
use gtars::fragsplit;
use gtars::igd;
use gtars::scoring;
use gtars::tokenizers;
use gtars::uniwig;

Expand All @@ -25,6 +26,7 @@ fn build_parser() -> Command {
.subcommand(fragsplit::cli::make_fragsplit_cli())
.subcommand(uniwig::cli::create_uniwig_cli())
.subcommand(igd::cli::create_igd_cli())
.subcommand(scoring::cli::make_fscoring_cli())
}

fn main() -> Result<()> {
Expand All @@ -50,6 +52,9 @@ fn main() -> Result<()> {
}
_ => unreachable!("IGD Subcommand not found"),
},
Some((scoring::consts::FSCORING_CMD, matches)) => {
scoring::cli::handlers::region_fragment_scoring(matches)?;
}
_ => unreachable!("Subcommand not found"),
};

Expand Down
85 changes: 85 additions & 0 deletions gtars/src/scoring/cli.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
use std::collections::HashSet;
use std::io::BufRead;
use std::path::PathBuf;

use anyhow::Result;
use clap::{arg, Arg, ArgMatches, Command};

use super::*;
use crate::scoring::{region_scoring_from_fragments, ConsensusSet, FragmentFileGlob};

pub fn make_fscoring_cli() -> Command {
Command::new(consts::FSCORING_CMD)
.author("Nathan LeRoy")
.about("Create a scoring matrix for a set of fragment files over a consensus peak set.")
.arg(Arg::new("fragments"))
.arg(Arg::new("consensus"))
.arg(arg!(--mode <mode>))
.arg(arg!(--output <output>))
.arg(arg!(--whitelist <whitelist>))
Comment on lines +17 to +19
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we add more info here? Like is it required/help/etc?

}

pub mod handlers {

use std::str::FromStr;

use consts::DEFAULT_SCORING_MODE;

use crate::common::utils::get_dynamic_reader;

use super::*;

pub fn region_fragment_scoring(matches: &ArgMatches) -> Result<()> {
// get arguments from CLI
let fragments = matches
.get_one::<String>("fragments")
.expect("A path to fragment files is required.");

let consensus = matches
.get_one::<String>("consensus")
.expect("A path to a mapping file is required.");

let default_out = consts::DEFAULT_OUT.to_string();
let output = matches.get_one::<String>("output").unwrap_or(&default_out);
let mode = match matches.get_one::<String>("mode") {
Some(mode) => ScoringMode::from_str(mode),
None => Ok(DEFAULT_SCORING_MODE),
};
let mode = mode.unwrap_or(DEFAULT_SCORING_MODE);
Comment on lines +44 to +48
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These lines seem redundant. Do you actually need line 48?


let whitelist = matches.get_one::<String>("whitelist");

// coerce arguments to types
let mut fragments = FragmentFileGlob::new(fragments)?;
let consensus = PathBuf::from(consensus);
let consensus = ConsensusSet::new(consensus)?;

let whitelist = match whitelist {
Some(whitelist) => {
// open whitelist and read to HashSet<String>
let whitelist = PathBuf::from(whitelist);
let reader = get_dynamic_reader(&whitelist)?;
let mut whitelist: HashSet<String> = HashSet::new();
for line in reader.lines() {
let line = line?;
if !whitelist.contains(&line) {
whitelist.insert(line);
}
}
Some(whitelist)
}
None => None,
};

let count_mat = region_scoring_from_fragments(
&mut fragments,
&consensus,
whitelist.as_ref(),
mode,
)?;

count_mat.write_to_file(output)?;

Ok(())
}
}
7 changes: 7 additions & 0 deletions gtars/src/scoring/consts.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
use crate::scoring::ScoringMode;

pub const FSCORING_CMD: &str = "fscoring";
pub const DEFAULT_OUT: &str = "fscoring.csv.gz";
pub const DEFAULT_SCORING_MODE: ScoringMode = ScoringMode::Atac;
pub const START_SHIFT: i8 = 4;
pub const END_SHIFT: i8 = 5;
105 changes: 105 additions & 0 deletions gtars/src/scoring/counts.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
use std::fs::File;
use std::io::{BufWriter, Write};
use std::ops::{Add, AddAssign};

use anyhow::Result;
use flate2::write::GzEncoder;
use flate2::Compression;

pub struct CountMatrix<T> {
data: Vec<T>,
pub rows: usize,
pub cols: usize,
}

pub struct RowIterator<'a, T> {
matrix: &'a CountMatrix<T>,
current_row: usize,
}

impl<T> CountMatrix<T>
where
T: Copy + Default + Add<Output = T> + AddAssign + From<u8>,
{
pub fn new(rows: usize, cols: usize) -> Self {
Self {
data: vec![T::default(); rows * cols],
rows,
cols,
}
}

pub fn get(&self, row: usize, col: usize) -> Option<&T> {
self.data.get(row * self.cols + col)
}

pub fn set(&mut self, row: usize, col: usize, value: T) -> Result<(), String> {
if row < self.rows && col < self.cols {
self.data[row * self.cols + col] = value;
Ok(())
} else {
Err(format!("Index out of bounds: row {}, col {}", row, col))
}
}

pub fn increment(&mut self, row: usize, col: usize) {
if row < self.rows && col < self.cols {
let index = row * self.cols + col;
if let Some(value) = self.data.get_mut(index) {
*value += 1.into();
}
}
}
}

impl<'a, T> Iterator for RowIterator<'a, T>
where
T: Copy + Default,
{
type Item = &'a [T];

fn next(&mut self) -> Option<Self::Item> {
if self.current_row < self.matrix.rows {
let start = self.current_row * self.matrix.cols;
let end = start + self.matrix.cols;
self.current_row += 1;
Some(&self.matrix.data[start..end])
} else {
None
}
}
}

impl<T> CountMatrix<T>
where
T: Copy + Default,
{
pub fn iter_rows(&self) -> RowIterator<T> {
RowIterator {
matrix: self,
current_row: 0,
}
}
}

impl<T> CountMatrix<T>
where
T: Copy + Default + ToString,
{
pub fn write_to_file(&self, filename: &str) -> std::io::Result<()> {
let file = File::create(filename)?;
let mut buf_writer = BufWriter::new(GzEncoder::new(file, Compression::default()));

for row in self.iter_rows() {
let row_str: String = row
.iter()
.map(|v| v.to_string())
.collect::<Vec<String>>()
.join(",");
buf_writer.write_all(row_str.as_bytes())?;
buf_writer.write_all(b"\n")?; // Add a newline after each row
}

Ok(())
}
}
Loading