Skip to content

Commit

Permalink
Merge pull request #151 from NCI-RBL/dev
Browse files Browse the repository at this point in the history
complete feature branch merge
  • Loading branch information
slsevilla authored May 25, 2023
2 parents b89dae8 + 4fb9392 commit 904e96c
Show file tree
Hide file tree
Showing 7 changed files with 164 additions and 97 deletions.
162 changes: 87 additions & 75 deletions .tests/cluster_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,125 +2,137 @@
__default__:
gres: lscratch:96
mem: 40g
partition: norm
time: 00-02:00:00
partition: ccr,norm
time: 00-08:00:00
threads: 32
output: .%j.{wildcards}.out
error: .%j.{wildcards}.err

qc_barcode:
threads: 3
mem: 3g
threads: 8
mem: 75g
time: 00-04:00:00

demultiplex:
threads: 3
mem: 3g
time: 04-00:00:00
threads: 56
mem: 32g
gres: lscratch:800
time: 00-05:00:00

remove_adaptors:
threads: 3
time: 1-00:00:00
mem: 3g

qc_fastq_pre:
threads: 3
mem: 3g
time: 00-03:00:00
nondemux:
time: 00-01:00:00

qc_fastq_post:
threads: 3
qc_fastq:
threads: 4
mem: 3g
time: 00-03:00:00

qc_screen_validator:
mem: 15g
mem: 32g
time: 00-03:00:00

split_files:
threads: 3
mem: 3g
time: 00-03:00:00

novoalign:
mem: 50g
time: 10-00:00:00

cleanup_conversion:
threads: 5
mem: 30g
time: 00-3:00:00

merge_unmapped_splits:
time: 01-00:00:00
mem: 75g

create_bam_mm_unique:
threads: 6
gres: lscratch:256
mem: 30g
star:
time: 04-00:00:00

merge_splits_unique_mm:
mem: 512g
time: 02-06:00:00
partition: largemem

merge_mm_and_unique:
threads: 2
gres: lscratch:256
mem: 5g
time: 02-00:00:00

qc_alignment:
mem: 10g
gres: lscratch:800
threads: 16
mem: 120g

index_stats:
threads: 8
gres: lscratch:800
mem: 200g
time: 01-00:00:00

qc_troubleshoot:
threads: 3
threads: 4
mem: 3g

dedup:
threads: 2
mem: 64g
threads: 8
mem: 200g
gres: lscratch:256
time: 01-00:00:00
time: 02-00:00:00

create_beds_safs:
mem: 350g
gres: lscratch:256
mem: 200g
gres: lscratch:512
threads: 8

bgzip_beds:
mem: 100g
threads: 4
partition: largemem

feature_counts:
threads: 8
mem: 200g

project_annotations:
threads: 2
mem: 10g
time: 00-01:00:00

peak_annotations:
threads: 3
peak_junctions:
threads: 10
gres: lscratch:128
mem: 36g
time: 04-00:00:00

peak_Transcripts:
threads: 4
gres: lscratch:128
mem: 30g
time: 04-00:00:00

peak_ExonIntron:
threads: 4
gres: lscratch:128
mem: 30g
time: 00-12:00:00
time: 04-00:00:00

peak_RMSK:
threads: 4
gres: lscratch:128
mem: 30g
time: 04-00:00:00

annotation_report:
mem: 10g
threads: 4
gres: lscratch:128
mem: 30g
time: 00-12:00:00

MANORM_beds:
threads: 4
mem: 30g

MANORM_analysis:
threads: 4
mem: 30g
time: 04-00:00:00

MANORM_post_processing:
threads: 2
mem: 2g
time: 00-01:00:00
mem: 30g
time: 00-12:00:00

MANORM_RMD:
threads: 2
mem: 3g
time: 00-01:00:00
mem: 30g
time: 00-02:00:00

mapq_recalc:
mem: 1TB
gres: lscratch:256
partition: largemem
time: 00-06:00:00
DIFFBIND_beds:
threads: 4
mem: 30g

DIFFBIND_preprocess:
threads: 4
mem: 30g

DIFFBIND_analysis:
threads: 4
mem: 30g


DIFFBIND_report:
threads: 4
mem: 30g
4 changes: 2 additions & 2 deletions .tests/multiplex_hg38_full.tsv
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
file_name multiplex
test_6.fastq.gz test_6
file_name,multiplex
test_6.fastq.gz,test_6
6 changes: 3 additions & 3 deletions .tests/sample_hg38_full.tsv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
multiplex sample group barcode adaptor
test_6 Ro_Clip CLIP NNNNNCACTGTNNNN AGATCGGAAGAGCGTCGTG
test_6 Control_Clip CNTRL NNNNNATTGGCNNNN AGATCGGAAGAGCGTCGTG
multiplex,sample,group,barcode,adaptor
test_6,Ro_Clip,CLIP,NNNNNCACTGTNNNN,AGATCGGAAGAGCGTCGTG
test_6,Control_Clip,CNTRL,NNNNNATTGGCNNNN,AGATCGGAAGAGCGTCGTG
82 changes: 66 additions & 16 deletions .tests/snakemake_config.yaml
Original file line number Diff line number Diff line change
@@ -1,52 +1,102 @@
#########################################################################################
# Global configuration file for the pipeline
#########################################################################################

#########################################################################################
#Folders and Paths
#########################################################################################
#path to snakemake file
sourceDir: ""

#path to output directory
outputDir: "hg38_full/"

#path to fastq files
fastqDir: ".tests/"
#paths to manifest files (the other test inputs in this config are referenced under .tests/)
sampleManifest: ".tests/sample_hg38_full.tsv"
multiplexManifest: ".tests/multiplex_hg38_full.tsv"
#NOTE(review): previously ".test/contrasts_example.tsv"; aligned with the .tests/ directory used by every other path here - confirm the file exists at this location
contrastManifest: ".tests/contrasts_example.tsv"

#path to fastq files
fastqDir: ".tests/"

########################################################################################
#user parameters
filterlength: 20 #minimum read length to include in analysis [any int >20]
#########################################################################################
multiplexflag: "Y" #flag that samples are multiplexed ["Y","N"]
umiSeparator: "rbc:" #required for nondemultiplexed samples to determine delimiter for deduplication [":", "_", "rbc:"]
mismatch: 1 #number of bp mismatches allowed in demultiplexing [1,2,3]
barcode_qc_flag: "PROCESS" #barcodes will undergo QC to ensure uniformity within samples; ["PROCESS", "IGNORE"]
min_reads_mapped: 0.5 #minimum percent of reads that should be mapped; IE .5 for 50% of all reads must be mapped [0.5]
reference: "hg38" #reference organism ["mm10", "hg38"]
spliceaware: "N" #whether to run splice_aware part of the pipeline ['y', 'n']
filterlength: 20 #minimum read length to include in analysis [any int >20]
phredQuality: 20 #minimum quality score for 3’ end trimming
includerRNA: "N" #include refseq rRNA's in annotations ["Y", "N"]
spliceBPlength: 75 #length of splice index to use [50, 75, 150]
splicejunction: "N" #include splice junctions in peak calls: "manorm"
condenseexon: "N" #whether to collapse exons
AnnoAnchor: "max_total" #whether annotations for spliced peaks will be based on either 5' most region or region with max reads ["max","5prime"]
mincount: 3 #minimum number of matches to count as a peak [1,2,3]
ntmerge: 50 #minimum distance of nucleotides to merge peaks [10,20,30,40,50,60]
peakid: "ALL" #report peaks for unique peaks only or unique and fractional mm ["unique","all"]
DEmethod: "none" #choose DE method ["manorm","none"]
MANormWidth: 50 #Width of window to calculate read density. [any integer >1; default 50]
MNormDistance: 25 #Summit-to-summit distance cutoff for common peaks. [ any integer >1; default MANormWidth/2]
sampleoverlap: 1 #if DEmethod DIFFBIND, minimum number of samples a peak must be found in to be counted [>1]
pval: 0.005 #if DEmethod, pval cutoff for significance
fc: 1 #if DEmethod, fold change cut off for significance
single_qc_threshold: 95 #maximum threshold for unmapped reads in any single sample
project_qc_threshold: 50 #maximum threshold for unmapped reads across average of all project samples

#########################################################################################
# STAR parameters
#########################################################################################
alignEndsType: "Local" #type of read ends alignment ["Local", "EndToEnd", "Extend5pOfRead1", "Extend5pOfReads12"]
alignIntronMax: 50000 #maximum intron length
alignSJDBoverhangMin: 3 # minimum overhang value for annotated spliced junctions
alignSJoverhangMin: 5 # minimum overhang value for non-canonical spliced junctions
alignTranscriptsPerReadNmax: 10000 #max number of different alignments per read to consider [int>0]
alignWindowsPerReadNmax: 10000 #max number of windows per read [int>0]
limitOutSJcollapsed: 1000000 # max number of collapsed junctions [int>0]
outFilterMatchNmin: 15 # alignment will be output only if the number of matched bases is higher than or equal to this value.
outFilterMatchNminOverLread: 0.9 #alignment will be output only if the number of matched bases is >= to value; normalized to sum of mates’ lengths for paired-end reads
outFilterMismatchNmax: 999 #alignment will be output only if it has no more mismatches than this value.
outFilterMismatchNoverReadLmax: 0.04 #alignment will be output only if its ratio of mismatches to *read* length is less than or equal to this value.
outFilterMultimapNmax: 10000 #max number of multiple alignments allowed for a read: if exceeded, the read is considered unmapped
outFilterMultimapScoreRange: 0 #the score range below the maximum score for multimapping alignments
outFilterScoreMin: 0 #alignment will be output only if its score is higher than or equal to this value.
outFilterType: "Normal" #type of filtering ["Normal", "BySJout"]
outSAMattributes: "All" #a string of desired SAM attributes, in the order desired for the output SAM
outSAMunmapped: "None" #output of unmapped reads in the SAM format ["None", "Within"]
outSJfilterCountTotalMin: "3,1,1,1" #minimum total (multi-mapping+unique) read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif
outSJfilterOverhangMin: "30,12,12,12" #minimum overhang length for splice junctions on both sides for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif
outSJfilterReads: "All" #which reads to consider for collapsed splice junctions output ["All", "Unique"]
seedMultimapNmax: 10000 #only pieces that map fewer than this value are utilized in the stitching procedure [int>0]
seedNoneLociPerWindow: 20 #max number of one seed loci per window [int>0]
seedPerReadNmax: 10000 #max number of seeds per read
seedPerWindowNmax: 500 #max number of seeds per window
sjdbScore: 2 #extra alignment score for alignments that cross database junctions
winAnchorMultimapNmax: 500 #max number of loci anchors are allowed to map to


#########################################################################################
# modules, container parameters
#########################################################################################
#modules, container parameters
containerDir: "/data/CCBR_Pipeliner/iCLIP/container"
fastq_val: "/data/CCBR_Pipeliner/db/PipeDB/bin/fastQValidator"

bedtools: "bedtools/2.29.2"
bowtie2: "bowtie/2-2.3.4"
fastq_screen: "fastq_screen/0.14.0"
fastqc: "fastqc/0.11.9"
java: "java/12.0.1"
manorm: "manorm/1.1.4"
multiqc: "multiqc/1.9"
novocraft: "novocraft/4.03.01"
perl: "perl/5.24.3"
python: "python/3.7"
Qt: "Qt/5.13.2"
singularity: "singularity"
python: "python/3.8"
R: "R/4.0"
samtools: "samtools/1.11"
umitools: "umitools/1.1.1"
star: "STAR/2.7.8a"
subread: "subread/2.0.1"
R: "R/4.0"
ultraplex: "ultraplex/1.2.5"
umitools: "umitools/1.1.1"

#########################################################################################
# dev
#########################################################################################
#testing parameter
testing_option: "N"
1 change: 1 addition & 0 deletions config/snakemake_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ seedPerReadNmax: 10000 #max number of seeds per read
seedPerWindowNmax: 500 #max number of seeds per window
sjdbScore: 2 #extra alignment score for alignments that cross database junctions
winAnchorMultimapNmax: 500 #max number of loci anchors are allowed to map to
quantmod: 'TranscriptomeSAM' #additional alignment on transcriptome

#########################################################################################
# modules, container parameters
Expand Down
5 changes: 4 additions & 1 deletion workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ star_seed_read = config['seedPerReadNmax']
star_seed_wind = config['seedPerWindowNmax']
star_sj = config['sjdbScore']
star_win_anchor = config['winAnchorMultimapNmax']
star_quantmod = config['quantmod']

# modules, container
cont_dir = config['containerDir']
Expand Down Expand Up @@ -771,6 +772,7 @@ rule star:
s_wind = star_seed_wind,
s_sj = star_sj,
s_anchor = star_win_anchor,
s_quantmod = star_quantmod,
out_prefix = '{sp}_'
envmodules:
config['star'],
Expand Down Expand Up @@ -824,7 +826,8 @@ rule star:
--seedPerReadNmax {params.s_read} \
--seedPerWindowNmax {params.s_wind} \
--sjdbScore {params.s_sj} \
--winAnchorMultimapNmax {params.s_anchor}
--winAnchorMultimapNmax {params.s_anchor} \
--quantMode {params.s_quantmod}
# sort file
samtools sort -m 80G -T $tmp_dir $tmp_dir/{params.out_prefix}Aligned.out.bam -o $tmp_dir/{params.out_prefix}Aligned.sortedByCoord.out.bam
Expand Down
1 change: 1 addition & 0 deletions workflow/scripts/02_barcode_qc.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ barcode_input = args$barcode_input
output_dir = args$output_dir
mismatch = as.integer(args$mismatch)
mpid = args$mpid
qc_dir = args$qc_dir

#test input
testing="N"
Expand Down

0 comments on commit 904e96c

Please sign in to comment.