From ae604418a2c70cd80f4d0b6a2f09dd9a9cbff067 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Mon, 29 Aug 2022 09:50:14 -0400 Subject: [PATCH 01/19] new warning when samples do not produce high enough overlaps --- workflow/scripts/07_MAnorm_Process.R | 86 ++++++++++++++++++++++++---- 1 file changed, 74 insertions(+), 12 deletions(-) diff --git a/workflow/scripts/07_MAnorm_Process.R b/workflow/scripts/07_MAnorm_Process.R index 013360b..a80bc2a 100644 --- a/workflow/scripts/07_MAnorm_Process.R +++ b/workflow/scripts/07_MAnorm_Process.R @@ -6,6 +6,7 @@ suppressMessages(library(dplyr)) suppressMessages(library(data.table)) suppressMessages(library("readxl")) suppressMessages(library(argparse)) +suppressMessages(library(stringr)) #set args parser <- ArgumentParser() @@ -43,18 +44,30 @@ background_countsMM=args$background_countsMM # background_countsMM="/Users/homanpj/OneDrive - National Institutes of Health/Loaner/Wolin/CLIP/mES_fclip_1_YL_012122/05_demethod_PH/02_analysis/YKO_fclip_vs_FLAG_Ro_fclip/counts/FLAG_Ro_fclip_allFracMMCounts_YKO_fclip_Peaks.txt" # background_countsMMall="/Users/homanpj/OneDrive - National Institutes of Health/Loaner/Wolin/CLIP/mES_fclip_1_YL_012122/05_demethod_PH/02_analysis/YKO_fclip_vs_FLAG_Ro_fclip/counts/FLAG_Ro_fclip_allFracMMallCounts_YKO_fclip_Peaks.txt" -# #testing -# setwd("/Users/homanpj/OneDrive - National Institutes of Health/Loaner/Wolin/CLIP/mES_fclip_1_YL_012122/05_demethod_PH") -# samplename = "FLAG_Ro_fclip_wd" -# background = "YKO_fclip_wd" -# peak_anno_g1 = "/Users/homanpj/OneDrive - National Institutes of Health/Loaner/Wolin/CLIP/mES_fclip_1_YL_012122/05_demethod_PH/04_annotation/FLAG_Ro_fclip_annotation_final_table.txt" -# peak_anno_g2 = "/Users/homanpj/OneDrive - National Institutes of Health/Loaner/Wolin/CLIP/mES_fclip_1_YL_012122/05_demethod_PH/04_annotation/YKO_fclip_annotation_final_table.txt" -# pos_manorm = "/Users/homanpj/OneDrive - National Institutes of Health/Loaner/Wolin/CLIP/mES_fclip_1_YL_012122/05_demethod_PH/02_analysis/YKO_fclip_vs_FLAG_Ro_fclip/YKO_fclip_vs_FLAG_Ro_fclip_P/FLAG_Ro_fclip_wd_MAvalues.xls" -# neg_manorm = "/Users/homanpj/OneDrive - National Institutes of Health/Loaner/Wolin/CLIP/mES_fclip_1_YL_012122/05_demethod_PH/02_analysis/YKO_fclip_vs_FLAG_Ro_fclip/YKO_fclip_vs_FLAG_Ro_fclip_N/FLAG_Ro_fclip_wd_MAvalues.xls" -# output_file = "/Users/homanpj/OneDrive - National Institutes of Health/Loaner/Wolin/CLIP/mES_fclip_1_YL_012122/05_demethod_PH/02_analysis/YKO_fclip_vs_FLAG_Ro_fclip/FLAG_Ro_fclip_wd_vs_YKO_fclip_wd_post_processingTest.txt" -# background_countsUnq="/Users/homanpj/OneDrive - National Institutes of Health/Loaner/Wolin/CLIP/mES_fclip_1_YL_012122/05_demethod_PH/02_analysis/YKO_fclip_vs_FLAG_Ro_fclip/counts/YKO_fclip_allUnique_FLAG_Ro_fclip_Peaks.txt" -# background_countsMM="/Users/homanpj/OneDrive - National Institutes of Health/Loaner/Wolin/CLIP/mES_fclip_1_YL_012122/05_demethod_PH/02_analysis/YKO_fclip_vs_FLAG_Ro_fclip/counts/YKO_fclip_allFracMMCounts_FLAG_Ro_fclip_Peaks.txt" -# background_countsMMall="/Users/homanpj/OneDrive - National Institutes of Health/Loaner/Wolin/CLIP/mES_fclip_1_YL_012122/05_demethod_PH/02_analysis/YKO_fclip_vs_FLAG_Ro_fclip/counts/FLAG_Ro_fclip_allFracMMallCounts_YKO_fclip_Peaks.txt" + +##testing +#setwd("/Volumes/RBL_NCI/Wolin/8-09-21-HaCaT_fCLIP_v2.0_2/05_demethod") +# samplename = "Y5KO_fCLIP" +# background = "WT_fCLIP" +# peak_anno_g1 = "/Volumes/RBL_NCI/Wolin/8-09-21-HaCaT_fCLIP_v2.0_2/04_annotation/Y5KO_fCLIP_annotation_ALLreadPeaks_final_table.txt" +# peak_anno_g2 = "/Volumes/RBL_NCI/Wolin/8-09-21-HaCaT_fCLIP_v2.0_2/04_annotation/WT_fCLIP_annotation_ALLreadPeaks_final_table.txt" +# pos_manorm = "/Volumes/RBL_NCI/Wolin/8-09-21-HaCaT_fCLIP_v2.0/05_demethod/02_analysis/WT_fCLIP_vs_Y5KO_fCLIP/WT_fCLIP_vs_Y5KO_fCLIP_P/WT_fCLIP_MAvalues.xls" +# neg_manorm = "/Volumes/RBL_NCI/Wolin/8-09-21-HaCaT_fCLIP_v2.0/05_demethod/02_analysis/WT_fCLIP_vs_Y5KO_fCLIP/WT_fCLIP_vs_Y5KO_fCLIP_N/WT_fCLIP_MAvalues.xls" +# output_file = "/Volumes/RBL_NCI/Wolin/8-09-21-HaCaT_fCLIP_v2.0/05_demethod/02_analysis/WT_fCLIP_vs_Y5KO_fCLIP/WT_fCLIP_vs_Y5KO_fCLIP_post_processing.txt" +# background_countsUnq="/Volumes/RBL_NCI/Wolin/8-09-21-HaCaT_fCLIP_v2.0/05_demethod/02_analysis/WT_fCLIP_vs_Y5KO_fCLIP/counts/Y5KO_fCLIP_allUnique_WT_fCLIP_Peaks.txt" +# background_countsMM="/Volumes/RBL_NCI/Wolin/8-09-21-HaCaT_fCLIP_v2.0/05_demethod/02_analysis/WT_fCLIP_vs_Y5KO_fCLIP/counts/counts/Y5KO_fCLIP_allFracMMCounts_WT_fCLIP_Peaks.txt" +# background_countsMMall="/Volumes/RBL_NCI/Wolin/8-09-21-HaCaT_fCLIP_v2.0/05_demethod/02_analysis/WT_fCLIP_vs_Y5KO_fCLIP/counts/Y5KO_fCLIP_allFracMMallCounts_WT_fCLIP_Peaks.txt" + +# samplename = "Y5KO_fCLIP" +# background = "KO_fCLIP" +# peak_anno_g1 = "/Volumes/RBL_NCI/Wolin/8-09-21-HaCaT_fCLIP_v2.0_2/04_annotation/Y5KO_fCLIP_annotation_ALLreadPeaks_final_table.txt" +# peak_anno_g2 = "/Volumes/RBL_NCI/Wolin/8-09-21-HaCaT_fCLIP_v2.0_2/04_annotation/KO_fCLIP_annotation_ALLreadPeaks_final_table.txt" +# pos_manorm = "/Volumes/RBL_NCI/Wolin/8-09-21-HaCaT_fCLIP_v2.0/05_demethod/02_analysis/Y5KO_fCLIP_vs_KO_fCLIP/Y5KO_fCLIP_vs_KO_fCLIP_P/KO_fCLIP_MAvalues.xls" +# neg_manorm = "/Volumes/RBL_NCI/Wolin/8-09-21-HaCaT_fCLIP_v2.0/05_demethod/02_analysis/Y5KO_fCLIP_vs_KO_fCLIP/Y5KO_fCLIP_vs_KO_fCLIP_N/KO_fCLIP_MAvalues.xls" +# output_file = "/Volumes/RBL_NCI/Wolin/8-09-21-HaCaT_fCLIP_v2.0/05_demethod/02_analysis/Y5KO_fCLIP_vs_KO_fCLIP/Y5KO_fCLIP_vs_KO_fCLIP_post_processing.txt" +# background_countsUnq="/Volumes/RBL_NCI/Wolin/8-09-21-HaCaT_fCLIP_v2.0/05_demethod/02_analysis/Y5KO_fCLIP_vs_KO_fCLIP/counts/Y5KO_fCLIP_allUnique_KO_fCLIP_Peaks.txt" +# background_countsMM="/Volumes/RBL_NCI/Wolin/8-09-21-HaCaT_fCLIP_v2.0/05_demethod/02_analysis/Y5KO_fCLIP_vs_KO_fCLIP/counts/counts/Y5KO_fCLIP_allFracMMCounts_KO_fCLIP_Peaks.txt" +# background_countsMMall="/Volumes/RBL_NCI/Wolin/8-09-21-HaCaT_fCLIP_v2.0/05_demethod/02_analysis/Y5KO_fCLIP_vs_KO_fCLIP/counts/Y5KO_fCLIP_allFracMMallCounts_KO_fCLIP_Peaks.txt" ############################################################################################################ # Sample1 Processing @@ -95,6 +108,10 @@ SmplB_Peaks.GR <- GRanges(seqnames = as.character(SmplB_Peaks$chr), ranges=IRang ############################################################################################################ #Input positive and negative +if(file.exists(neg_manorm)&file.exists(pos_manorm)){ + + + Neg_MAval=fread(neg_manorm, header=T, sep="\t",stringsAsFactors = F,data.table=F) colnames(Neg_MAval)=gsub("_Neg","",colnames(Neg_MAval)) Neg_MAval$strand="-" @@ -113,6 +130,51 @@ MAval$Peak_Group=gsub("_Neg","",MAval$Peak_Group) MAval$Peak_Group=gsub("_Pos","",MAval$Peak_Group) colnames(MAval)=gsub('_50nt_peakDepth5',"",colnames(MAval)) + +}else{ + + +inPATH=str_split(neg_manorm,'02_analysis')[[1]][1] +outPATH=str_split(neg_manorm,'/')[[1]]%>%grep(pattern=".xls$",value = T,invert = T)%>%paste(collapse = '/')%>%gsub(pattern = '.{2}$', replacement = '') +writePATH=str_split(neg_manorm,'/')[[1]]%>%grep(pattern=".xls$",value = T,invert = T) + writePATH=writePATH[1:(length(writePATH)-1)]%>%paste(collapse = '/') +errPATH=str_split(outPATH,'/')[[1]] + errPATH=paste(errPATH[grep('05_demethod',errPATH):length(errPATH)],collapse = '/') + +background_Negfile=list.files(paste0(inPATH,'01_input'))%>%grep(pattern=paste0('^',background,'_'),value=T)%>%grep(pattern = 'Peaks_forMANORM',value=T)%>%grep(pattern = '_N',value=T) +background_Posfile=list.files(paste0(inPATH,'01_input'))%>%grep(pattern=paste0('^',background,'_'),value=T)%>%grep(pattern = 'Peaks_forMANORM',value=T)%>%grep(pattern = '_P',value=T) +samplename_Negfile=list.files(paste0(inPATH,'01_input'))%>%grep(pattern=paste0('^',samplename,'_'),value=T)%>%grep(pattern = 'Peaks_forMANORM',value=T)%>%grep(pattern = '_N',value=T) +samplename_Posfile=list.files(paste0(inPATH,'01_input'))%>%grep(pattern=paste0('^',samplename,'_'),value=T)%>%grep(pattern = 'Peaks_forMANORM',value=T)%>%grep(pattern = '_P',value=T) + + + +system(paste0('bedtools intersect -a ',inPATH,'01_input/',samplename_Negfile,' -b ',inPATH,'01_input/',background_Negfile,' -wo > ',outPATH,'_N/PeakOverlap_MAnormERR.txt')) +system(paste0('bedtools intersect -a ',inPATH,'01_input/',samplename_Posfile,' -b ',inPATH,'01_input/',background_Posfile,' -wo > ',outPATH,'_P/PeakOverlap_MAnormERR.txt')) + +OL_P=fread(paste0(outPATH,'_P/PeakOverlap_MAnormERR.txt'), header=T, sep="\t",stringsAsFactors = F,data.table=F) +OL_P=nrow(OL_P) +OL_N=fread(paste0(outPATH,'_N/PeakOverlap_MAnormERR.txt'), header=T, sep="\t",stringsAsFactors = F,data.table=F) +OL_N=nrow(OL_N) + +if(OL_P>0|OL_N>0){ + + error=paste0("MAnorm did not generate output table. ","\n","MAnorm Normalization Error","\n","Overlapping Peaks found in ","\n",errPATH,"_P/PeakOverlap_MAnormERR.txt","\n",errPATH,"_N/PeakOverlap_MAnormERR.txt") + + write.table(error,file=paste0(writePATH,'/MAnorm_Norm_ERR.txt'), sep = "\t", row.names = FALSE, col.names = T, append = F, quote= FALSE,na = "") + + +}else{ + + error=paste0("MAnorm did not generate output table. ","\n","Overlaps not found between","\n",samplename," and ",background," Peaks.","\n"," ","\n"," ") + + write.table(error,file=paste0(writePATH,'/MAnorm_noOL_ERR.txt'), sep = "\t", row.names = FALSE, col.names = T, append = F, quote= FALSE,na = "") + +} + + # cat(error) + + +} ############################################################################################################ ## for spliced peaks select stats from peak with greatest read density SmplA_Peaks=merge(SmplA_Peaks,MAval[,c("ID",'P_value',paste0('normalized_read_density_in_',samplename), paste0('normalized_read_density_in_',background))],by='ID',all.x=T) From 9fcf35504692d1d2858938129c89970d6389b52e Mon Sep 17 00:00:00 2001 From: slsevilla Date: Mon, 29 Aug 2022 11:14:14 -0400 Subject: [PATCH 02/19] add comments to all star parameters --- config/snakemake_config.yaml | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/config/snakemake_config.yaml b/config/snakemake_config.yaml index b5c92cb..7ad5bad 100644 --- a/config/snakemake_config.yaml +++ b/config/snakemake_config.yaml @@ -41,31 +41,31 @@ fc: 1 #if DEmethod DIFFBIND or MANNORM, fold change cut off for significance ######################################################################################### # STAR parameters ######################################################################################### -alignEndsType: "Local" -alignIntronMax: 50000 +alignEndsType: "Local" #type of read ends alignment ["Local", "EndToEnd", "Extend5pOfRead1", "Extend5pOfReads12"] +alignIntronMax: 50000 #maximum intron length alignSJDBoverhangMin: 3 # minimum overhang value for annotated spliced junctions alignSJoverhangMin: 5 # minimum overhang value for non-cannonical splied junctions -alignTranscriptsPerReadNmax: 10000 -alignWindowsPerReadNmax: 10000 +alignTranscriptsPerReadNmax: 10000 #max number of different alignments per read to consider [int>0] +alignWindowsPerReadNmax: 10000 #max number of windows per read [int>0] outFilterMatchNmin: 15 # alignment will be output only if the number of matched bases is higher than or equal to this value. -outFilterMatchNminOverLread: 0.9 -outFilterMismatchNmax: 999 -outFilterMismatchNoverReadLmax: 0.04 -outFilterMultimapNmax: 10000 -outFilterMultimapScoreRange: 0 -outFilterScoreMin: 0 -outFilterType: "Normal" -outSAMattributes: "All" -outSAMunmapped: "None" -outSJfilterCountTotalMin: "3 1 1 1" -outSJfilterOverhangMin: "30 12 12 12" -outSJfilterReads: "All" -seedMultimapNmax: 10000 -seedNoneLociPerWindow: 20 -seedPerReadNmax: 10000 -seedPerWindowNmax: 500 -sjdbScore: 2 -winAnchorMultimapNmax: 500 +outFilterMatchNminOverLread: 0.9 #alignment will be output only if the number of matched bases is >= to value; normalized to sum of mates’ lengths for paired-end reads +outFilterMismatchNmax: 999 #alignment will be output only if it has no more mismatches than this value. +outFilterMismatchNoverReadLmax: 0.04 #alignment will be output only if its ratio of mismatches to *read* length is less than or equal to this value. +outFilterMultimapNmax: 10000 #max number of multiple alignments allowed for a read: if exceeded, the read is considered unmapped +outFilterMultimapScoreRange: 0 #the score range below the maximum score for multimapping alignments +outFilterScoreMin: 0 #alignment will be output only if its score is higher than or equal to this value. +outFilterType: "Normal" #type of filtering ["Normal", "BySJout"] +outSAMattributes: "All" #a string of desired SAM attributes, in the order desired for the output SAM +outSAMunmapped: "None" #output of unmapped reads in the SAM format ["None", "Within"] +outSJfilterCountTotalMin: "3 1 1 1" #minimum total (multi-mapping+unique) read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif +outSJfilterOverhangMin: "30 12 12 12" #minimum overhang length for splice junctions on both sides for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif +outSJfilterReads: "All" #which reads to consider for collapsed splice junctions output ["All", "Unique"] +seedMultimapNmax: 10000 #only pieces that map fewer than this value are utilized in the stitching procedure [int>0] +seedNoneLociPerWindow: 20 #max number of one seed loci per window [int>0] +seedPerReadNmax: 10000 #max number of seeds per read +seedPerWindowNmax: 500 #max number of seeds per window +sjdbScore: 2 #extra alignment score for alignmets that cross database junctions +winAnchorMultimapNmax: 500 #max number of loci anchors are allowed to map to ######################################################################################### # modules, container parameters From 6a2c2d15aea683989bc3adaa4c0b0c53f0709293 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Thu, 1 Sep 2022 09:48:07 -0400 Subject: [PATCH 03/19] add masterid to jobname --- run_snakemake.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/run_snakemake.sh b/run_snakemake.sh index 798df26..210ed7a 100755 --- a/run_snakemake.sh +++ b/run_snakemake.sh @@ -409,7 +409,7 @@ elif [[ $pipeline = "cluster" ]] || [[ $pipeline = "local" ]]; then cat > ${output_dir}/submit_script.sbatch << EOF #!/bin/bash -#SBATCH --job-name="RBLiCLIP" +#SBATCH --job-name="iCLIP_%j" #SBATCH --mem=40g #SBATCH --gres=lscratch:200 #SBATCH --time=10-00:00:00 @@ -435,7 +435,7 @@ cd \$SLURM_SUBMIT_DIR --cluster \ "sbatch --gres {cluster.gres} --cpus-per-task {cluster.threads} \ -p {cluster.partition} -t {cluster.time} --mem {cluster.mem} \ - --job-name={params.rname} --output=${output_dir}/log/${log_time}/{params.rname}{cluster.output} --mail-type=BEGIN,END,FAIL\ + --job-name={params.rname} --output=${output_dir}/log/${log_time}/{params.rname}{cluster.output} \ --error=${output_dir}/log/${log_time}/{params.rname}{cluster.error}" \ 2>&1|tee ${output_dir}/log/${log_time}/snakemake.log From 8eafd063e189d6f162fd264ea2a30a927480188a Mon Sep 17 00:00:00 2001 From: slsevilla Date: Thu, 1 Sep 2022 09:48:22 -0400 Subject: [PATCH 04/19] add bam limit for sorting --- workflow/Snakefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/workflow/Snakefile b/workflow/Snakefile index 6bacc14..c78eb61 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -58,6 +58,7 @@ pval = config['pval'] fc = config['fc'] # STAR parameters +star_bam_limit = "50297600554" star_align_type = config['alignEndsType'] star_align_intron = config['alignIntronMax'] star_align_sjdb = config['alignSJDBoverhangMin'] @@ -707,6 +708,7 @@ rule star: s_asj = star_align_sj, s_transc = star_align_transc, s_windows = star_align_windows, + s_bam_limit = star_bam_limit, s_match = star_filt_match, s_readmatch = star_filt_readmatch, s_mismatch = star_filt_mismatch, @@ -757,6 +759,7 @@ rule star: --alignSJoverhangMin {params.s_asj} \ --alignTranscriptsPerReadNmax {params.s_transc} \ --alignWindowsPerReadNmax {params.s_windows} \ + --limitBAMsortRAM {params.s_bam_limit} \ --outFilterMatchNmin {params.s_match} \ --outFilterMatchNminOverLread {params.s_readmatch} \ --outFilterMismatchNmax {params.s_mismatch} \ From 068cc67090f9064e48a4b850e038437c5310b1dd Mon Sep 17 00:00:00 2001 From: slsevilla Date: Thu, 1 Sep 2022 09:52:01 -0400 Subject: [PATCH 05/19] fix EOF error --- workflow/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index c78eb61..7fbb89b 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -1258,7 +1258,7 @@ else: total = join(out_dir,'03_peaks','03_counts','{sp}_' + peak_id + 'readpeaks_totalCounts.txt'), anno = rules.project_annotations.output.anno params: - rname = '14_peak_junctions, + rname = '14_peak_junctions', script = join(source_dir,'workflow','scripts','05_Anno_junctions.R'), functions = join(source_dir,'workflow','scripts','05_peak_annotation_functions.R'), bashscript = join(source_dir,'workflow','scripts','05_get_site2peak_lookup.sh'), From 93a7425c1152baa1eaf30eb4067a81b9d99c9a75 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Thu, 1 Sep 2022 09:52:14 -0400 Subject: [PATCH 06/19] update job name --- run_snakemake.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run_snakemake.sh b/run_snakemake.sh index 210ed7a..61a69ea 100755 --- a/run_snakemake.sh +++ b/run_snakemake.sh @@ -409,7 +409,7 @@ elif [[ $pipeline = "cluster" ]] || [[ $pipeline = "local" ]]; then cat > ${output_dir}/submit_script.sbatch << EOF #!/bin/bash -#SBATCH --job-name="iCLIP_%j" +#SBATCH --job-name=iCLIP_%j #SBATCH --mem=40g #SBATCH --gres=lscratch:200 #SBATCH --time=10-00:00:00 From ff24883665214bc7b64c2a6a9333ed4692e852b9 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Tue, 6 Sep 2022 10:05:26 -0400 Subject: [PATCH 07/19] remove reference species default --- config/snakemake_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/snakemake_config.yaml b/config/snakemake_config.yaml index 7ad5bad..a6788a8 100644 --- a/config/snakemake_config.yaml +++ b/config/snakemake_config.yaml @@ -22,7 +22,7 @@ contrastManifest: "OUTPUT_DIR/manifests/contrasts.tsv" multiplexflag: "Y" #whether samples are multiplexed ["Y","N"] umiSeparator: "rbc:" #required for nondemultiplexed samples to determine delimiter for deduplication [":", "_", "rbc:"] mismatch: 1 #required for multiplexed samples, number of bp mismatches allowed in demultiplexing [1,2,3] -reference: "mm10" #reference organism ["hg38","mm10"] +reference: "" #reference organism ["hg38","mm10"] filterlength: 20 #minimum read length to include in analysis [any int >20] phredQuality: 20 #minimum quality score for 3’ end trimming includerRNA: "Y" #whether to include refseq rRNA's in annotations ["Y", "N"] From e74b7bdf3c7a6594424f6606e6af755ad060eba5 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Thu, 15 Sep 2022 12:32:29 -0400 Subject: [PATCH 08/19] split sorting from STAR to samtools --- workflow/Snakefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 7fbb89b..9d04daf 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -744,6 +744,7 @@ rule star: tmp_dir="/lscratch/${{SLURM_JOB_ID}}" export tmp_dir + # STAR cannot handle sorting large files - allow samtools to sort output files STAR \ --runMode alignReads \ --genomeDir {params.s_index} \ @@ -752,7 +753,7 @@ rule star: --readFilesIn {input.f1} \ --outFileNamePrefix $tmp_dir/{params.out_prefix} \ --outReadsUnmapped Fastx \ - --outSAMtype BAM SortedByCoordinate \ + --outSAMtype BAM Unsorted \ --alignEndsType {params.s_atype} \ --alignIntronMax {params.s_intron} \ --alignSJDBoverhangMin {params.s_sjdb} \ @@ -780,6 +781,9 @@ rule star: --sjdbScore {params.s_sj} \ --winAnchorMultimapNmax {params.s_anchor} + # sort file + samtools sort -m 80G -T $tmp_dir $tmp_dir/${params.out_prefix}Aligned.out.bam -o $tmp_dir/${params.out_prefix}Aligned.sortedByCoord.out.bam + # move STAR files and final log file to output mv $tmp_dir/{params.out_prefix}Aligned.sortedByCoord.out.bam {output.bam} mv $tmp_dir/{params.out_prefix}Log.final.out {output.log} From 7c4b7154a25dedadc5f4dcb71643dbb8f830c28e Mon Sep 17 00:00:00 2001 From: slsevilla Date: Thu, 15 Sep 2022 12:32:50 -0400 Subject: [PATCH 09/19] add module samtools --- workflow/Snakefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 9d04daf..b1b97c8 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -730,7 +730,8 @@ rule star: s_anchor = star_win_anchor, out_prefix = '{sp}_' envmodules: - config['star'] + config['star'], + config['samtools'] threads: getthreads("star") output: unmapped = join(out_dir,'01_preprocess','02_alignment','01_unmapped','{sp}.unmapped.out'), From da92c23a108ac5242ec7471a86a19594938cae60 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Fri, 16 Sep 2022 13:21:38 -0400 Subject: [PATCH 10/19] scripts for testing star sorting params --- build/star_align_sort_testing/README.md | 18 +++++++ .../sbatch_template_run1.sh | 40 ++++++++++++++++ .../sbatch_template_run10.sh | 40 ++++++++++++++++ .../sbatch_template_run11.sh | 40 ++++++++++++++++ .../sbatch_template_run12.sh | 40 ++++++++++++++++ .../sbatch_template_run13.sh | 47 +++++++++++++++++++ .../sbatch_template_run14.sh | 18 +++++++ .../sbatch_template_run15.sh | 28 +++++++++++ .../sbatch_template_run2.sh | 42 +++++++++++++++++ .../sbatch_template_run3.sh | 41 ++++++++++++++++ .../sbatch_template_run4.sh | 40 ++++++++++++++++ .../sbatch_template_run5.sh | 40 ++++++++++++++++ .../sbatch_template_run6.sh | 40 ++++++++++++++++ .../sbatch_template_run7.sh | 40 ++++++++++++++++ .../sbatch_template_run8.sh | 40 ++++++++++++++++ .../sbatch_template_run9.sh | 40 ++++++++++++++++ .../submit_sbatches.sh | 36 ++++++++++++++ 17 files changed, 630 insertions(+) create mode 100644 build/star_align_sort_testing/README.md create mode 100644 build/star_align_sort_testing/sbatch_template_run1.sh create mode 100644 build/star_align_sort_testing/sbatch_template_run10.sh create mode 100644 build/star_align_sort_testing/sbatch_template_run11.sh create mode 100644 build/star_align_sort_testing/sbatch_template_run12.sh create mode 100644 build/star_align_sort_testing/sbatch_template_run13.sh create mode 100644 build/star_align_sort_testing/sbatch_template_run14.sh create mode 100644 build/star_align_sort_testing/sbatch_template_run15.sh create mode 100644 build/star_align_sort_testing/sbatch_template_run2.sh create mode 100644 build/star_align_sort_testing/sbatch_template_run3.sh create mode 100644 build/star_align_sort_testing/sbatch_template_run4.sh create mode 100644 build/star_align_sort_testing/sbatch_template_run5.sh create mode 100644 build/star_align_sort_testing/sbatch_template_run6.sh create mode 100644 build/star_align_sort_testing/sbatch_template_run7.sh create mode 100644 build/star_align_sort_testing/sbatch_template_run8.sh create mode 100644 build/star_align_sort_testing/sbatch_template_run9.sh create mode 100644 build/star_align_sort_testing/submit_sbatches.sh diff --git a/build/star_align_sort_testing/README.md b/build/star_align_sort_testing/README.md new file mode 100644 index 0000000..258ea2f --- /dev/null +++ b/build/star_align_sort_testing/README.md @@ -0,0 +1,18 @@ +Samples +- WT_fCLIP +- Y5KO_fCLIP + +Runs +1) --outBAMsortingBinsN 200 +2) --outBAMsortingBinsN 200 --limitBAMsortRAM 41143265264 +3) --limitBAMsortRAM 41143265264 +4) --outBAMsortingBinsN 400 +5) --outBAMsortingBinsN 600 +6) --outBAMsortingBinsN 800 +7) --outBAMsortingBinsN 400 --limitBAMsortRAM 41143265264 +8) --outBAMsortingBinsN 600 --limitBAMsortRAM 41143265264 +9) --outBAMsortingBinsN 800 --limitBAMsortRAM 41143265264 +10) --limitBAMsortRAM 60000000000 +11) --limitBAMsortRAM 70000000000 +12) --limitBAMsortRAM 80000000000 +13) output without sorting; pipe to samtools \ No newline at end of file diff --git a/build/star_align_sort_testing/sbatch_template_run1.sh b/build/star_align_sort_testing/sbatch_template_run1.sh new file mode 100644 index 0000000..2094f4d --- /dev/null +++ b/build/star_align_sort_testing/sbatch_template_run1.sh @@ -0,0 +1,40 @@ +#!/bin/sh +module load STAR + +# set tmp dir +tmp_dir="/lscratch/${SLURM_JOB_ID}" +export tmp_dir + +# set other variables +sample_id="FILL_SAMPLE" +run_id="FILL_RUN" +output_dir="/data/sevillas2/marco_star" + +# check dirs exits +if [[ ! -d $output_dir/$run_id ]]; then mkdir -p $output_dir/$run_id; fi + +# output files +output_bam="$output_dir/$run_id/$sample_id.bam" +output_log="$output_dir/logs/${run_id}_${sample_id}.log" +output_unmapped="$output_dir/$run_id/${sample_id}_unmapped.bam" + +# run star +STAR --runMode alignReads --genomeDir /data/CCBR_Pipeliner/iCLIP/index/active/2022_0505/hg38/index \ +--sjdbGTFfile /data/CCBR_Pipeliner/iCLIP/index/active/2022_0505/hg38/ref/gencode.v32.annotation.gtf \ +--readFilesCommand zcat \ +--readFilesIn /data/RBL_NCI/Wolin/8-09-21-HaCaT_fCLIP_v2.0_2/01_preprocess/01_fastq/$sample_id.fastq.gz \ +--outFileNamePrefix $tmp_dir/${sample_id}_ \ +--outReadsUnmapped Fastx --outSAMtype BAM SortedByCoordinate --alignEndsType Local --alignIntronMax 50000 \ +--alignSJDBoverhangMin 3 --alignSJoverhangMin 5 --alignTranscriptsPerReadNmax 10000 --alignWindowsPerReadNmax 10000 \ +--outFilterMatchNmin 15 --outFilterMatchNminOverLread 0.9 --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 \ +--outFilterMultimapNmax 10000 --outFilterMultimapScoreRange 0 --outFilterScoreMin 0 --outFilterType Normal --outSAMattributes All \ +--outSAMunmapped None --outSJfilterCountTotalMin 3 1 1 1 --outSJfilterOverhangMin 30 12 12 12 --outSJfilterReads All --seedMultimapNmax 10000 \ +--seedNoneLociPerWindow 20 --seedPerReadNmax 10000 --seedPerWindowNmax 500 --sjdbScore 2 --winAnchorMultimapNmax 500 --outBAMsortingBinsN 200 + +# move STAR files and final log file to output +mv $tmp_dir/${sample_id}_Aligned.sortedByCoord.out.bam $output_bam +mv $tmp_dir/${sample_id}_Log.final.out $output_log + +# move mates to unmapped file +touch $output_unmapped +for f in $tmp_dir/${sample_id}_Unmapped.out.mate*; do cat $f >> $output_unmapped; done \ No newline at end of file diff --git a/build/star_align_sort_testing/sbatch_template_run10.sh b/build/star_align_sort_testing/sbatch_template_run10.sh new file mode 100644 index 0000000..cc22fb8 --- /dev/null +++ b/build/star_align_sort_testing/sbatch_template_run10.sh @@ -0,0 +1,40 @@ +#!/bin/sh +module load STAR + +# set tmp dir +tmp_dir="/lscratch/${SLURM_JOB_ID}" +export tmp_dir + +# set other variables +sample_id="FILL_SAMPLE" +run_id="FILL_RUN" +output_dir="/data/sevillas2/marco_star" + +# check dirs exits +if [[ ! -d $output_dir/$run_id ]]; then mkdir -p $output_dir/$run_id; fi + +# output files +output_bam="$output_dir/$run_id/$sample_id.bam" +output_log="$output_dir/logs/${run_id}_${sample_id}.log" +output_unmapped="$output_dir/$run_id/${sample_id}_unmapped.bam" + +# run star +STAR --runMode alignReads --genomeDir /data/CCBR_Pipeliner/iCLIP/index/active/2022_0505/hg38/index \ +--sjdbGTFfile /data/CCBR_Pipeliner/iCLIP/index/active/2022_0505/hg38/ref/gencode.v32.annotation.gtf \ +--readFilesCommand zcat \ +--readFilesIn /data/RBL_NCI/Wolin/8-09-21-HaCaT_fCLIP_v2.0_2/01_preprocess/01_fastq/$sample_id.fastq.gz \ +--outFileNamePrefix $tmp_dir/${sample_id}_ \ +--outReadsUnmapped Fastx --outSAMtype BAM SortedByCoordinate --alignEndsType Local --alignIntronMax 50000 \ +--alignSJDBoverhangMin 3 --alignSJoverhangMin 5 --alignTranscriptsPerReadNmax 10000 --alignWindowsPerReadNmax 10000 \ +--outFilterMatchNmin 15 --outFilterMatchNminOverLread 0.9 --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 \ +--outFilterMultimapNmax 10000 --outFilterMultimapScoreRange 0 --outFilterScoreMin 0 --outFilterType Normal --outSAMattributes All \ +--outSAMunmapped None --outSJfilterCountTotalMin 3 1 1 1 --outSJfilterOverhangMin 30 12 12 12 --outSJfilterReads All --seedMultimapNmax 10000 \ +--seedNoneLociPerWindow 20 --seedPerReadNmax 10000 --seedPerWindowNmax 500 --sjdbScore 2 --winAnchorMultimapNmax 500 --limitBAMsortRAM 60000000000 + +# move STAR files and final log file to output +mv $tmp_dir/${sample_id}_Aligned.sortedByCoord.out.bam $output_bam +mv $tmp_dir/${sample_id}_Log.final.out $output_log + +# move mates to unmapped file +touch $output_unmapped +for f in $tmp_dir/${sample_id}_Unmapped.out.mate*; do cat $f >> $output_unmapped; done \ No newline at end of file diff --git a/build/star_align_sort_testing/sbatch_template_run11.sh b/build/star_align_sort_testing/sbatch_template_run11.sh new file mode 100644 index 0000000..bfd6a6a --- /dev/null +++ b/build/star_align_sort_testing/sbatch_template_run11.sh @@ -0,0 +1,40 @@ +#!/bin/sh +module load STAR + +# set tmp dir +tmp_dir="/lscratch/${SLURM_JOB_ID}" +export tmp_dir + +# set other variables +sample_id="FILL_SAMPLE" +run_id="FILL_RUN" +output_dir="/data/sevillas2/marco_star" + +# check dirs exits +if [[ ! -d $output_dir/$run_id ]]; then mkdir -p $output_dir/$run_id; fi + +# output files +output_bam="$output_dir/$run_id/$sample_id.bam" +output_log="$output_dir/logs/${run_id}_${sample_id}.log" +output_unmapped="$output_dir/$run_id/${sample_id}_unmapped.bam" + +# run star +STAR --runMode alignReads --genomeDir /data/CCBR_Pipeliner/iCLIP/index/active/2022_0505/hg38/index \ +--sjdbGTFfile /data/CCBR_Pipeliner/iCLIP/index/active/2022_0505/hg38/ref/gencode.v32.annotation.gtf \ +--readFilesCommand zcat \ +--readFilesIn /data/RBL_NCI/Wolin/8-09-21-HaCaT_fCLIP_v2.0_2/01_preprocess/01_fastq/$sample_id.fastq.gz \ +--outFileNamePrefix $tmp_dir/${sample_id}_ \ +--outReadsUnmapped Fastx --outSAMtype BAM SortedByCoordinate --alignEndsType Local --alignIntronMax 50000 \ +--alignSJDBoverhangMin 3 --alignSJoverhangMin 5 --alignTranscriptsPerReadNmax 10000 --alignWindowsPerReadNmax 10000 \ +--outFilterMatchNmin 15 --outFilterMatchNminOverLread 0.9 --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 \ +--outFilterMultimapNmax 10000 --outFilterMultimapScoreRange 0 --outFilterScoreMin 0 --outFilterType Normal --outSAMattributes All \ +--outSAMunmapped None --outSJfilterCountTotalMin 3 1 1 1 --outSJfilterOverhangMin 30 12 12 12 --outSJfilterReads All --seedMultimapNmax 10000 \ +--seedNoneLociPerWindow 20 --seedPerReadNmax 10000 --seedPerWindowNmax 500 --sjdbScore 2 --winAnchorMultimapNmax 500 --limitBAMsortRAM 70000000000 + +# move STAR files and final log file to output +mv $tmp_dir/${sample_id}_Aligned.sortedByCoord.out.bam $output_bam +mv $tmp_dir/${sample_id}_Log.final.out $output_log + +# move mates to unmapped file +touch $output_unmapped +for f in $tmp_dir/${sample_id}_Unmapped.out.mate*; do cat $f >> $output_unmapped; done \ No newline at end of file diff --git a/build/star_align_sort_testing/sbatch_template_run12.sh b/build/star_align_sort_testing/sbatch_template_run12.sh new file mode 100644 index 0000000..54351a3 --- /dev/null +++ b/build/star_align_sort_testing/sbatch_template_run12.sh @@ -0,0 +1,40 @@ +#!/bin/sh +module load STAR + +# set tmp dir +tmp_dir="/lscratch/${SLURM_JOB_ID}" +export tmp_dir + +# set other variables +sample_id="FILL_SAMPLE" +run_id="FILL_RUN" +output_dir="/data/sevillas2/marco_star" + +# check dirs exits +if [[ ! -d $output_dir/$run_id ]]; then mkdir -p $output_dir/$run_id; fi + +# output files +output_bam="$output_dir/$run_id/$sample_id.bam" +output_log="$output_dir/logs/${run_id}_${sample_id}.log" +output_unmapped="$output_dir/$run_id/${sample_id}_unmapped.bam" + +# run star +STAR --runMode alignReads --genomeDir /data/CCBR_Pipeliner/iCLIP/index/active/2022_0505/hg38/index \ +--sjdbGTFfile /data/CCBR_Pipeliner/iCLIP/index/active/2022_0505/hg38/ref/gencode.v32.annotation.gtf \ +--readFilesCommand zcat \ +--readFilesIn /data/RBL_NCI/Wolin/8-09-21-HaCaT_fCLIP_v2.0_2/01_preprocess/01_fastq/$sample_id.fastq.gz \ +--outFileNamePrefix $tmp_dir/${sample_id}_ \ +--outReadsUnmapped Fastx --outSAMtype BAM SortedByCoordinate --alignEndsType Local --alignIntronMax 50000 \ +--alignSJDBoverhangMin 3 --alignSJoverhangMin 5 --alignTranscriptsPerReadNmax 10000 --alignWindowsPerReadNmax 10000 \ +--outFilterMatchNmin 15 --outFilterMatchNminOverLread 0.9 --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 \ +--outFilterMultimapNmax 10000 --outFilterMultimapScoreRange 0 --outFilterScoreMin 0 --outFilterType Normal --outSAMattributes All \ +--outSAMunmapped None --outSJfilterCountTotalMin 3 1 1 1 --outSJfilterOverhangMin 30 12 12 12 --outSJfilterReads All --seedMultimapNmax 10000 \ +--seedNoneLociPerWindow 20 --seedPerReadNmax 10000 --seedPerWindowNmax 500 --sjdbScore 2 --winAnchorMultimapNmax 500 --limitBAMsortRAM 80000000000 + +# move STAR files and final log file to output +mv $tmp_dir/${sample_id}_Aligned.sortedByCoord.out.bam $output_bam +mv $tmp_dir/${sample_id}_Log.final.out $output_log + +# move mates to unmapped file +touch $output_unmapped +for f in $tmp_dir/${sample_id}_Unmapped.out.mate*; do cat $f >> $output_unmapped; done \ No newline at end of file diff --git a/build/star_align_sort_testing/sbatch_template_run13.sh b/build/star_align_sort_testing/sbatch_template_run13.sh new file mode 100644 index 0000000..db5e4e1 --- /dev/null +++ b/build/star_align_sort_testing/sbatch_template_run13.sh @@ -0,0 +1,47 @@ +#!/bin/sh +module load STAR samtools + +# set tmp dir +tmp_dir="/lscratch/${SLURM_JOB_ID}" +export tmp_dir + +# set other variables +sample_id="FILL_SAMPLE" +run_id="FILL_RUN" +output_dir="/data/sevillas2/marco_star" + +# check dirs exits +if [[ ! -d $output_dir/$run_id ]]; then mkdir -p $output_dir/$run_id; fi + +# output files +output_bam="$output_dir/$run_id/$sample_id.bam" +output_log="$output_dir/logs/${run_id}_${sample_id}.log" +output_unmapped="$output_dir/$run_id/${sample_id}_unmapped.bam" + +# run star +STAR --runMode alignReads --genomeDir /data/CCBR_Pipeliner/iCLIP/index/active/2022_0505/hg38/index \ +--sjdbGTFfile /data/CCBR_Pipeliner/iCLIP/index/active/2022_0505/hg38/ref/gencode.v32.annotation.gtf \ +--readFilesCommand zcat \ +--readFilesIn /data/RBL_NCI/Wolin/8-09-21-HaCaT_fCLIP_v2.0_2/01_preprocess/01_fastq/$sample_id.fastq.gz \ +--outFileNamePrefix $tmp_dir/${sample_id}_ \ +--outReadsUnmapped Fastx \ +--outSAMtype BAM Unsorted \ +--alignEndsType Local --alignIntronMax 50000 \ +--alignSJDBoverhangMin 3 --alignSJoverhangMin 5 --alignTranscriptsPerReadNmax 10000 --alignWindowsPerReadNmax 10000 \ +--outFilterMatchNmin 15 --outFilterMatchNminOverLread 0.9 --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 \ +--outFilterMultimapNmax 10000 --outFilterMultimapScoreRange 0 --outFilterScoreMin 0 --outFilterType Normal --outSAMattributes All \ +--outSAMunmapped None --outSJfilterCountTotalMin 3 1 1 1 --outSJfilterOverhangMin 30 12 12 12 --outSJfilterReads All --seedMultimapNmax 10000 \ +--seedNoneLociPerWindow 20 --seedPerReadNmax 10000 --seedPerWindowNmax 500 --sjdbScore 2 --winAnchorMultimapNmax 500 + +# Sort and Index +cp $tmp_dir/* $output_dir/$run_id/tmp +samtools sort --threads 32 -m 10G -T $tmp_dir $tmp_dir/${sample_id}_Aligned.out.bam -o $tmp_dir/${sample_id}_Aligned.sortedByCoord.out.bam +samtools index -@ 32 $tmp_dir/${sample_id}_Aligned.sortedByCoord.out.bam + +# move STAR files and final log file to output +mv $tmp_dir/${sample_id}_Aligned.sortedByCoord.out.bam $output_bam +mv $tmp_dir/${sample_id}_Log.final.out $output_log + +# move mates to unmapped file +touch $output_unmapped +for f in $tmp_dir/${sample_id}_Unmapped.out.mate*; do cat $f >> $output_unmapped; done \ No newline at end of file diff --git a/build/star_align_sort_testing/sbatch_template_run14.sh b/build/star_align_sort_testing/sbatch_template_run14.sh new file mode 100644 index 0000000..2bc83f5 --- /dev/null +++ b/build/star_align_sort_testing/sbatch_template_run14.sh @@ -0,0 +1,18 @@ +#!/bin/sh +module load samtools + +# set tmp dir +tmp_dir="/lscratch/${SLURM_JOB_ID}" +export tmp_dir + +# set other variables +sample_id="FILL_SAMPLE" +run_id="FILL_RUN" +output_dir="/data/sevillas2/marco_star/$run_id" +output_bam="$output_dir/$run_id/$sample_id.bam" +output_log="$output_dir/logs/${run_id}_${sample_id}.log" +output_unmapped="$output_dir/$run_id/${sample_id}_unmapped.bam" + +# Sort and Index +#samtools sort -m 80G -T $tmp_dir $output_dir/tmp/${sample_id}_Aligned.out.bam -o $output_dir/${sample_id}_Aligned.sortedByCoord.out.bam +samtools index -@ 32 $output_dir/${sample_id}_Aligned.sortedByCoord.out.bam diff --git a/build/star_align_sort_testing/sbatch_template_run15.sh b/build/star_align_sort_testing/sbatch_template_run15.sh new file mode 100644 index 0000000..4d5b09d --- /dev/null +++ b/build/star_align_sort_testing/sbatch_template_run15.sh @@ -0,0 +1,28 @@ +#!/bin/sh +module load samtools + +# set tmp dir +tmp_dir="/lscratch/${SLURM_JOB_ID}" +export tmp_dir + +# set other variables +sample_id="FILL_SAMPLE" +run_id="FILL_RUN" +output_dir="/data/sevillas2/marco_star/$run_id" +output_bam="$output_dir/$run_id/$sample_id.bam" +output_log="$output_dir/logs/${run_id}_${sample_id}.log" +output_unmapped="$output_dir/$run_id/${sample_id}_unmapped.bam" + +# Sort and Index +samtools sort --threads 8 -m 100G -T $tmp_dir $output_dir/tmp/${sample_id}_Aligned.out.bam -o $output_dir/${sample_id}_Aligned.sortedByCoord.out.bam +samtools index -@ 32 $tmp_dir/${sample_id}_Aligned.sortedByCoord.out.bam +#samtools sort --threads 32 -m 40G -T $tmp_dir $output_dir/tmp/${sample_id}_Aligned.out.bam -o $output_dir/${sample_id}_Aligned.sortedByCoord.out.bam +#samtools index -@ 32 $tmp_dir/${sample_id}_Aligned.sortedByCoord.out.bam + +# move STAR files and final log file to output +mv $tmp_dir/${sample_id}_Aligned.sortedByCoord.out.bam $output_bam +mv $tmp_dir/${sample_id}_Log.final.out $output_log + +# move mates to unmapped file +touch $output_unmapped +for f in $tmp_dir/${sample_id}_Unmapped.out.mate*; do cat $f >> $output_unmapped; done \ No newline at end of file diff --git a/build/star_align_sort_testing/sbatch_template_run2.sh b/build/star_align_sort_testing/sbatch_template_run2.sh new file mode 100644 index 0000000..acfb87a --- /dev/null +++ b/build/star_align_sort_testing/sbatch_template_run2.sh @@ -0,0 +1,42 @@ +#!/bin/sh +module load STAR + +# set tmp dir +tmp_dir="/lscratch/${SLURM_JOB_ID}" +export tmp_dir + +# set other variables +sample_id="FILL_SAMPLE" +run_id="FILL_RUN" +output_dir="/data/sevillas2/marco_star" + +# check dirs exits +if [[ ! -d $output_dir/$run_id ]]; then mkdir -p $output_dir/$run_id; fi + +# output files +output_bam="$output_dir/$run_id/$sample_id.bam" +output_log="$output_dir/logs/${run_id}_${sample_id}.log" +output_unmapped="$output_dir/$run_id/${sample_id}_unmapped.bam" + +# run star +STAR --runMode alignReads --genomeDir /data/CCBR_Pipeliner/iCLIP/index/active/2022_0505/hg38/index \ +--sjdbGTFfile /data/CCBR_Pipeliner/iCLIP/index/active/2022_0505/hg38/ref/gencode.v32.annotation.gtf \ +--readFilesCommand zcat \ +--readFilesIn /data/RBL_NCI/Wolin/8-09-21-HaCaT_fCLIP_v2.0_2/01_preprocess/01_fastq/$sample_id.fastq.gz \ +--outFileNamePrefix $tmp_dir/${sample_id}_ \ +--outReadsUnmapped Fastx --outSAMtype BAM SortedByCoordinate --alignEndsType Local --alignIntronMax 50000 \ +--alignSJDBoverhangMin 3 --alignSJoverhangMin 5 --alignTranscriptsPerReadNmax 10000 --alignWindowsPerReadNmax 10000 \ +--outFilterMatchNmin 15 --outFilterMatchNminOverLread 0.9 --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 \ +--outFilterMultimapNmax 10000 --outFilterMultimapScoreRange 0 --outFilterScoreMin 0 --outFilterType Normal --outSAMattributes All \ +--outSAMunmapped None --outSJfilterCountTotalMin 3 1 1 1 --outSJfilterOverhangMin 30 12 12 12 --outSJfilterReads All --seedMultimapNmax 10000 \ +--seedNoneLociPerWindow 20 --seedPerReadNmax 10000 --seedPerWindowNmax 500 --sjdbScore 2 --winAnchorMultimapNmax 500 --outBAMsortingBinsN 200 \ +--limitBAMsortRAM 41143265264 + +# move STAR files and final log file to output +mv $tmp_dir/${sample_id}_Aligned.sortedByCoord.out.bam $output_bam +mv $tmp_dir/${sample_id}_Log.final.out $output_log + +# move mates to unmapped file +touch $output_unmapped +for f in $tmp_dir/${sample_id}_Unmapped.out.mate*; do cat $f >> $output_unmapped; done + diff --git a/build/star_align_sort_testing/sbatch_template_run3.sh b/build/star_align_sort_testing/sbatch_template_run3.sh new file mode 100644 index 0000000..46be3f3 --- /dev/null +++ b/build/star_align_sort_testing/sbatch_template_run3.sh @@ -0,0 +1,41 @@ +#!/bin/sh +module load STAR + +# set tmp dir +tmp_dir="/lscratch/${SLURM_JOB_ID}" +export tmp_dir + +# set other variables +sample_id="FILL_SAMPLE" +run_id="FILL_RUN" +output_dir="/data/sevillas2/marco_star" + +# check dirs exits +if [[ ! -d $output_dir/$run_id ]]; then mkdir -p $output_dir/$run_id; fi + +# output files +output_bam="$output_dir/$run_id/$sample_id.bam" +output_log="$output_dir/logs/${run_id}_${sample_id}.log" +output_unmapped="$output_dir/$run_id/${sample_id}_unmapped.bam" + +# run star +STAR --runMode alignReads --genomeDir /data/CCBR_Pipeliner/iCLIP/index/active/2022_0505/hg38/index \ +--sjdbGTFfile /data/CCBR_Pipeliner/iCLIP/index/active/2022_0505/hg38/ref/gencode.v32.annotation.gtf \ +--readFilesCommand zcat \ +--readFilesIn /data/RBL_NCI/Wolin/8-09-21-HaCaT_fCLIP_v2.0_2/01_preprocess/01_fastq/$sample_id.fastq.gz \ +--outFileNamePrefix $tmp_dir/${sample_id}_ \ +--outReadsUnmapped Fastx --outSAMtype BAM SortedByCoordinate --alignEndsType Local --alignIntronMax 50000 \ +--alignSJDBoverhangMin 3 --alignSJoverhangMin 5 --alignTranscriptsPerReadNmax 10000 --alignWindowsPerReadNmax 10000 \ +--outFilterMatchNmin 15 --outFilterMatchNminOverLread 0.9 --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 \ +--outFilterMultimapNmax 10000 --outFilterMultimapScoreRange 0 --outFilterScoreMin 0 --outFilterType Normal --outSAMattributes All \ +--outSAMunmapped None --outSJfilterCountTotalMin 3 1 1 1 --outSJfilterOverhangMin 30 12 12 12 --outSJfilterReads All --seedMultimapNmax 10000 \ +--seedNoneLociPerWindow 20 --seedPerReadNmax 10000 --seedPerWindowNmax 500 --sjdbScore 2 --winAnchorMultimapNmax 500 --limitBAMsortRAM 41143265264 + +# move STAR files and final log file to output +mv $tmp_dir/${sample_id}_Aligned.sortedByCoord.out.bam $output_bam +mv $tmp_dir/${sample_id}_Log.final.out $output_log + +# move mates to unmapped file +touch $output_unmapped +for f in $tmp_dir/${sample_id}_Unmapped.out.mate*; do cat $f >> $output_unmapped; done + diff --git a/build/star_align_sort_testing/sbatch_template_run4.sh b/build/star_align_sort_testing/sbatch_template_run4.sh new file mode 100644 index 0000000..2e92916 --- /dev/null +++ b/build/star_align_sort_testing/sbatch_template_run4.sh @@ -0,0 +1,40 @@ +#!/bin/sh +module load STAR + +# set tmp dir +tmp_dir="/lscratch/${SLURM_JOB_ID}" +export tmp_dir + +# set other variables +sample_id="FILL_SAMPLE" +run_id="FILL_RUN" +output_dir="/data/sevillas2/marco_star" + +# check dirs exits +if [[ ! -d $output_dir/$run_id ]]; then mkdir -p $output_dir/$run_id; fi + +# output files +output_bam="$output_dir/$run_id/$sample_id.bam" +output_log="$output_dir/logs/${run_id}_${sample_id}.log" +output_unmapped="$output_dir/$run_id/${sample_id}_unmapped.bam" + +# run star +STAR --runMode alignReads --genomeDir /data/CCBR_Pipeliner/iCLIP/index/active/2022_0505/hg38/index \ +--sjdbGTFfile /data/CCBR_Pipeliner/iCLIP/index/active/2022_0505/hg38/ref/gencode.v32.annotation.gtf \ +--readFilesCommand zcat \ +--readFilesIn /data/RBL_NCI/Wolin/8-09-21-HaCaT_fCLIP_v2.0_2/01_preprocess/01_fastq/$sample_id.fastq.gz \ +--outFileNamePrefix $tmp_dir/${sample_id}_ \ +--outReadsUnmapped Fastx --outSAMtype BAM SortedByCoordinate --alignEndsType Local --alignIntronMax 50000 \ +--alignSJDBoverhangMin 3 --alignSJoverhangMin 5 --alignTranscriptsPerReadNmax 10000 --alignWindowsPerReadNmax 10000 \ +--outFilterMatchNmin 15 --outFilterMatchNminOverLread 0.9 --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 \ +--outFilterMultimapNmax 10000 --outFilterMultimapScoreRange 0 --outFilterScoreMin 0 --outFilterType Normal --outSAMattributes All \ +--outSAMunmapped None --outSJfilterCountTotalMin 3 1 1 1 --outSJfilterOverhangMin 30 12 12 12 --outSJfilterReads All --seedMultimapNmax 10000 \ +--seedNoneLociPerWindow 20 --seedPerReadNmax 10000 --seedPerWindowNmax 500 --sjdbScore 2 --winAnchorMultimapNmax 500 --outBAMsortingBinsN 400 + +# move STAR files and final log file to output +mv $tmp_dir/${sample_id}_Aligned.sortedByCoord.out.bam $output_bam +mv $tmp_dir/${sample_id}_Log.final.out $output_log + +# move mates to unmapped file +touch $output_unmapped +for f in $tmp_dir/${sample_id}_Unmapped.out.mate*; do cat $f >> $output_unmapped; done \ No newline at end of file diff --git a/build/star_align_sort_testing/sbatch_template_run5.sh b/build/star_align_sort_testing/sbatch_template_run5.sh new file mode 100644 index 0000000..f91efb8 --- /dev/null +++ b/build/star_align_sort_testing/sbatch_template_run5.sh @@ -0,0 +1,40 @@ +#!/bin/sh +module load STAR + +# set tmp dir +tmp_dir="/lscratch/${SLURM_JOB_ID}" +export tmp_dir + +# set other variables +sample_id="FILL_SAMPLE" +run_id="FILL_RUN" +output_dir="/data/sevillas2/marco_star" + +# check dirs exits +if [[ ! -d $output_dir/$run_id ]]; then mkdir -p $output_dir/$run_id; fi + +# output files +output_bam="$output_dir/$run_id/$sample_id.bam" +output_log="$output_dir/logs/${run_id}_${sample_id}.log" +output_unmapped="$output_dir/$run_id/${sample_id}_unmapped.bam" + +# run star +STAR --runMode alignReads --genomeDir /data/CCBR_Pipeliner/iCLIP/index/active/2022_0505/hg38/index \ +--sjdbGTFfile /data/CCBR_Pipeliner/iCLIP/index/active/2022_0505/hg38/ref/gencode.v32.annotation.gtf \ +--readFilesCommand zcat \ +--readFilesIn /data/RBL_NCI/Wolin/8-09-21-HaCaT_fCLIP_v2.0_2/01_preprocess/01_fastq/$sample_id.fastq.gz \ +--outFileNamePrefix $tmp_dir/${sample_id}_ \ +--outReadsUnmapped Fastx --outSAMtype BAM SortedByCoordinate --alignEndsType Local --alignIntronMax 50000 \ +--alignSJDBoverhangMin 3 --alignSJoverhangMin 5 --alignTranscriptsPerReadNmax 10000 --alignWindowsPerReadNmax 10000 \ +--outFilterMatchNmin 15 --outFilterMatchNminOverLread 0.9 --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 \ +--outFilterMultimapNmax 10000 --outFilterMultimapScoreRange 0 --outFilterScoreMin 0 --outFilterType Normal --outSAMattributes All \ +--outSAMunmapped None --outSJfilterCountTotalMin 3 1 1 1 --outSJfilterOverhangMin 30 12 12 12 --outSJfilterReads All --seedMultimapNmax 10000 \ +--seedNoneLociPerWindow 20 --seedPerReadNmax 10000 --seedPerWindowNmax 500 --sjdbScore 2 --winAnchorMultimapNmax 500 --outBAMsortingBinsN 600 + +# move STAR files and final log file to output +mv $tmp_dir/${sample_id}_Aligned.sortedByCoord.out.bam $output_bam +mv $tmp_dir/${sample_id}_Log.final.out $output_log + +# move mates to unmapped file +touch $output_unmapped +for f in $tmp_dir/${sample_id}_Unmapped.out.mate*; do cat $f >> $output_unmapped; done \ No newline at end of file diff --git a/build/star_align_sort_testing/sbatch_template_run6.sh b/build/star_align_sort_testing/sbatch_template_run6.sh new file mode 100644 index 0000000..03b06d5 --- /dev/null +++ b/build/star_align_sort_testing/sbatch_template_run6.sh @@ -0,0 +1,40 @@ +#!/bin/sh +module load STAR + +# set tmp dir +tmp_dir="/lscratch/${SLURM_JOB_ID}" +export tmp_dir + +# set other variables +sample_id="FILL_SAMPLE" +run_id="FILL_RUN" +output_dir="/data/sevillas2/marco_star" + +# check dirs exits +if [[ ! -d $output_dir/$run_id ]]; then mkdir -p $output_dir/$run_id; fi + +# output files +output_bam="$output_dir/$run_id/$sample_id.bam" +output_log="$output_dir/logs/${run_id}_${sample_id}.log" +output_unmapped="$output_dir/$run_id/${sample_id}_unmapped.bam" + +# run star +STAR --runMode alignReads --genomeDir /data/CCBR_Pipeliner/iCLIP/index/active/2022_0505/hg38/index \ +--sjdbGTFfile /data/CCBR_Pipeliner/iCLIP/index/active/2022_0505/hg38/ref/gencode.v32.annotation.gtf \ +--readFilesCommand zcat \ +--readFilesIn /data/RBL_NCI/Wolin/8-09-21-HaCaT_fCLIP_v2.0_2/01_preprocess/01_fastq/$sample_id.fastq.gz \ +--outFileNamePrefix $tmp_dir/${sample_id}_ \ +--outReadsUnmapped Fastx --outSAMtype BAM SortedByCoordinate --alignEndsType Local --alignIntronMax 50000 \ +--alignSJDBoverhangMin 3 --alignSJoverhangMin 5 --alignTranscriptsPerReadNmax 10000 --alignWindowsPerReadNmax 10000 \ +--outFilterMatchNmin 15 --outFilterMatchNminOverLread 0.9 --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 \ +--outFilterMultimapNmax 10000 --outFilterMultimapScoreRange 0 --outFilterScoreMin 0 --outFilterType Normal --outSAMattributes All \ +--outSAMunmapped None --outSJfilterCountTotalMin 3 1 1 1 --outSJfilterOverhangMin 30 12 12 12 --outSJfilterReads All --seedMultimapNmax 10000 \ +--seedNoneLociPerWindow 20 --seedPerReadNmax 10000 --seedPerWindowNmax 500 --sjdbScore 2 --winAnchorMultimapNmax 500 --outBAMsortingBinsN 800 + +# move STAR files and final log file to output +mv $tmp_dir/${sample_id}_Aligned.sortedByCoord.out.bam $output_bam +mv $tmp_dir/${sample_id}_Log.final.out $output_log + +# move mates to unmapped file +touch $output_unmapped +for f in $tmp_dir/${sample_id}_Unmapped.out.mate*; do cat $f >> $output_unmapped; done \ No newline at end of file diff --git a/build/star_align_sort_testing/sbatch_template_run7.sh b/build/star_align_sort_testing/sbatch_template_run7.sh new file mode 100644 index 0000000..9473b51 --- /dev/null +++ b/build/star_align_sort_testing/sbatch_template_run7.sh @@ -0,0 +1,40 @@ +#!/bin/sh +module load STAR + +# set tmp dir +tmp_dir="/lscratch/${SLURM_JOB_ID}" +export tmp_dir + +# set other variables +sample_id="FILL_SAMPLE" +run_id="FILL_RUN" +output_dir="/data/sevillas2/marco_star" + +# check dirs exits +if [[ ! -d $output_dir/$run_id ]]; then mkdir -p $output_dir/$run_id; fi + +# output files +output_bam="$output_dir/$run_id/$sample_id.bam" +output_log="$output_dir/logs/${run_id}_${sample_id}.log" +output_unmapped="$output_dir/$run_id/${sample_id}_unmapped.bam" + +# run star +STAR --runMode alignReads --genomeDir /data/CCBR_Pipeliner/iCLIP/index/active/2022_0505/hg38/index \ +--sjdbGTFfile /data/CCBR_Pipeliner/iCLIP/index/active/2022_0505/hg38/ref/gencode.v32.annotation.gtf \ +--readFilesCommand zcat \ +--readFilesIn /data/RBL_NCI/Wolin/8-09-21-HaCaT_fCLIP_v2.0_2/01_preprocess/01_fastq/$sample_id.fastq.gz \ +--outFileNamePrefix $tmp_dir/${sample_id}_ \ +--outReadsUnmapped Fastx --outSAMtype BAM SortedByCoordinate --alignEndsType Local --alignIntronMax 50000 \ +--alignSJDBoverhangMin 3 --alignSJoverhangMin 5 --alignTranscriptsPerReadNmax 10000 --alignWindowsPerReadNmax 10000 \ +--outFilterMatchNmin 15 --outFilterMatchNminOverLread 0.9 --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 \ +--outFilterMultimapNmax 10000 --outFilterMultimapScoreRange 0 --outFilterScoreMin 0 --outFilterType Normal --outSAMattributes All \ +--outSAMunmapped None --outSJfilterCountTotalMin 3 1 1 1 --outSJfilterOverhangMin 30 12 12 12 --outSJfilterReads All --seedMultimapNmax 10000 \ +--seedNoneLociPerWindow 20 --seedPerReadNmax 10000 --seedPerWindowNmax 500 --sjdbScore 2 --winAnchorMultimapNmax 500 --outBAMsortingBinsN 400 --limitBAMsortRAM 41143265264 + +# move STAR files and final log file to output +mv $tmp_dir/${sample_id}_Aligned.sortedByCoord.out.bam $output_bam +mv $tmp_dir/${sample_id}_Log.final.out $output_log + +# move mates to unmapped file +touch $output_unmapped +for f in $tmp_dir/${sample_id}_Unmapped.out.mate*; do cat $f >> $output_unmapped; done \ No newline at end of file diff --git a/build/star_align_sort_testing/sbatch_template_run8.sh b/build/star_align_sort_testing/sbatch_template_run8.sh new file mode 100644 index 0000000..41dc418 --- /dev/null +++ b/build/star_align_sort_testing/sbatch_template_run8.sh @@ -0,0 +1,40 @@ +#!/bin/sh +module load STAR + +# set tmp dir +tmp_dir="/lscratch/${SLURM_JOB_ID}" +export tmp_dir + +# set other variables +sample_id="FILL_SAMPLE" +run_id="FILL_RUN" +output_dir="/data/sevillas2/marco_star" + +# check dirs exits +if [[ ! -d $output_dir/$run_id ]]; then mkdir -p $output_dir/$run_id; fi + +# output files +output_bam="$output_dir/$run_id/$sample_id.bam" +output_log="$output_dir/logs/${run_id}_${sample_id}.log" +output_unmapped="$output_dir/$run_id/${sample_id}_unmapped.bam" + +# run star +STAR --runMode alignReads --genomeDir /data/CCBR_Pipeliner/iCLIP/index/active/2022_0505/hg38/index \ +--sjdbGTFfile /data/CCBR_Pipeliner/iCLIP/index/active/2022_0505/hg38/ref/gencode.v32.annotation.gtf \ +--readFilesCommand zcat \ +--readFilesIn /data/RBL_NCI/Wolin/8-09-21-HaCaT_fCLIP_v2.0_2/01_preprocess/01_fastq/$sample_id.fastq.gz \ +--outFileNamePrefix $tmp_dir/${sample_id}_ \ +--outReadsUnmapped Fastx --outSAMtype BAM SortedByCoordinate --alignEndsType Local --alignIntronMax 50000 \ +--alignSJDBoverhangMin 3 --alignSJoverhangMin 5 --alignTranscriptsPerReadNmax 10000 --alignWindowsPerReadNmax 10000 \ +--outFilterMatchNmin 15 --outFilterMatchNminOverLread 0.9 --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 \ +--outFilterMultimapNmax 10000 --outFilterMultimapScoreRange 0 --outFilterScoreMin 0 --outFilterType Normal --outSAMattributes All \ +--outSAMunmapped None --outSJfilterCountTotalMin 3 1 1 1 --outSJfilterOverhangMin 30 12 12 12 --outSJfilterReads All --seedMultimapNmax 10000 \ +--seedNoneLociPerWindow 20 --seedPerReadNmax 10000 --seedPerWindowNmax 500 --sjdbScore 2 --winAnchorMultimapNmax 500 --outBAMsortingBinsN 600 --limitBAMsortRAM 41143265264 + +# move STAR files and final log file to output +mv $tmp_dir/${sample_id}_Aligned.sortedByCoord.out.bam $output_bam +mv $tmp_dir/${sample_id}_Log.final.out $output_log + +# move mates to unmapped file +touch $output_unmapped +for f in $tmp_dir/${sample_id}_Unmapped.out.mate*; do cat $f >> $output_unmapped; done \ No newline at end of file diff --git a/build/star_align_sort_testing/sbatch_template_run9.sh b/build/star_align_sort_testing/sbatch_template_run9.sh new file mode 100644 index 0000000..95a4344 --- /dev/null +++ b/build/star_align_sort_testing/sbatch_template_run9.sh @@ -0,0 +1,40 @@ +#!/bin/sh +module load STAR + +# set tmp dir +tmp_dir="/lscratch/${SLURM_JOB_ID}" +export tmp_dir + +# set other variables +sample_id="FILL_SAMPLE" +run_id="FILL_RUN" +output_dir="/data/sevillas2/marco_star" + +# check dirs exits +if [[ ! -d $output_dir/$run_id ]]; then mkdir -p $output_dir/$run_id; fi + +# output files +output_bam="$output_dir/$run_id/$sample_id.bam" +output_log="$output_dir/logs/${run_id}_${sample_id}.log" +output_unmapped="$output_dir/$run_id/${sample_id}_unmapped.bam" + +# run star +STAR --runMode alignReads --genomeDir /data/CCBR_Pipeliner/iCLIP/index/active/2022_0505/hg38/index \ +--sjdbGTFfile /data/CCBR_Pipeliner/iCLIP/index/active/2022_0505/hg38/ref/gencode.v32.annotation.gtf \ +--readFilesCommand zcat \ +--readFilesIn /data/RBL_NCI/Wolin/8-09-21-HaCaT_fCLIP_v2.0_2/01_preprocess/01_fastq/$sample_id.fastq.gz \ +--outFileNamePrefix $tmp_dir/${sample_id}_ \ +--outReadsUnmapped Fastx --outSAMtype BAM SortedByCoordinate --alignEndsType Local --alignIntronMax 50000 \ +--alignSJDBoverhangMin 3 --alignSJoverhangMin 5 --alignTranscriptsPerReadNmax 10000 --alignWindowsPerReadNmax 10000 \ +--outFilterMatchNmin 15 --outFilterMatchNminOverLread 0.9 --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 \ +--outFilterMultimapNmax 10000 --outFilterMultimapScoreRange 0 --outFilterScoreMin 0 --outFilterType Normal --outSAMattributes All \ +--outSAMunmapped None --outSJfilterCountTotalMin 3 1 1 1 --outSJfilterOverhangMin 30 12 12 12 --outSJfilterReads All --seedMultimapNmax 10000 \ +--seedNoneLociPerWindow 20 --seedPerReadNmax 10000 --seedPerWindowNmax 500 --sjdbScore 2 --winAnchorMultimapNmax 500 --outBAMsortingBinsN 800 --limitBAMsortRAM 41143265264 + +# move STAR files and final log file to output +mv $tmp_dir/${sample_id}_Aligned.sortedByCoord.out.bam $output_bam +mv $tmp_dir/${sample_id}_Log.final.out $output_log + +# move mates to unmapped file +touch $output_unmapped +for f in $tmp_dir/${sample_id}_Unmapped.out.mate*; do cat $f >> $output_unmapped; done \ No newline at end of file diff --git a/build/star_align_sort_testing/submit_sbatches.sh b/build/star_align_sort_testing/submit_sbatches.sh new file mode 100644 index 0000000..48e560d --- /dev/null +++ b/build/star_align_sort_testing/submit_sbatches.sh @@ -0,0 +1,36 @@ +#!/bin/sh +create="Y" +submit="Y" +run_list=("run1" "run2" "run3" "run4" "run5" "run6" "run7" "run8" "run9" "run10" "run11" "run12" "run13" "run14" "run15") +sample_list=("Y5KO_fCLIP" "WT_fCLIP") + +# to run +run_list=("run14") +sample_list=("Y5KO_fCLIP") + +if [[ $create == "Y" ]]; then + for run_id in ${run_list[@]}; do + for sample_id in ${sample_list[@]}; do + sbatch_name="marco_${sample_id}_${run_id}.sh" + cp sbatch_template_${run_id}.sh $sbatch_name + sed -i "s/FILL_SAMPLE/$sample_id/g" $sbatch_name + sed -i "s/FILL_RUN/$run_id/g" $sbatch_name + done + done +fi +if [[ $submit == "Y" ]]; then + for run_id in ${run_list[@]}; do + for sample_id in ${sample_list[@]}; do + sbatch_name="marco_${sample_id}_${run_id}.sh" + + # submit sbatch + sbatch --cpus-per-task=32 --verbose \ + --output=/data/sevillas2/marco_star/logs/%j.out \ + --mem=200g --gres=lscratch:800 --time 1-00:00:00 \ + --error=/data/sevillas2/marco_star/logs/%j.err \ + $sbatch_name + done + done +fi + +# --mem=80g --gres=lscratch:800 --time 1-00:00:00 \ From 7074411fc8bdcbbc14c0b59e25c7cb80e111a938 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Fri, 16 Sep 2022 13:26:09 -0400 Subject: [PATCH 11/19] add scripts from star param testing --- build/STAR_testing/sh_scripts/sbatch.sh | 11 + .../STAR_testing/sh_scripts/star_commands.sh | 410 ++++++++++++++++ .../sh_scripts/star_comparison.Rmd | 37 ++ .../sh_scripts/star_comparison.html | 447 ++++++++++++++++++ build/STAR_testing/sh_scripts/stats.Rmd | 36 ++ build/STAR_testing/sh_scripts/stats.csv | 71 +++ build/STAR_testing/sh_scripts/stats.html | 445 +++++++++++++++++ build/STAR_testing/sh_scripts/submit_star.sh | 31 ++ 8 files changed, 1488 insertions(+) create mode 100644 build/STAR_testing/sh_scripts/sbatch.sh create mode 100644 build/STAR_testing/sh_scripts/star_commands.sh create mode 100644 build/STAR_testing/sh_scripts/star_comparison.Rmd create mode 100644 build/STAR_testing/sh_scripts/star_comparison.html create mode 100644 build/STAR_testing/sh_scripts/stats.Rmd create mode 100644 build/STAR_testing/sh_scripts/stats.csv create mode 100644 build/STAR_testing/sh_scripts/stats.html create mode 100644 build/STAR_testing/sh_scripts/submit_star.sh diff --git a/build/STAR_testing/sh_scripts/sbatch.sh b/build/STAR_testing/sh_scripts/sbatch.sh new file mode 100644 index 0000000..a4000c0 --- /dev/null +++ b/build/STAR_testing/sh_scripts/sbatch.sh @@ -0,0 +1,11 @@ +#sbatch --cpus-per-task=56 --verbose --output=/data/sevillas2/star/logs/%j.out --mem=75g --gres=lscratch:800 --time 10:00:00 --error=/data/sevillas2/star/logs/%j.err /data/sevillas2/star/logs/Ro7hr2_Clip_10.sh +sbatch --cpus-per-task=56 --verbose --output=/data/sevillas2/star/logs/%j.out --mem=75g --gres=lscratch:800 --time 10:00:00 --error=/data/sevillas2/star/logs/%j.err /data/sevillas2/star/logs/Ro7hr2_Clip_50.sh +sbatch --cpus-per-task=56 --verbose --output=/data/sevillas2/star/logs/%j.out --mem=75g --gres=lscratch:800 --time 10:00:00 --error=/data/sevillas2/star/logs/%j.err /data/sevillas2/star/logs/Ro7hr2_Clip_1000.sh +sbatch --cpus-per-task=56 --verbose --output=/data/sevillas2/star/logs/%j.out --mem=75g --gres=lscratch:800 --time 10:00:00 --error=/data/sevillas2/star/logs/%j.err /data/sevillas2/star/logs/Ro7hr2_Clip_5000.sh +sbatch --cpus-per-task=56 --verbose --output=/data/sevillas2/star/logs/%j.out --mem=75g --gres=lscratch:800 --time 10:00:00 --error=/data/sevillas2/star/logs/%j.err /data/sevillas2/star/logs/Ro7hr2_Clip_10000.sh + +sbatch --cpus-per-task=56 --verbose --output=/data/sevillas2/star/logs/%j.out --mem=75g --gres=lscratch:800 --time 10:00:00 --error=/data/sevillas2/star/logs/%j.err /data/sevillas2/star/logs/YKO7hr_Clip_10.sh +sbatch --cpus-per-task=56 --verbose --output=/data/sevillas2/star/logs/%j.out --mem=75g --gres=lscratch:800 --time 10:00:00 --error=/data/sevillas2/star/logs/%j.err /data/sevillas2/star/logs/YKO7hr_Clip_50.sh +sbatch --cpus-per-task=56 --verbose --output=/data/sevillas2/star/logs/%j.out --mem=75g --gres=lscratch:800 --time 10:00:00 --error=/data/sevillas2/star/logs/%j.err /data/sevillas2/star/logs/YKO7hr_Clip_1000.sh +sbatch --cpus-per-task=56 --verbose --output=/data/sevillas2/star/logs/%j.out --mem=75g --gres=lscratch:800 --time 10:00:00 --error=/data/sevillas2/star/logs/%j.err /data/sevillas2/star/logs/YKO7hr_Clip_5000.sh +sbatch --cpus-per-task=56 --verbose --output=/data/sevillas2/star/logs/%j.out --mem=75g --gres=lscratch:800 --time 10:00:00 --error=/data/sevillas2/star/logs/%j.err /data/sevillas2/star/logs/YKO7hr_Clip_10000.sh \ No newline at end of file diff --git a/build/STAR_testing/sh_scripts/star_commands.sh b/build/STAR_testing/sh_scripts/star_commands.sh new file mode 100644 index 0000000..d7010b1 --- /dev/null +++ b/build/STAR_testing/sh_scripts/star_commands.sh @@ -0,0 +1,410 @@ +#!/bin/bash + +function create_sh() +{ + cp /data/sevillas2/star/logs/submit_star.sh $sh_file + sed -i "s/fill_sample/$sample_id/g" $sh_file + sed -i "s/fill_project/$project_id/g" $sh_file + sed -i "s/fill_trial/$trial/g" $sh_file + sed -i "s/fill_seed/$seedPerWindowNmax/g" $sh_file + sed -i "s/fill_anchor/$winAnchorMultimapNmax/g" $sh_file +} + +function set_variables() +{ + trial="${cpu}_${mem}_${seedPerWindowNmax}_${winAnchorMultimapNmax}" + output_dir="/data/sevillas2/star/${sample_id}/${trial}" + sh_file="/data/sevillas2/star/logs/${sample_id}_${trial}.sh" + + if [[ ! -d $output_dir ]]; then mkdir -p $output_dir; fi +} + +function submit_sh() +{ + sbatch --cpus-per-task=${cpu} --verbose --output=/data/sevillas2/star/logs/%j.out --mem=${mem} --gres=lscratch:800 --time 10:00:00 --error=/data/sevillas2/star/logs/%j.err $sh_file +} + +function stats() +{ + val=`cat $output_dir/$sample_id.out | grep "Uniquely mapped reads %"` + descrip="unique%" + cleanup + val=`cat $output_dir/$sample_id.out | grep "Number of splices: Total"` + descrip="splice_num" + cleanup + val=`cat $output_dir/$sample_id.out | grep "Number of splices: Annotated (sjdb)"` + descrip="splice_anno_num" + cleanup + val=`cat $output_dir/$sample_id.out | grep "% of reads mapped to multiple loci"` + descrip="mm%" + cleanup + val=`cat $output_dir/$sample_id.out | grep "% of reads mapped to too many loci"` + descrip="unmapped_toomany_%" + cleanup + val=`cat $output_dir/$sample_id.out | grep "% of reads unmapped: too short"` + descrip="unmapped_tooshort_%" + cleanup + val=`cat $output_dir/$sample_id.out | grep "% of reads unmapped: other"` + descrip="unmapped_other_%" + cleanup +} + +function cleanup () +{ + local cleanup=`echo $val | cut -f2 -d"|" | sed -s "s/\t//g" | sed -s "s/%//g" | sed -s "s/ //g"` + echo "$cleanup,$descrip,$sample_id,$cpu,$mem,$seedPerWindowNmax,$winAnchorMultimapNmax" >> stats.csv + +} + +# create output +touch stats.csv +echo "val,descrip,sample_id,cpu,mem,seed,anchor" > stats.csv +################################################################################################ +project_id="mESC_clip_4_v2.0" +################################################################################################ +sample_id="Control1hr_Clip" +seedPerWindowNmax="5000" +winAnchorMultimapNmax="10000" +cpu="32" +mem="75g" + +set_variables +create_sh +#submit_sh +stats +################################################ +sample_id="Control1hr_Clip" +seedPerWindowNmax="1000" +winAnchorMultimapNmax="10000" +cpu="32" +mem="75g" + +set_variables +create_sh +#submit_sh +stats +################################################ +sample_id="Control1hr_Clip" +seedPerWindowNmax="500" +winAnchorMultimapNmax="10000" +cpu="32" +mem="75g" + +set_variables +create_sh +# submit_sh +stats +################################################ +sample_id="Control1hr_Clip" +seedPerWindowNmax="100" +winAnchorMultimapNmax="10000" +cpu="32" +mem="75g" + +set_variables +create_sh +# submit_sh +stats +################################################ +sample_id="Control1hr_Clip" +seedPerWindowNmax="50" +winAnchorMultimapNmax="10000" +cpu="32" +mem="75g" + +set_variables +create_sh +#submit_sh +stats +################################################ +sample_id="Control1hr_Clip" +seedPerWindowNmax="10" +winAnchorMultimapNmax="10000" +cpu="32" +mem="75g" + +set_variables +create_sh +# submit_sh +stats +################################################ +sample_id="Control1hr_Clip" +seedPerWindowNmax="5" +winAnchorMultimapNmax="10000" +cpu="32" +mem="75g" + +set_variables +create_sh +# submit_sh +# stats - failed +################################################ +sample_id="Control1hr_Clip" +seedPerWindowNmax="1000" +winAnchorMultimapNmax="50" +cpu="32" +mem="75g" + +set_variables +create_sh +# submit_sh +stats +################################################ +sample_id="Control1hr_Clip" +seedPerWindowNmax="500" +winAnchorMultimapNmax="50" +cpu="32" +mem="75g" + +set_variables +create_sh +# submit_sh +stats +################################################ +sample_id="Control1hr_Clip" +seedPerWindowNmax="100" +winAnchorMultimapNmax="50" +cpu="32" +mem="75g" + +set_variables +create_sh +# submit_sh +stats +################################################ +sample_id="Control1hr_Clip" +seedPerWindowNmax="50" +winAnchorMultimapNmax="50" +cpu="32" +mem="75g" + +set_variables +create_sh +# submit_sh +stats +################################################ +sample_id="YKO7hr_Clip" +seedPerWindowNmax="5000" +winAnchorMultimapNmax="10000" +cpu="32" +mem="75g" + +set_variables +create_sh +#submit_sh +################################################ +sample_id="YKO7hr_Clip" +seedPerWindowNmax="1000" +winAnchorMultimapNmax="10000" +cpu="32" +mem="75g" + +set_variables +create_sh +#submit_sh +################################################ +sample_id="YKO7hr_Clip" +seedPerWindowNmax="50" +winAnchorMultimapNmax="10000" +cpu="32" +mem="75g" + +set_variables +create_sh +#submit_sh +################################################ +sample_id="YKO7hr_Clip" +seedPerWindowNmax="10" +winAnchorMultimapNmax="10000" +cpu="32" +mem="75g" + +set_variables +create_sh +#submit_sh +################################################ +sample_id="Ro7hr2_Clip" +seedPerWindowNmax="1000" +winAnchorMultimapNmax="10000" +cpu="32" +mem="75g" + +set_variables +create_sh +#submit_sh +################################################ +sample_id="Ro7hr2_Clip" +seedPerWindowNmax="5000" +winAnchorMultimapNmax="10000" +cpu="32" +mem="75g" + +set_variables +create_sh +#submit_sh +################################################ +sample_id="Ro7hr2_Clip" +seedPerWindowNmax="50" +winAnchorMultimapNmax="10000" +cpu="32" +mem="75g" + +set_variables +create_sh +#submit_sh +################################################ +sample_id="Ro7hr2_Clip" +seedPerWindowNmax="10" +winAnchorMultimapNmax="10000" +cpu="32" +mem="75g" + +set_variables +create_sh +#submit_sh +################################################ +sample_id="Ro1hrNuc_Clip" +seedPerWindowNmax="500" +winAnchorMultimapNmax="10000" +cpu="32" +mem="75g" + +set_variables +create_sh +# submit_sh +################################################ +sample_id="Ro1hrCyt_Clip" +seedPerWindowNmax="500" +winAnchorMultimapNmax="10000" +cpu="32" +mem="75g" + +set_variables +create_sh +# submit_sh +################################################ +sample_id="Ro7hrNuc_Clip" +seedPerWindowNmax="500" +winAnchorMultimapNmax="10000" +cpu="32" +mem="75g" + +set_variables +create_sh +# submit_sh +################################################ +sample_id="Ro7hrCyt_Clip" +seedPerWindowNmax="500" +winAnchorMultimapNmax="10000" +cpu="32" +mem="75g" + +set_variables +create_sh +# submit_sh +################################################ +sample_id="Ro1hrNuc_Clip" +seedPerWindowNmax="500" +winAnchorMultimapNmax="50" +cpu="32" +mem="75g" + +set_variables +create_sh +# submit_sh +################################################ +sample_id="Ro1hrCyt_Clip" +seedPerWindowNmax="500" +winAnchorMultimapNmax="50" +cpu="32" +mem="75g" + +set_variables +create_sh +# submit_sh +################################################ +sample_id="Ro7hrNuc_Clip" +seedPerWindowNmax="500" +winAnchorMultimapNmax="50" +cpu="32" +mem="75g" + +set_variables +create_sh +# submit_sh +################################################ +sample_id="Ro7hrCyt_Clip" +seedPerWindowNmax="500" +winAnchorMultimapNmax="50" +cpu="32" +mem="75g" + +set_variables +create_sh +# submit_sh +################################################################################################ +project_id="mESC_clip_2_v2.0" +################################################################################################ +sample_id="Y_Clip_2" +seedPerWindowNmax="1000" +winAnchorMultimapNmax="10000" +cpu="32" +mem="75g" + +set_variables +create_sh +#submit_sh +################################################ +sample_id="Y_Clip_2" +seedPerWindowNmax="500" +winAnchorMultimapNmax="10000" +cpu="32" +mem="75g" + +set_variables +create_sh +#submit_sh +################################################################################################ +project_id="8-09-21-HaCaT_fCLIP_v2.0" +################################################################################################ +sample_id="Y5KO_fCLIP" +seedPerWindowNmax="1000" +winAnchorMultimapNmax="10000" +cpu="32" +mem="75g" + +set_variables +create_sh +#submit_sh +################################################ +sample_id="Y5KO_fCLIP" +seedPerWindowNmax="500" +winAnchorMultimapNmax="10000" +cpu="32" +mem="75g" + +set_variables +#create_sh - fixed index for hg38 +#submit_sh +################################################################################################ +project_id="mESC_clip_3_clash_v2.0" +################################################################################################ +sample_id="Ro_clash_uc" +seedPerWindowNmax="1000" +winAnchorMultimapNmax="10000" +cpu="32" +mem="75g" + +set_variables +create_sh +#submit_sh +################################################ +sample_id="Ro_clash_uc" +seedPerWindowNmax="500" +winAnchorMultimapNmax="10000" +cpu="32" +mem="75g" + +set_variables +create_sh +#submit_sh diff --git a/build/STAR_testing/sh_scripts/star_comparison.Rmd b/build/STAR_testing/sh_scripts/star_comparison.Rmd new file mode 100644 index 0000000..f22ef8e --- /dev/null +++ b/build/STAR_testing/sh_scripts/star_comparison.Rmd @@ -0,0 +1,37 @@ +--- +title: "Untitled" +author: "Samantha Sevilla" +date: '2022-06-23' +output: html_document +editor_options: + chunk_output_type: console +--- + +```{r} +library(dplyr) +library(ggplot2) +``` + + +```{r} +stats_df=read.csv("~/../../Volumes/data/star/logs/stats.csv") +``` + +```{r} +for (sid in unique(stats_df$sample_id)){ + for (did in unique(stats_df$descrip)){ + p = subset(stats_df, sample_id==sid & descrip==did) %>% + ggplot(aes(x=descrip, y=val)) + + facet_grid(~seed)+ + geom_bar(stat="identity", fill="steelblue")+ + geom_text(aes(label=val), vjust=1.6, color="white", size=3.5)+ + xlab(did)+ + theme_minimal() + + theme(axis.text.x=element_blank(), + axis.ticks.x=element_blank() + ) + print(p) + } +} + +``` diff --git a/build/STAR_testing/sh_scripts/star_comparison.html b/build/STAR_testing/sh_scripts/star_comparison.html new file mode 100644 index 0000000..8469f18 --- /dev/null +++ b/build/STAR_testing/sh_scripts/star_comparison.html @@ -0,0 +1,447 @@ + + + + + + + + + + + + + + + +Untitled + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
library(dplyr)
+
## 
+## Attaching package: 'dplyr'
+
## The following objects are masked from 'package:stats':
+## 
+##     filter, lag
+
## The following objects are masked from 'package:base':
+## 
+##     intersect, setdiff, setequal, union
+
library(ggplot2)
+
stats_df=read.csv("~/../../Volumes/data/star/logs/stats.csv")
+
for (sid in unique(stats_df$sample_id)){
+  for (did in unique(stats_df$descrip)){
+    p = subset(stats_df, sample_id==sid & descrip==did) %>%
+      ggplot(aes(x=descrip, y=val)) +
+      facet_grid(~seed)+
+      geom_bar(stat="identity", fill="steelblue")+
+      geom_text(aes(label=val), vjust=1.6, color="white", size=3.5)+
+      xlab(did)+
+      theme_minimal() + 
+      theme(axis.text.x=element_blank(),
+        axis.ticks.x=element_blank() 
+        )
+    print(p)
+  }
+}
+

+ + + + +
+ + + + + + + + + + + + + + + diff --git a/build/STAR_testing/sh_scripts/stats.Rmd b/build/STAR_testing/sh_scripts/stats.Rmd new file mode 100644 index 0000000..f92747f --- /dev/null +++ b/build/STAR_testing/sh_scripts/stats.Rmd @@ -0,0 +1,36 @@ +--- +title: "STAR_anchor_seed_testing" +author: "Samantha Sevilla" +date: "7/5;2022" +output: pdf_document +editor_options: + chunk_output_type: console +--- + + +```{r} +library(dplyr) +library(ggplot2) +``` + + +```{r} +stats_df=read.csv("~/../../Volumes/data/star/logs/stats.csv") +``` + +```{r} +# print for 10000 anchor +for (sid in unique(stats_df$sample_id)){ + for (did in unique(stats_df$descrip)){ + p = subset(stats_df, sample_id==sid & descrip==did) %>% + ggplot(aes(x=descrip, y=val)) + + facet_grid(anchor~seed)+ + geom_bar(stat="identity", fill="steelblue")+ + geom_text(aes(label=val), vjust=1.6, color="white", size=3.5)+ + ggtitle(paste0(sid, " anchor (row) by seed (col)")) + + theme(axis.text.x=element_blank())+ + xlab(did) + print(p) + } +} +``` \ No newline at end of file diff --git a/build/STAR_testing/sh_scripts/stats.csv b/build/STAR_testing/sh_scripts/stats.csv new file mode 100644 index 0000000..35367e5 --- /dev/null +++ b/build/STAR_testing/sh_scripts/stats.csv @@ -0,0 +1,71 @@ +val,descrip,sample_id,cpu,mem,seed,anchor +76.12,unique%,Control1hr_Clip,32,75g,5000,10000 +2983,splice_num,Control1hr_Clip,32,75g,5000,10000 +589,splice_anno_num,Control1hr_Clip,32,75g,5000,10000 +4.30,mm%,Control1hr_Clip,32,75g,5000,10000 +0.00,unmapped_toomany_%,Control1hr_Clip,32,75g,5000,10000 +19.59,unmapped_tooshort_%,Control1hr_Clip,32,75g,5000,10000 +0.00,unmapped_other_%,Control1hr_Clip,32,75g,5000,10000 +76.12,unique%,Control1hr_Clip,32,75g,1000,10000 +2983,splice_num,Control1hr_Clip,32,75g,1000,10000 +589,splice_anno_num,Control1hr_Clip,32,75g,1000,10000 +4.30,mm%,Control1hr_Clip,32,75g,1000,10000 +0.00,unmapped_toomany_%,Control1hr_Clip,32,75g,1000,10000 +19.59,unmapped_tooshort_%,Control1hr_Clip,32,75g,1000,10000 +0.00,unmapped_other_%,Control1hr_Clip,32,75g,1000,10000 +76.12,unique%,Control1hr_Clip,32,75g,500,10000 +2983,splice_num,Control1hr_Clip,32,75g,500,10000 +589,splice_anno_num,Control1hr_Clip,32,75g,500,10000 +4.30,mm%,Control1hr_Clip,32,75g,500,10000 +0.00,unmapped_toomany_%,Control1hr_Clip,32,75g,500,10000 +19.59,unmapped_tooshort_%,Control1hr_Clip,32,75g,500,10000 +0.00,unmapped_other_%,Control1hr_Clip,32,75g,500,10000 +76.07,unique%,Control1hr_Clip,32,75g,100,10000 +2982,splice_num,Control1hr_Clip,32,75g,100,10000 +589,splice_anno_num,Control1hr_Clip,32,75g,100,10000 +4.30,mm%,Control1hr_Clip,32,75g,100,10000 +0.00,unmapped_toomany_%,Control1hr_Clip,32,75g,100,10000 +19.55,unmapped_tooshort_%,Control1hr_Clip,32,75g,100,10000 +0.09,unmapped_other_%,Control1hr_Clip,32,75g,100,10000 +76.01,unique%,Control1hr_Clip,32,75g,50,10000 +2950,splice_num,Control1hr_Clip,32,75g,50,10000 +589,splice_anno_num,Control1hr_Clip,32,75g,50,10000 +4.24,mm%,Control1hr_Clip,32,75g,50,10000 +0.00,unmapped_toomany_%,Control1hr_Clip,32,75g,50,10000 +19.33,unmapped_tooshort_%,Control1hr_Clip,32,75g,50,10000 +0.42,unmapped_other_%,Control1hr_Clip,32,75g,50,10000 +16.63,unique%,Control1hr_Clip,32,75g,10,10000 +1876,splice_num,Control1hr_Clip,32,75g,10,10000 +589,splice_anno_num,Control1hr_Clip,32,75g,10,10000 +3.58,mm%,Control1hr_Clip,32,75g,10,10000 +0.00,unmapped_toomany_%,Control1hr_Clip,32,75g,10,10000 +15.59,unmapped_tooshort_%,Control1hr_Clip,32,75g,10,10000 +64.20,unmapped_other_%,Control1hr_Clip,32,75g,10,10000 +76.13,unique%,Control1hr_Clip,32,75g,1000,50 +1604,splice_num,Control1hr_Clip,32,75g,1000,50 +589,splice_anno_num,Control1hr_Clip,32,75g,1000,50 +4.01,mm%,Control1hr_Clip,32,75g,1000,50 +0.00,unmapped_toomany_%,Control1hr_Clip,32,75g,1000,50 +19.83,unmapped_tooshort_%,Control1hr_Clip,32,75g,1000,50 +0.03,unmapped_other_%,Control1hr_Clip,32,75g,1000,50 +76.13,unique%,Control1hr_Clip,32,75g,500,50 +1604,splice_num,Control1hr_Clip,32,75g,500,50 +589,splice_anno_num,Control1hr_Clip,32,75g,500,50 +4.01,mm%,Control1hr_Clip,32,75g,500,50 +0.00,unmapped_toomany_%,Control1hr_Clip,32,75g,500,50 +19.83,unmapped_tooshort_%,Control1hr_Clip,32,75g,500,50 +0.03,unmapped_other_%,Control1hr_Clip,32,75g,500,50 +76.13,unique%,Control1hr_Clip,32,75g,100,50 +1604,splice_num,Control1hr_Clip,32,75g,100,50 +589,splice_anno_num,Control1hr_Clip,32,75g,100,50 +4.01,mm%,Control1hr_Clip,32,75g,100,50 +0.00,unmapped_toomany_%,Control1hr_Clip,32,75g,100,50 +19.83,unmapped_tooshort_%,Control1hr_Clip,32,75g,100,50 +0.03,unmapped_other_%,Control1hr_Clip,32,75g,100,50 +76.13,unique%,Control1hr_Clip,32,75g,50,50 +1604,splice_num,Control1hr_Clip,32,75g,50,50 +589,splice_anno_num,Control1hr_Clip,32,75g,50,50 +4.00,mm%,Control1hr_Clip,32,75g,50,50 +0.00,unmapped_toomany_%,Control1hr_Clip,32,75g,50,50 +19.83,unmapped_tooshort_%,Control1hr_Clip,32,75g,50,50 +0.03,unmapped_other_%,Control1hr_Clip,32,75g,50,50 diff --git a/build/STAR_testing/sh_scripts/stats.html b/build/STAR_testing/sh_scripts/stats.html new file mode 100644 index 0000000..c0a63fb --- /dev/null +++ b/build/STAR_testing/sh_scripts/stats.html @@ -0,0 +1,445 @@ + + + + + + + + + + + + + + +STAR_anchor_seed_testing + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
library(dplyr)
+
## 
+## Attaching package: 'dplyr'
+
## The following objects are masked from 'package:stats':
+## 
+##     filter, lag
+
## The following objects are masked from 'package:base':
+## 
+##     intersect, setdiff, setequal, union
+
library(ggplot2)
+
stats_df=read.csv("~/../../Volumes/data/star/logs/stats.csv")
+
# print for 10000 anchor
+for (sid in unique(stats_df$sample_id)){
+  for (did in unique(stats_df$descrip)){
+    p = subset(stats_df, sample_id==sid & descrip==did) %>%
+      ggplot(aes(x=descrip, y=val)) +
+      facet_grid(anchor~seed)+
+      geom_bar(stat="identity", fill="steelblue")+
+      geom_text(aes(label=val), vjust=1.6, color="white", size=3.5)+
+      ggtitle(paste0(sid, " anchor (row) by seed (col)")) + 
+      theme(axis.text.x=element_blank())+
+      xlab(did)
+    print(p)
+  }
+}
+

+ + + + +
+ + + + + + + + + + + + + + + diff --git a/build/STAR_testing/sh_scripts/submit_star.sh b/build/STAR_testing/sh_scripts/submit_star.sh new file mode 100644 index 0000000..3bd534c --- /dev/null +++ b/build/STAR_testing/sh_scripts/submit_star.sh @@ -0,0 +1,31 @@ +#!/bin/bash +module load STAR +sample_id="fill_sample" +project_id="fill_project" +seedPerWindowNmax="fill_seed" +winAnchorMultimapNmax="fill_anchor" +output_dir="/data/sevillas2/star/${sample_id}/fill_trial" + +input_fq="/data/RBL_NCI/Wolin/$project_id/01_preprocess/01_fastq/${sample_id}.fastq.gz"; +sample_id_prefix="${sample_id}_" + +tmp_dir="/lscratch/${SLURM_JOB_ID}" +export tmp_dir + +STAR --runMode alignReads --genomeDir /data/CCBR_Pipeliner/iCLIP/index/active/2022_0505/mm10/index \ +--sjdbGTFfile /data/CCBR_Pipeliner/iCLIP/index/active/2022_0505/mm10/ref/gencode.vM23.annotation.gtf --readFilesCommand zcat \ +--readFilesIn $input_fq \ +--outFileNamePrefix $tmp_dir/${sample_id_prefix} \ +--outReadsUnmapped Fastx --outSAMtype BAM SortedByCoordinate --alignEndsType Local --alignIntronMax 50000 --alignSJDBoverhangMin 3 \ +--alignSJoverhangMin 5 --alignTranscriptsPerReadNmax 10000 --alignWindowsPerReadNmax 10000 --outFilterMatchNmin 15 --outFilterMatchNminOverLread 0.9 \ +--outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 --outFilterMultimapNmax 10000 --outFilterMultimapScoreRange 0 --outFilterScoreMin 0 \ +--outFilterType Normal --outSAMattributes All --outSAMunmapped None --outSJfilterCountTotalMin 3 1 1 1 --outSJfilterOverhangMin 30 12 12 12 \ +--outSJfilterReads All --seedMultimapNmax 10000 --seedNoneLociPerWindow 20 --seedPerReadNmax 10000 --seedPerWindowNmax $seedPerWindowNmax --sjdbScore 2 --winAnchorMultimapNmax $winAnchorMultimapNmax + +# move STAR files and final log file to output +mv $tmp_dir/${sample_id_prefix}Aligned.sortedByCoord.out.bam ${output_dir}/${sample_id}.bam; +mv $tmp_dir/${sample_id_prefix}Log.final.out ${output_dir}/${sample_id}.out; + +# move mates to unmapped file +touch ${output_dir}/${sample_id}.unmapped.out; +for f in $tmp_dir/${sample_id_prefix}Unmapped.out.mate*; do cat $f >> ${output_dir}/${sample_id}.unmapped.out; done \ No newline at end of file From 37777ddbd9507c40bc2486cd0792b60539bbdcdc Mon Sep 17 00:00:00 2001 From: slsevilla Date: Mon, 19 Sep 2022 13:20:07 -0400 Subject: [PATCH 12/19] add qc check for unmapped read percentage --- config/snakemake_config.yaml | 2 + workflow/Snakefile | 23 ++++++++-- workflow/scripts/03_qc_unmapped_check.sh | 53 ++++++++++++++++++++++++ 3 files changed, 75 insertions(+), 3 deletions(-) create mode 100644 workflow/scripts/03_qc_unmapped_check.sh diff --git a/config/snakemake_config.yaml b/config/snakemake_config.yaml index a6788a8..87c6857 100644 --- a/config/snakemake_config.yaml +++ b/config/snakemake_config.yaml @@ -37,6 +37,8 @@ MNormDistance: 25 #Summit-to-summit distance cutoff for common peaks. [ any inte sampleoverlap: 1 #if DEmethod DIFFBIND, minimum number of samples a peak must be found in to be counted [>1] pval: 0.005 #if DEmethod DIFFBIND or MANNORM, pval cutoff for significance fc: 1 #if DEmethod DIFFBIND or MANNORM, fold change cut off for significance +single_qc: 95 #maximum threshold for unmampped reads in any single sample +proj_qc: 50 #maximum threshold for unmapped reads across average of all project samples ######################################################################################### # STAR parameters diff --git a/workflow/Snakefile b/workflow/Snakefile index b1b97c8..1a15f69 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -56,6 +56,8 @@ manorm_d = config['MNormDistance'] sample_overlap = int(config['sampleoverlap']) pval = config['pval'] fc = config['fc'] +single_qc = config['single_qc_threshold'], +proj_qc = config['project_qc_threshold'] # STAR parameters star_bam_limit = "50297600554" @@ -871,10 +873,15 @@ if (multiplex_flag == 'Y'): params: rname = "08_qc_troubleshoot", R = join(source_dir,'workflow','scripts','03_qc_report.Rmd'), + qc = join(source_dir,'workflow','scripts','03_qc_unmapped_check.sh'), + log_dir = join(out_dir,'log','STAR') + single_qc = single_qc_threshold, + proj_qc = project_qc_threshold envmodules: config['R'] output: - o1 = join(out_dir,'qc','qc_report_troubleshooting.html') + html = join(out_dir,'qc','qc_report_troubleshooting.html'), + qc = join(out_dir, 'log','STAR','qc_pass.txt') shell: """ Rscript -e 'library(rmarkdown); \ @@ -882,6 +889,8 @@ if (multiplex_flag == 'Y'): output_file = "{output.o1}", \ params= list(log_list = "{input.log_list}", \ b_txt = "{input.txt_bc}"))' + + sh {params.qc} {params.log_dir} {params.single_qc} {params.proj_qc} """ else: rule qc_troubleshoot: @@ -893,16 +902,23 @@ else: params: rname = "08_qc_troubleshoot", R = join(source_dir,'workflow','scripts','03_qc_report.Rmd'), + qc = join(source_dir,'workflow','scripts','03_qc_unmapped_check.sh'), + log_dir = join(out_dir,'log','STAR'), + single_qc = single_qc_threshold, + proj_qc = project_qc_threshold envmodules: config['R'] output: - o1 = join(out_dir,'qc','qc_report_troubleshooting.html') + html = join(out_dir,'qc','qc_report_troubleshooting.html'), + qc = join(out_dir, 'log','STAR','qc_pass.txt') shell: """ Rscript -e 'library(rmarkdown); \ rmarkdown::render("{params.R}", output_file = "{output.o1}", \ params= list(log_list = "{input.log_list}"))' + + sh {params.qc} {params.log_dir} {params.single_qc} {params.proj_qc} """ rule dedup: @@ -912,7 +928,8 @@ rule dedup: get header of dedup file """ input: - f1 = rules.index_stats.output.bam + f1 = rules.index_stats.output.bam, + qc = rules.qc_troubleshoot.output.qc params: rname='09_dedup', umi = umi_sep, diff --git a/workflow/scripts/03_qc_unmapped_check.sh b/workflow/scripts/03_qc_unmapped_check.sh new file mode 100644 index 0000000..c3276bd --- /dev/null +++ b/workflow/scripts/03_qc_unmapped_check.sh @@ -0,0 +1,53 @@ +# set args +qc_dir="$1" +single_qc_threshold="$2" +project_qc_threshold="$3" + +#qc_dir="/data/RBL_NCI/Wolin/8-09-21-HaCaT_fCLIP_v2.0_2/log/STAR" +qc_failure=$qc_dir/qc_failure.txt +qc_pass=$qc_dir/qc_pass.txt + +#single_qc_threshold=95 +#project_qc_threshold=50 + +# set counters +total_unmapped=0 +counter=0 + +# create logs +if [[ -f $qc_failure ]]; then rm $qc_failure; fi; touch $qc_failure +if [[ -f $qc_pass ]]; then rm $qc_pass; fi; touch $qc_pass + +# for each of the STAR log files, determine unmapped read percentages +for f in $qc_dir/*; do + mismatch=`cat $f | grep "% of reads unmapped: too many mismatches" | cut -f2 -d"|" | cut -f1 -d"." | sed "s/%//g"` + tooshort=`cat $f | grep "% of reads unmapped: too short" | cut -f2 -d"|" | cut -f1 -d"." | sed "s/%//g"` + other=`cat $f | grep "% of reads unmapped: other" | cut -f2 -d"|" | cut -f1 -d"." | sed "s/%//g"` + + #check unmapped in any one samples is under treshold + single_unmapped=$((mismatch + tooshort + other)) + if [[ $single_unmapped -gt $single_qc_threshold ]]; then + echo "Sample $f does not meet the requirements ($single_qc_threshold%), with a percent unampped reads of $single_unmapped%. Please review details and if acceptable \ + increase the single_qc_threshold accordingly. If not acceptable, remove sample from sample_manifest and multiplex_manifest to continue" >> $qc_failure + fi + + # save single to total unmapped + total_unmapped=$((total_unmapped + single_unmapped)) + counter=$((counter+1)) +done + +# check total unmapped across all samples is under treshold +average_unmapped=$((total_unmapped / counter)) +if [[ $average_unmapped -gt $project_qc_threshold ]]; then + echo "Project unmapped percentages do not meet the requirements ($project_qc_threshold%), with an averaged percent unampped reads of $average_unmapped%. Please review details and if acceptable \ + increase the project_qc_threshold accordingly. If not acceptable, remove sample{s} from sample_manifest and multiplex_manifest to continue". >> $qc_failure +fi + +# # check if project has any failures, if so create failure log and fail project +length_fails=`cat $qc_failure | wc -l ` +if [[ $length_fails -eq 0 ]]; then + rm $qc_failure +else + rm $qc_pass + echo "fail qc" +fi \ No newline at end of file From 52f73e2ce630abfe6481cea4a377a7c1127d369d Mon Sep 17 00:00:00 2001 From: slsevilla Date: Mon, 19 Sep 2022 13:24:22 -0400 Subject: [PATCH 13/19] add wilfried to contributors --- docs/iCLIP/contributions.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/iCLIP/contributions.md b/docs/iCLIP/contributions.md index 2e3c395..999f758 100644 --- a/docs/iCLIP/contributions.md +++ b/docs/iCLIP/contributions.md @@ -5,10 +5,11 @@ The following members contributed to the development of the iCLIP pipeline: - [Phillip Homan](https://github.com/phoman14) - [Vishal Koparde](https://github.com/kopardev) - [Sklyer Kuhn](https://github.com/skchronicles) +- [Wilfried Guiblet](https://github.com/wilfriedguiblet) - Parthav Jailwala - Sandra Wolin - Marco Boccitto - Yuanyuan Leng - Soyeong Sim -SS, PH, VP, and SK contributed to the generating the source code and all members contributed to the main concepts and analysis. \ No newline at end of file +SS, PH, VP, SK, WG contributed to the generating the source code and all members contributed to the main concepts and analysis. \ No newline at end of file From 8849cbc78b1d84d9cfefd79bcc3b231dd64225b7 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Mon, 19 Sep 2022 13:24:42 -0400 Subject: [PATCH 14/19] update for v2.2 --- docs/iCLIP/test.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/iCLIP/test.md b/docs/iCLIP/test.md index 385e671..dd2b93b 100644 --- a/docs/iCLIP/test.md +++ b/docs/iCLIP/test.md @@ -2,7 +2,7 @@ Welcome to the iCLIP Pipeline Tutorial! ## 5.1 Getting Started -Review the information on the [Getting Started](https://rbl-nci.github.io/iCLIP/iCLIP/getting-started/) for a complete overview the pipeline. The tutorial below will use test data available on NIH Biowulf HPC only. All example code will assume you are running v2.1 of the pipeline, from the shared RBL_NCI storage directory, using test_1 data. +Review the information on the [Getting Started](https://rbl-nci.github.io/iCLIP/iCLIP/getting-started/) for a complete overview the pipeline. The tutorial below will use test data available on NIH Biowulf HPC only. All example code will assume you are running v2.2 of the pipeline, from the shared RBL_NCI storage directory, using test_1 data. A. Change working directory to the iCLIP repository ``` @@ -10,7 +10,7 @@ A. Change working directory to the iCLIP repository cd /data/RBL_NCI/Pipelines/iCLIP/[version number] # example -cd /data/RBL_NCI/Pipelines/iCLIP/v2.1 +cd /data/RBL_NCI/Pipelines/iCLIP/v2.2 ``` B. Initialize Pipeline @@ -28,7 +28,7 @@ A. Four different test data sets are available, depending on the need. These inc - test_4: DIFFBIND test (multiplex_flag="N", splice_aware="Y", DE_method="DIFFBIND") B. Pull the test data to your output directory -NOTE: Test data is currently available for v1.8, v2.0, v2.1. Please contact samantha.sevilla@nih.gov to create other test data. +NOTE: Test data is currently available for v1.8, v2.0, v2.1, v2.2. Please contact samantha.sevilla@nih.gov to create other test data. ``` # general format @@ -38,11 +38,11 @@ sh /data/CCBR_Pipeliner/iCLIP/test/run_test.sh \ -s /path/to/source/dir -o /path/to/output/dir -# example running test_1, v2.1: +# example running test_1, v2.2: sh /data/CCBR_Pipeliner/iCLIP/test/run_test.sh \ -t test_1 \ - -v v2.1 \ - -s /data/RBL_NCI/Pipelines/iCLIP/v2.1 \ + -v v2.2 \ + -s /data/RBL_NCI/Pipelines/iCLIP/v2.2 \ -o /path/to/output/dir ``` From 882b5036dd343c7248ca9f848e762767d3bd8ca3 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Mon, 19 Sep 2022 13:26:49 -0400 Subject: [PATCH 15/19] fix incorrect variable name --- config/snakemake_config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/snakemake_config.yaml b/config/snakemake_config.yaml index 87c6857..08e0d99 100644 --- a/config/snakemake_config.yaml +++ b/config/snakemake_config.yaml @@ -37,8 +37,8 @@ MNormDistance: 25 #Summit-to-summit distance cutoff for common peaks. [ any inte sampleoverlap: 1 #if DEmethod DIFFBIND, minimum number of samples a peak must be found in to be counted [>1] pval: 0.005 #if DEmethod DIFFBIND or MANNORM, pval cutoff for significance fc: 1 #if DEmethod DIFFBIND or MANNORM, fold change cut off for significance -single_qc: 95 #maximum threshold for unmampped reads in any single sample -proj_qc: 50 #maximum threshold for unmapped reads across average of all project samples +single_qc_threshold: 95 #maximum threshold for unmampped reads in any single sample +project_qc_threshold: 50 #maximum threshold for unmapped reads across average of all project samples ######################################################################################### # STAR parameters From 5a9f91815e46571a2317c6228ec0abda797cdc13 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Mon, 19 Sep 2022 14:29:30 -0400 Subject: [PATCH 16/19] qc input/config errors --- workflow/Snakefile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 1a15f69..734e522 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -56,8 +56,8 @@ manorm_d = config['MNormDistance'] sample_overlap = int(config['sampleoverlap']) pval = config['pval'] fc = config['fc'] -single_qc = config['single_qc_threshold'], -proj_qc = config['project_qc_threshold'] +single_qc_threshold = config['single_qc_threshold'], +project_qc_threshold = config['project_qc_threshold'] # STAR parameters star_bam_limit = "50297600554" @@ -874,7 +874,7 @@ if (multiplex_flag == 'Y'): rname = "08_qc_troubleshoot", R = join(source_dir,'workflow','scripts','03_qc_report.Rmd'), qc = join(source_dir,'workflow','scripts','03_qc_unmapped_check.sh'), - log_dir = join(out_dir,'log','STAR') + log_dir = join(out_dir,'log','STAR'), single_qc = single_qc_threshold, proj_qc = project_qc_threshold envmodules: @@ -886,7 +886,7 @@ if (multiplex_flag == 'Y'): """ Rscript -e 'library(rmarkdown); \ rmarkdown::render("{params.R}", - output_file = "{output.o1}", \ + output_file = "{output.html}", \ params= list(log_list = "{input.log_list}", \ b_txt = "{input.txt_bc}"))' @@ -915,7 +915,7 @@ else: """ Rscript -e 'library(rmarkdown); \ rmarkdown::render("{params.R}", - output_file = "{output.o1}", \ + output_file = "{output.html}", \ params= list(log_list = "{input.log_list}"))' sh {params.qc} {params.log_dir} {params.single_qc} {params.proj_qc} From bdd7f8cfb23dccbb717674cd9bd18430f8ed5666 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Mon, 19 Sep 2022 14:31:39 -0400 Subject: [PATCH 17/19] change jobname --- run_snakemake.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run_snakemake.sh b/run_snakemake.sh index 61a69ea..491eac1 100755 --- a/run_snakemake.sh +++ b/run_snakemake.sh @@ -409,7 +409,7 @@ elif [[ $pipeline = "cluster" ]] || [[ $pipeline = "local" ]]; then cat > ${output_dir}/submit_script.sbatch << EOF #!/bin/bash -#SBATCH --job-name=iCLIP_%j +#SBATCH --job-name=iCLIP_v2.2 #SBATCH --mem=40g #SBATCH --gres=lscratch:200 #SBATCH --time=10-00:00:00 From 16fc27b2df2a077c04bd0118673cf753bf817574 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Mon, 19 Sep 2022 14:35:32 -0400 Subject: [PATCH 18/19] add echo for DAG output --- run_snakemake.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/run_snakemake.sh b/run_snakemake.sh index 491eac1..35f8565 100755 --- a/run_snakemake.sh +++ b/run_snakemake.sh @@ -511,6 +511,8 @@ elif [[ $pipeline = "DAG" ]]; then -s "${output_dir}/config/Snakefile" \ --configfile ${output_dir}/config/snakemake_config.yaml \ --rulegraph | dot -Tpdf > ${output_dir}/log/dag.pdf + + echo "DAG is available at ${output_dir}/log/dag.pdf" ######################## Report ####################### elif [[ $pipeline = "report" ]]; then echo "------------------------------------------------------------------------" From 861bc62eba0c362176bfb5ec4abb0d41c967c264 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Wed, 21 Sep 2022 12:58:02 -0400 Subject: [PATCH 19/19] fix variable error --- workflow/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 734e522..fe3fbaf 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -785,7 +785,7 @@ rule star: --winAnchorMultimapNmax {params.s_anchor} # sort file - samtools sort -m 80G -T $tmp_dir $tmp_dir/${params.out_prefix}Aligned.out.bam -o $tmp_dir/${params.out_prefix}Aligned.sortedByCoord.out.bam + samtools sort -m 80G -T $tmp_dir $tmp_dir/{params.out_prefix}Aligned.out.bam -o $tmp_dir/{params.out_prefix}Aligned.sortedByCoord.out.bam # move STAR files and final log file to output mv $tmp_dir/{params.out_prefix}Aligned.sortedByCoord.out.bam {output.bam}