Skip to content

Commit

Permalink
Merge pull request #1 from ratschlab/[email protected]
Browse files Browse the repository at this point in the history
[release]
  • Loading branch information
akahles authored Jul 29, 2021
2 parents 60e9444 + e94df44 commit 836d363
Show file tree
Hide file tree
Showing 70 changed files with 21,479 additions and 0 deletions.
5 changes: 5 additions & 0 deletions genome-alignment-star/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
.gitignore
.nextflow*
tests
work
outdir
55 changes: 55 additions & 0 deletions genome-alignment-star/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
FROM ratschlab/wf-base-miniconda4.8.3:v1.0.1

LABEL org.opencontainers.image.source https://github.com/ratschlab/wftools-alignment-rna
LABEL org.opencontainers.image.authors Andre Kahles ([email protected])
LABEL org.opencontainers.image.title Raetschlab RNA-Seq alignment image - STAR

ENV PATH="/opt/local:${PATH}"

RUN apt-get update
RUN apt-get -y install wget zip make gcc g++ libcurl4-openssl-dev libgomp1 bzip2 curl libncurses5-dev libbz2-dev liblzma-dev autotools-dev automake libz-dev pkg-config libtool m4

# install STAR
RUN mkdir -p /opt/local/src/STAR && \
cd /opt/local/src/STAR && \
wget https://github.com/alexdobin/STAR/archive/2.7.2d.tar.gz && \
tar -xzvf 2.7.2d.tar.gz && \
ln -s /opt/local/src/STAR/STAR-2.7.2d/bin/Linux_x86_64/STAR /usr/local/bin/STAR && \
ln -s /opt/local/src/STAR/STAR-2.7.2d/bin/Linux_x86_64/STARlong /usr/local/bin/STARlong

# install samtools 1.10
RUN cd /tmp \
&& curl -sSL -o samtools-1.10.tar.bz2 --retry 10 https://github.com/samtools/samtools/releases/download/1.10/samtools-1.10.tar.bz2 \
&& bunzip2 -c samtools-1.10.tar.bz2 |tar xf - \
&& cd samtools-1.10 \
&& ./configure --prefix=/usr/local \
&& make \
&& make install

# Install OpenJDK-8
RUN apt-get update && \
apt-get install -y openjdk-8-jdk && \
apt-get install -y ant && \
apt-get clean;

RUN mkdir -p /tools

# Picard installation
RUN curl -sSL -o /tools/picard.jar --retry 10 https://github.com/broadinstitute/picard/releases/download/2.21.8/picard.jar

### get run script
COPY main.py /tools

### set up environment
ENV PATH="$PATH:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:"
ENV LC_ALL=C

RUN groupadd -g 1000 ubuntu &&\
useradd -l -u 1000 -g ubuntu ubuntu &&\
install -d -m 0755 -o ubuntu -g ubuntu /home/ubuntu

USER ubuntu

ENTRYPOINT ["/usr/bin/env"]

CMD ["/bin/bash"]
108 changes: 108 additions & 0 deletions genome-alignment-star/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
#!/usr/bin/env nextflow

/*
Copyright (c) 2021, ratschlab
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Authors:
Andre Kahles
*/

/********************************************************************/
/* this block is auto-generated based on info from pkg.json where */
/* changes can be made if needed, do NOT modify this block manually */
nextflow.enable.dsl = 2
version = '0.1.0' // package version

container = [
'ghcr.io': 'ghcr.io/ratschlab/wftools-alignment-rna.genome-alignment-star'
]
default_container_registry = 'ghcr.io'
/********************************************************************/


// universal params go here
params.container_registry = ""
params.container_version = ""
params.container = ""

params.cpus = 1
params.mem = 1 // GB
params.publish_dir = "" // set to empty string will disable publishDir


// tool specific parmas go here, add / change as needed
params.index = "NO_FILE_1" //input/test_genome.index_STAR"
params.gtf = "NO_FILE_2" //input/test_annotation.gtf"
params.input_files = ["NO_FILE_3"]//input/TEST-PRO.donor1.donor1_sample1_id.sample_01.b22541e45ff72d9a042e877a0531af0b.lane.bam"
params.input_format = "ubam"
params.sample = ""
params.sjdboverhang = 100
params.pair_status = "paired"

process genomeAlignmentSTAR {
container "${params.container ?: container[params.container_registry ?: default_container_registry]}:${params.container_version ?: version}"
publishDir "${params.publish_dir}/${task.process.replaceAll(':', '_')}", mode: "copy", enabled: params.publish_dir

cpus params.cpus
memory "${params.mem} GB"

input:
file index
file gtf
path input_files
val input_format
val pair_status
val sample
val sjdboverhang

output: // output, make update as needed
path("${sample}_Aligned.out.bam"), emit: bam
path("${sample}_SJ.out.tab"), emit: junctions
path("${sample}_Log*")

script:
"""
python /tools/main.py \\
--sample ${sample} \\
--index ${index} \\
--annotation ${gtf} \\
--threads ${params.cpus} \\
--sjdbOverhang ${sjdboverhang} \\
--pair-status ${pair_status} \\
--input-files ${input_files} \\
--input-format ${input_format} \\
--mem ${params.mem * 1000} > ${sample}_align.log 2>&1
"""
}

// this provides an entry point for this main script, so it can be run directly without clone the repo
// using this command: nextflow run <git_acc>/<repo>/<pkg_name>/<main_script>.nf -r <pkg_name>.v<pkg_version> --params-file xxx
workflow {
genomeAlignmentSTAR(
file(params.index),
file(params.gtf),
params.input_files.collect({it -> file(it)}),
params.input_format,
params.pair_status,
params.sample,
params.sjdboverhang
)
}
165 changes: 165 additions & 0 deletions genome-alignment-star/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import sys
import argparse
import subprocess
import glob

def replace_bam_header(bam, header, out, mem=None):

jvm_Xmx = "-Xmx%iM" % int(mem) if mem else ""
try:
cmd = [
'java', jvm_Xmx, '-Dpicard.useLegacyParser=false', '-jar', '/tools/picard.jar', 'ReplaceSamHeader', '-I', bam,
'--HEADER', header, '--OUTPUT', out
]

subprocess.run(cmd, check=True)
except Exception as e:
sys.exit("Error: %s. ReplaceSamHeader failed: %s\n" % (e, bam))


def generate_fastq_from_ubam(ubam, outdir, mem=None):

jvm_Xmx = "-Xmx%iM" % int(mem) if mem else ""
try:
cmd = [
'java', jvm_Xmx, '-Dpicard.useLegacyParser=false', '-jar', '/tools/picard.jar', 'SamToFastq', '-I', ubam,
'--OUTPUT_PER_RG', 'true', '--RG_TAG', 'ID', '--COMPRESS_OUTPUTS_PER_RG', 'true', '--OUTPUT_DIR', outdir
]

subprocess.run(cmd, check=True)
except Exception as e:
sys.exit("Error: %s. SamToFastq failed: %s\n" % (e, ubam))

def main():
"""
Python wrapper for calling STAR aligner on various input configurations,
assuring that read group information is provided appropriately.
"""

parser = argparse.ArgumentParser(description='Tool: STAR aligner')
parser.add_argument('--sample', dest='sample', type=str,
help='Input sample name / ID.', required=True)
parser.add_argument('--index', dest='index', type=str,
help='Path to STAR genome index.', required=True)
parser.add_argument('--annotation', dest='annotation', type=str,
help='Gene annotation (preferably in GTF format).', required=True)
parser.add_argument('--sjdbOverhang', dest='sjdboverhang', type=int,
help='Overhang for splice junction DB. [100]', default=100, required=False)
parser.add_argument('--threads', dest='threads', type=int,
help='Number of threads. [1]', default=1, required=False)
parser.add_argument('--pair-status', dest='pair_status', type=str,
help='Paired-end status of input samples. Choices are: single, paired.', required=True)
parser.add_argument('--input-files', dest='input_files', type=str, nargs='+',
help='Input read files in fastq or bam format. For paired end fastq, first mate comes first.', default=[])
parser.add_argument('--input-format', dest='input_format', type=str,
help='Format of read in put: fastq or bam', required=True)
parser.add_argument('--mem', dest='mem', help="Maximal allocated memory in MB", type=float, default=None)

args = parser.parse_args()

if not os.path.exists(args.index):
sys.exit('Error: specified index path %s does not exist or is not accessible!' % args.index)

if not os.path.exists(args.annotation):
sys.exit('Error: specified annotation file %s does not exist or is not accessible!' % args.annotation)

### handle ubam input
outdir = '.'
if args.input_format == 'bam':
if len(args.input_files) != 1:
sys.exit('Error: number of input files %s needs to be exactly 1!.' % str(args.input_files))
generate_fastq_from_ubam(args.input_files[0], outdir, mem=args.mem)
fqr1 = glob.glob(os.path.join(outdir, '*_1.fastq.gz'))
print(fqr1)
fqr2 = []
if args.pair_status == 'paired':
for fq in fqr1:
fqr2.append(fq[:-10] + '2.fastq.gz')
assert os.path.exists(fqr2[-1])
rgs = [os.path.basename(_)[:-11] for _ in fqr1]
### handle fastq input
elif args.input_format == 'fastq':
fqr1 = [args.input_files[0]]
if args.pair_status == 'paired':
fqr2 = [args.input_files[1]]
if len(args.input_files) != 2:
sys.exit('Error: Paired-end status was given as %s. But files provided were: %s' % (args.pair_status, str(args.input_files)))
elif args.pair_status == 'single' and len(args.input_files != 1):
sys.exit('Error: Paired-end status was given as %s. But files provided were: %s' % (args.pair_status, str(args.input_files)))
### get read group names
rgs = []
for fq in fqr1:
rg = fq
if rg.lower().endswith('.gz'):
rg = rg[:-3]
if rg.lower().endswith('.bz2'):
rg = rg[:-4]
if rg.lower().endswith('.fastq'):
rg = rg[:-6]
rgs.append(rg[:-2])
### this should not happen
else:
sys.exit('Error: The input type needs to be either bam or fastq. Currently given: %s' % args.input_format)

### figure out correct read command
read_command = 'cat'
for fq in fqr1:
if fq.lower().endswith('.gz'):
read_command = 'zcat'
break
if fq.lower().endswith('.bz2'):
read_command = 'bzcat'

### assemble STAR command
cmd = ['STAR',
'--genomeDir', args.index,
'--sjdbGTFfile', args.annotation,
'--runThreadN', str(args.threads),
'--sjdbOverhang', str(args.sjdboverhang),
'--outFileNamePrefix', args.sample + '_',
'--readFilesIn', ','.join(fqr1), ','.join(fqr2),
'--outSAMattrRGline', ' , '.join(['ID:%s\tSM:%s' % (_, args.sample) for _ in rgs]),
'--readFilesCommand', read_command,
'--twopassMode Basic',
'--outFilterMultimapScoreRange', '1',
'--outFilterMultimapNmax', '20',
'--outFilterMismatchNmax', '10',
'--alignIntronMax', '500000',
'--alignMatesGapMax', '1000000',
'--sjdbScore', '2',
'--alignSJDBoverhangMin', '1',
'--genomeLoad NoSharedMemory',
'--outFilterMatchNminOverLread', '0.33',
'--outFilterScoreMinOverLread', '0.33',
'--outSAMstrandField intronMotif',
'--outSAMmode Full',
'--limitBAMsortRAM', '7000000000',
'--outSAMattributes NH HI NM MD AS XS',
'--outSAMunmapped Within',
'--limitSjdbInsertNsj', '2000000',
'--outSAMtype BAM Unsorted',
'--outSAMheaderHD', '@HD VN:1.4',
'--outSAMmultNmax', '1',
]
subprocess.run(' '.join(cmd), shell=True, check=True)

### replace original read group line from ubam
if args.input_format == 'bam':
bam = args.sample + '_Aligned.out.bam'
### get current header and drop RG info
subprocess.run('samtools view -H %s --no-PG | grep -v "@RG" > new_header.sam' % bam, shell=True, check=True)
### append ald read group info
for fq in fqr1:
subprocess.run('samtools view -H %s --no-PG | grep -e @RG | grep %s >> new_header.sam' % (args.input_files[0], os.path.basename(fq)[:-11]), shell=True, check=True) # capture_output=True, shell=True, check=True).stdout.decode('utf-8').strip())
### replace header
bam_rg = args.sample + '_Aligned.out.rg.bam'
replace_bam_header(bam, 'new_header.sam', bam_rg, mem=args.mem)
### clean up
subprocess.run('mv %s %s && rm new_header.sam' % (bam_rg, bam), shell=True, check=True)

if __name__ == "__main__":
main()
4 changes: 4 additions & 0 deletions genome-alignment-star/nextflow.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
docker {
enabled = true
runOptions = '-u \$(id -u):\$(id -g)'
}
38 changes: 38 additions & 0 deletions genome-alignment-star/pkg.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"name": "genome-alignment-star",
"version": "0.1.0",
"description": "Genome alignment with STAR",
"main": "main.nf",
"deprecated": false,
"keywords": [
"bioinformatics",
"rna-seq",
"alignment",
"star"
],
"repository": {
"type": "git",
"url": "https://github.com/ratschlab/wftools-alignment-rna.git"
},
"container": {
"registries": [
{
"registry": "ghcr.io",
"type": "docker",
"org": "ratschlab",
"default": true
}
]
},
"dependencies": [],
"devDependencies": [],
"contributors": [
{
"name": "Andre Kahles",
"email": "[email protected]"
}
],
"license": "MIT",
"bugReport": "https://github.com/ratschlab/wftools-alignment-rna/issues",
"homepage": "https://github.com/ratschlab/wftools-alignment-rna#readme"
}
Loading

0 comments on commit 836d363

Please sign in to comment.