Merge pull request #1 from ratschlab/[email protected]

[release]
ratschlab · Jul 29, 2021 · 836d363 · 836d363
2 parents 60e9444 + e94df44
commit 836d363
Show file tree

Hide file tree

Showing 70 changed files with 21,479 additions and 0 deletions.
diff --git a/genome-alignment-star/.dockerignore b/genome-alignment-star/.dockerignore
@@ -0,0 +1,5 @@
+.gitignore
+.nextflow*
+tests
+work
+outdir
diff --git a/genome-alignment-star/Dockerfile b/genome-alignment-star/Dockerfile
@@ -0,0 +1,55 @@
+FROM ratschlab/wf-base-miniconda4.8.3:v1.0.1
+
+LABEL org.opencontainers.image.source https://github.com/ratschlab/wftools-alignment-rna
+LABEL org.opencontainers.image.authors Andre Kahles ([email protected])
+LABEL org.opencontainers.image.title Raetschlab RNA-Seq alignment image - STAR
+
+ENV PATH="/opt/local:${PATH}"
+
+RUN apt-get update
+RUN apt-get -y install wget zip make gcc g++ libcurl4-openssl-dev libgomp1 bzip2 curl libncurses5-dev libbz2-dev liblzma-dev autotools-dev automake libz-dev pkg-config libtool m4 
+
+# install STAR
+RUN mkdir -p /opt/local/src/STAR && \
+    cd /opt/local/src/STAR && \
+    wget https://github.com/alexdobin/STAR/archive/2.7.2d.tar.gz && \
+    tar -xzvf 2.7.2d.tar.gz && \
+    ln -s /opt/local/src/STAR/STAR-2.7.2d/bin/Linux_x86_64/STAR /usr/local/bin/STAR && \
+    ln -s /opt/local/src/STAR/STAR-2.7.2d/bin/Linux_x86_64/STARlong /usr/local/bin/STARlong
+
+# install samtools 1.10
+RUN cd /tmp \
+    && curl -sSL -o samtools-1.10.tar.bz2 --retry 10 https://github.com/samtools/samtools/releases/download/1.10/samtools-1.10.tar.bz2 \
+    && bunzip2 -c samtools-1.10.tar.bz2 |tar xf - \
+    && cd samtools-1.10 \
+    && ./configure --prefix=/usr/local \
+    && make \
+    && make install
+
+# Install OpenJDK-8
+RUN apt-get update && \
+    apt-get install -y openjdk-8-jdk && \
+    apt-get install -y ant && \
+    apt-get clean;
+
+RUN mkdir -p /tools
+
+# Picard installation
+RUN curl -sSL -o /tools/picard.jar --retry 10 https://github.com/broadinstitute/picard/releases/download/2.21.8/picard.jar
+
+### get run script
+COPY main.py /tools
+
+### set up environment
+ENV PATH="$PATH:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:"
+ENV LC_ALL=C
+
+RUN groupadd -g 1000 ubuntu &&\
+    useradd -l -u 1000 -g ubuntu ubuntu &&\
+    install -d -m 0755 -o ubuntu -g ubuntu /home/ubuntu
+
+USER ubuntu
+
+ENTRYPOINT ["/usr/bin/env"]
+
+CMD ["/bin/bash"]
diff --git a/genome-alignment-star/main.nf b/genome-alignment-star/main.nf
@@ -0,0 +1,108 @@
+#!/usr/bin/env nextflow
+
+/*
+  Copyright (c) 2021, ratschlab
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in all
+  copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+  SOFTWARE.
+
+  Authors:
+    Andre Kahles
+*/
+
+/********************************************************************/
+/* this block is auto-generated based on info from pkg.json where   */
+/* changes can be made if needed, do NOT modify this block manually */
+nextflow.enable.dsl = 2
+version = '0.1.0'  // package version
+
+container = [
+    'ghcr.io': 'ghcr.io/ratschlab/wftools-alignment-rna.genome-alignment-star'
+]
+default_container_registry = 'ghcr.io'
+/********************************************************************/
+
+
+// universal params go here
+params.container_registry = ""
+params.container_version = ""
+params.container = ""
+
+params.cpus = 1
+params.mem = 1  // GB
+params.publish_dir = ""  // set to empty string will disable publishDir
+
+
+// tool specific parmas go here, add / change as needed
+params.index = "NO_FILE_1" //input/test_genome.index_STAR"
+params.gtf = "NO_FILE_2" //input/test_annotation.gtf"
+params.input_files = ["NO_FILE_3"]//input/TEST-PRO.donor1.donor1_sample1_id.sample_01.b22541e45ff72d9a042e877a0531af0b.lane.bam"
+params.input_format = "ubam"
+params.sample = ""
+params.sjdboverhang = 100
+params.pair_status = "paired"
+
+process genomeAlignmentSTAR {
+  container "${params.container ?: container[params.container_registry ?: default_container_registry]}:${params.container_version ?: version}"
+  publishDir "${params.publish_dir}/${task.process.replaceAll(':', '_')}", mode: "copy", enabled: params.publish_dir
+
+  cpus params.cpus
+  memory "${params.mem} GB"
+
+  input:  
+    file index
+    file gtf
+    path input_files
+    val input_format
+    val pair_status
+    val sample
+    val sjdboverhang
+
+  output:  // output, make update as needed
+    path("${sample}_Aligned.out.bam"), emit: bam
+    path("${sample}_SJ.out.tab"), emit: junctions
+    path("${sample}_Log*")
+
+  script:
+    """
+    python /tools/main.py \\
+           --sample ${sample} \\
+           --index ${index} \\
+           --annotation ${gtf} \\
+           --threads ${params.cpus} \\
+           --sjdbOverhang ${sjdboverhang} \\
+           --pair-status ${pair_status} \\
+           --input-files ${input_files} \\
+           --input-format ${input_format} \\
+           --mem ${params.mem * 1000} > ${sample}_align.log 2>&1
+    """
+}
+
+// this provides an entry point for this main script, so it can be run directly without clone the repo
+// using this command: nextflow run <git_acc>/<repo>/<pkg_name>/<main_script>.nf -r <pkg_name>.v<pkg_version> --params-file xxx
+workflow {
+  genomeAlignmentSTAR(
+    file(params.index),
+    file(params.gtf),
+    params.input_files.collect({it -> file(it)}),
+    params.input_format,
+    params.pair_status,
+    params.sample,
+    params.sjdboverhang
+  )
+}
diff --git a/genome-alignment-star/main.py b/genome-alignment-star/main.py
@@ -0,0 +1,165 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import os
+import sys
+import argparse
+import subprocess
+import glob
+
+def replace_bam_header(bam, header, out, mem=None):
+
+    jvm_Xmx = "-Xmx%iM" % int(mem) if mem else ""
+    try:
+        cmd = [
+                'java', jvm_Xmx, '-Dpicard.useLegacyParser=false', '-jar', '/tools/picard.jar', 'ReplaceSamHeader', '-I', bam,
+                '--HEADER', header, '--OUTPUT', out
+              ]
+
+        subprocess.run(cmd, check=True)
+    except Exception as e:
+        sys.exit("Error: %s. ReplaceSamHeader failed: %s\n" % (e, bam))
+
+
+def generate_fastq_from_ubam(ubam, outdir, mem=None):
+
+    jvm_Xmx = "-Xmx%iM" % int(mem) if mem else ""
+    try:
+        cmd = [
+                'java', jvm_Xmx, '-Dpicard.useLegacyParser=false', '-jar', '/tools/picard.jar', 'SamToFastq', '-I', ubam,
+                '--OUTPUT_PER_RG', 'true', '--RG_TAG', 'ID', '--COMPRESS_OUTPUTS_PER_RG', 'true', '--OUTPUT_DIR', outdir
+              ]
+
+        subprocess.run(cmd, check=True)
+    except Exception as e:
+        sys.exit("Error: %s. SamToFastq failed: %s\n" % (e, ubam))
+
+def main():
+    """
+    Python wrapper for calling STAR aligner on various input configurations,
+    assuring that read group information is provided appropriately.
+    """
+
+    parser = argparse.ArgumentParser(description='Tool: STAR aligner')
+    parser.add_argument('--sample', dest='sample', type=str,
+                        help='Input sample name / ID.', required=True)
+    parser.add_argument('--index', dest='index', type=str,
+                        help='Path to STAR genome index.', required=True)
+    parser.add_argument('--annotation', dest='annotation', type=str,
+                        help='Gene annotation (preferably in GTF format).', required=True)
+    parser.add_argument('--sjdbOverhang', dest='sjdboverhang', type=int,
+                        help='Overhang for splice junction DB. [100]', default=100, required=False)
+    parser.add_argument('--threads', dest='threads', type=int,
+                        help='Number of threads. [1]', default=1, required=False)
+    parser.add_argument('--pair-status', dest='pair_status', type=str,
+                        help='Paired-end status of input samples. Choices are: single, paired.', required=True)
+    parser.add_argument('--input-files', dest='input_files', type=str, nargs='+',
+                        help='Input read files in fastq or bam format. For paired end fastq, first mate comes first.', default=[])
+    parser.add_argument('--input-format', dest='input_format', type=str,
+                        help='Format of read in put: fastq or bam', required=True)
+    parser.add_argument('--mem', dest='mem', help="Maximal allocated memory in MB", type=float, default=None)
+
+    args = parser.parse_args()
+
+    if not os.path.exists(args.index):
+        sys.exit('Error: specified index path %s does not exist or is not accessible!' % args.index)
+
+    if not os.path.exists(args.annotation):
+        sys.exit('Error: specified annotation file %s does not exist or is not accessible!' % args.annotation)
+
+    ### handle ubam input
+    outdir = '.'
+    if args.input_format == 'bam':
+        if len(args.input_files) != 1:
+            sys.exit('Error: number of input files %s needs to be exactly 1!.' % str(args.input_files))
+        generate_fastq_from_ubam(args.input_files[0], outdir, mem=args.mem)
+        fqr1 = glob.glob(os.path.join(outdir, '*_1.fastq.gz'))
+        print(fqr1)
+        fqr2 = []
+        if args.pair_status == 'paired':
+            for fq in fqr1:
+                fqr2.append(fq[:-10] + '2.fastq.gz')
+                assert os.path.exists(fqr2[-1])
+        rgs = [os.path.basename(_)[:-11] for _ in fqr1]
+    ### handle fastq input
+    elif args.input_format == 'fastq':
+        fqr1 = [args.input_files[0]]
+        if args.pair_status == 'paired': 
+            fqr2 = [args.input_files[1]]
+            if len(args.input_files) != 2:
+                sys.exit('Error: Paired-end status was given as %s. But files provided were: %s' % (args.pair_status, str(args.input_files)))
+        elif args.pair_status == 'single' and len(args.input_files != 1):
+            sys.exit('Error: Paired-end status was given as %s. But files provided were: %s' % (args.pair_status, str(args.input_files)))
+        ### get read group names
+        rgs = []
+        for fq in fqr1:
+            rg = fq
+            if rg.lower().endswith('.gz'):
+                rg = rg[:-3]
+            if rg.lower().endswith('.bz2'):
+                rg = rg[:-4]
+            if rg.lower().endswith('.fastq'):
+                rg = rg[:-6]
+        rgs.append(rg[:-2])
+    ### this should not happen
+    else:
+        sys.exit('Error: The input type needs to be either bam or fastq. Currently given: %s' % args.input_format)
+
+    ### figure out correct read command
+    read_command = 'cat'
+    for fq in fqr1:
+        if fq.lower().endswith('.gz'):
+            read_command = 'zcat'
+            break
+        if fq.lower().endswith('.bz2'):
+            read_command = 'bzcat'
+
+    ### assemble STAR command
+    cmd = ['STAR',
+           '--genomeDir',  args.index,
+           '--sjdbGTFfile', args.annotation,
+           '--runThreadN', str(args.threads),
+           '--sjdbOverhang', str(args.sjdboverhang),
+           '--outFileNamePrefix', args.sample + '_',
+           '--readFilesIn', ','.join(fqr1), ','.join(fqr2),
+           '--outSAMattrRGline', ' , '.join(['ID:%s\tSM:%s' % (_, args.sample) for _ in rgs]),
+           '--readFilesCommand', read_command,
+           '--twopassMode Basic',
+           '--outFilterMultimapScoreRange', '1',
+           '--outFilterMultimapNmax', '20', 
+           '--outFilterMismatchNmax', '10',
+           '--alignIntronMax', '500000', 
+           '--alignMatesGapMax', '1000000',
+           '--sjdbScore', '2',
+           '--alignSJDBoverhangMin', '1',
+           '--genomeLoad NoSharedMemory',
+           '--outFilterMatchNminOverLread', '0.33',
+           '--outFilterScoreMinOverLread', '0.33',
+           '--outSAMstrandField intronMotif',
+           '--outSAMmode Full',
+           '--limitBAMsortRAM', '7000000000',
+           '--outSAMattributes NH HI NM MD AS XS',
+           '--outSAMunmapped Within',
+           '--limitSjdbInsertNsj', '2000000',
+           '--outSAMtype BAM Unsorted',
+           '--outSAMheaderHD', '@HD VN:1.4',
+           '--outSAMmultNmax', '1',
+          ]
+    subprocess.run(' '.join(cmd), shell=True, check=True)
+
+    ### replace original read group line from ubam
+    if args.input_format == 'bam':
+        bam = args.sample + '_Aligned.out.bam'
+        ### get current header and drop RG info
+        subprocess.run('samtools view -H %s --no-PG | grep -v "@RG" > new_header.sam' % bam, shell=True, check=True)
+        ### append ald read group info 
+        for fq in fqr1:
+            subprocess.run('samtools view -H %s --no-PG | grep -e @RG | grep %s >> new_header.sam' % (args.input_files[0], os.path.basename(fq)[:-11]), shell=True, check=True) # capture_output=True, shell=True, check=True).stdout.decode('utf-8').strip())
+        ### replace header
+        bam_rg = args.sample + '_Aligned.out.rg.bam'
+        replace_bam_header(bam, 'new_header.sam', bam_rg, mem=args.mem) 
+        ### clean up
+        subprocess.run('mv %s %s && rm new_header.sam' % (bam_rg, bam), shell=True, check=True)
+
+if __name__ == "__main__":
+    main()
diff --git a/genome-alignment-star/nextflow.config b/genome-alignment-star/nextflow.config
@@ -0,0 +1,4 @@
+docker {
+    enabled = true
+    runOptions = '-u \$(id -u):\$(id -g)'
+}
diff --git a/genome-alignment-star/pkg.json b/genome-alignment-star/pkg.json
@@ -0,0 +1,38 @@
+{
+    "name": "genome-alignment-star",
+    "version": "0.1.0",
+    "description": "Genome alignment with STAR",
+    "main": "main.nf",
+    "deprecated": false,
+    "keywords": [
+        "bioinformatics",
+        "rna-seq",
+        "alignment",
+        "star"
+    ],
+    "repository": {
+        "type": "git",
+        "url": "https://github.com/ratschlab/wftools-alignment-rna.git"
+    },
+    "container": {
+        "registries": [
+            {
+                "registry": "ghcr.io",
+                "type": "docker",
+                "org": "ratschlab",
+                "default": true
+            }
+        ]
+    },
+    "dependencies": [],
+    "devDependencies": [],
+    "contributors": [
+        {
+            "name": "Andre Kahles",
+            "email": "[email protected]"
+        }
+    ],
+    "license": "MIT",
+    "bugReport": "https://github.com/ratschlab/wftools-alignment-rna/issues",
+    "homepage": "https://github.com/ratschlab/wftools-alignment-rna#readme"
+}