nf-core · drpatelh · Nov 15, 2023 · Nov 7, 2023 · Nov 7, 2023 · Nov 7, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,6 +14,7 @@ Special thanks to the following for their contributions to the release:
 - [Júlia Mir Pedrol](https://github.com/mirpedrol)
 - [Matthias Zepper](https://github.com/MatthiasZepper)
 - [Maxime Garcia](https://github.com/maxulysse)
+- [Jonathan Manning](https://github.com/pinin4fjords)
 
 Thank you to everyone else that has contributed by reporting bugs, enhancements or in any other way, shape or form.
 
@@ -29,6 +30,10 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements
 - [PR #1083](https://github.com/nf-core/rnaseq/pull/1083) - Move local modules and subworkflows to subfolders
 - [PR #1088](https://github.com/nf-core/rnaseq/pull/1088) - Updates contributing and code of conduct documents with nf-core template 2.10
 - [PR #1091](https://github.com/nf-core/rnaseq/pull/1091) - Reorganise parameters in schema for better usability
+- [PR #1107](https://github.com/nf-core/rnaseq/pull/1107) - Expand GTF filtering to remove rows with empty transcript ID when required, fix STAR GTF usage
+- [#1082](https://github.com/nf-core/rnaseq/issues/1082) - More informative error message for filter_gtf_for_genes_in_genome.py
+- [#1102](https://github.com/nf-core/rnaseq/issues/1102) - gene entries with empty transcript_id fields
+- [#1074](https://github.com/nf-core/rnaseq/issues/1074) - Enable quantification using StringTie AND a custom Ensembl genome
 - [PR #1106](https://github.com/nf-core/rnaseq/pull/1106) - Kallisto quantification
 - [PR #1106](https://github.com/nf-core/rnaseq/pull/1106) - MultiQC [version bump](https://github.com/nf-core/rnaseq/pull/1106/commits/aebad067a10a45510a2b421da852cb436ae65fd8)
 - [#1050](https://github.com/nf-core/rnaseq/issues/1050) - Provide custom prefix/suffix for summary files to avoid overwriting

diff --git a/bin/filter_gtf.py b/bin/filter_gtf.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+import logging
+import argparse
+import re
+import statistics
+from typing import Set
+
+# Set the message format on the root logging configuration, then create this
+# script's own logger. It is capped at INFO, so the logger.debug() calls below
+# produce no output unless this level is lowered.
+logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s")
+logger = logging.getLogger("fasta_gtf_filter")
+logger.setLevel(logging.INFO)
+
+
+def extract_fasta_seq_names(fasta_name: str) -> Set[str]:
+    """Extract the sequence names from a FASTA file.
+
+    The name of a record is the first whitespace-delimited token after the
+    ">" on its header line; any description that follows is ignored. Header
+    lines that carry no name at all (a bare ">") are skipped instead of
+    raising an IndexError.
+
+    Args:
+        fasta_name: Path to the FASTA file.
+
+    Returns:
+        The set of sequence names found in the file.
+    """
+    seq_names = set()
+    with open(fasta_name) as fasta:
+        for line in fasta:
+            if not line.startswith(">"):
+                continue
+            header_fields = line[1:].split(None, 1)
+            if header_fields:  # a bare ">" has no name to record
+                seq_names.add(header_fields[0])
+    return seq_names
+
+
+def tab_delimited(file: str, max_lines: int = 1000) -> float:
+    """Return the median number of tabs per record line near the start of a file.
+
+    Only record lines are sampled: lines starting with "#" (header/comment
+    lines) and blank lines are ignored, because they contain no tabs and would
+    otherwise pull the median towards 0 for files that begin with a header.
+    Whole lines are read, so a partially read trailing line never enters the
+    sample.
+
+    Args:
+        file: Path to the file to inspect.
+        max_lines: Maximum number of record lines to sample.
+
+    Returns:
+        The median tab count of the sampled record lines, or 0.0 when the file
+        contains no record lines.
+    """
+    tab_counts = []
+    with open(file, "r") as f:
+        for line in f:
+            if line.startswith("#") or not line.strip():
+                continue
+            tab_counts.append(line.count("\t"))
+            if len(tab_counts) >= max_lines:
+                break
+    # With nothing to sample, report 0 tabs instead of letting
+    # statistics.median() raise StatisticsError on an empty sequence.
+    return statistics.median(tab_counts) if tab_counts else 0.0
+
+
+def filter_gtf(fasta: str, gtf_in: str, filtered_gtf_out: str, skip_transcript_id_check: bool) -> None:
+    """Filter a GTF file down to the records usable with the given genome.
+
+    A line is kept when its sequence name (first tab-separated field) is one
+    of the sequence names in the FASTA file and, unless
+    ``skip_transcript_id_check`` is set, it carries a non-empty
+    ``transcript_id "..."`` attribute.
+
+    Args:
+        fasta: Path to the genome FASTA file.
+        gtf_in: Path to the GTF file to filter.
+        filtered_gtf_out: Path the filtered GTF is written to.
+        skip_transcript_id_check: Keep lines regardless of their transcript_id.
+
+    Raises:
+        ValueError: If the GTF does not have a median of 8 tabs per line
+            (9 columns), or if no line survives the filters.
+        IOError: If reading the GTF or writing the output fails. The error is
+            logged and re-raised so the script exits with a failure status.
+    """
+    # 8 tabs per line means 9 tab-separated columns.
+    if tab_delimited(gtf_in) != 8:
+        raise ValueError("Invalid GTF file: Expected 9 tab-separated columns.")
+
+    seq_names_in_genome = extract_fasta_seq_names(fasta)
+    logger.info(f"Extracted chromosome sequence names from {fasta}")
+    logger.debug("All sequence IDs from FASTA: " + ", ".join(sorted(seq_names_in_genome)))
+
+    # Compiled once rather than looked up again for every GTF line.
+    transcript_id_pattern = re.compile(r'transcript_id "([^"]+)"')
+    seq_names_in_gtf = set()
+    line_count = 0
+    try:
+        with open(gtf_in) as gtf, open(filtered_gtf_out, "w") as out:
+            for line in gtf:
+                seq_name = line.split("\t")[0]
+                seq_names_in_gtf.add(seq_name)  # Collected for the debug log below
+
+                if seq_name not in seq_names_in_genome:
+                    continue
+                if skip_transcript_id_check or transcript_id_pattern.search(line):
+                    out.write(line)
+                    line_count += 1
+    except IOError as e:
+        # Log for context, then propagate: returning here would let the script
+        # exit successfully with a missing or partial output file.
+        logger.error(f"File operation failed: {e}")
+        raise
+
+    if line_count == 0:
+        raise ValueError("All GTF lines removed by filters")
+
+    logger.debug("All sequence IDs from GTF: " + ", ".join(sorted(seq_names_in_gtf)))
+    logger.info(f"Extracted {line_count} matching sequences from {gtf_in} into {filtered_gtf_out}")
+
+
+if __name__ == "__main__":
+    # Command-line entry point: gather the input paths and options, then run the filter.
+    cli = argparse.ArgumentParser(description="Filters a GTF file based on sequence names in a FASTA file.")
+    cli.add_argument("--gtf", type=str, required=True, help="GTF file")
+    cli.add_argument("--fasta", type=str, required=True, help="Genome fasta file")
+    cli.add_argument("--prefix", dest="prefix", default="genes", type=str, help="Prefix for output GTF files")
+    cli.add_argument(
+        "--skip_transcript_id_check", action="store_true", help="Skip checking for transcript IDs in the GTF file"
+    )
+    options = cli.parse_args()
+
+    # The output file name is derived from the requested prefix.
+    filtered_gtf_path = options.prefix + ".filtered.gtf"
+    filter_gtf(options.fasta, options.gtf, filtered_gtf_path, options.skip_transcript_id_check)
diff --git a/bin/filter_gtf_for_genes_in_genome.py b/bin/filter_gtf_for_genes_in_genome.py
diff --git a/conf/modules.config b/conf/modules.config
@@ -117,7 +117,8 @@ process {
         ]
     }
 
-    withName: 'GTF_GENE_FILTER' {
+    withName: 'GTF_FILTER' {
+        // Pass the flag only when the user asked to skip the transcript_id check.
+        // The Elvis form (?:) returned the flag when the parameter was false and the
+        // bare boolean when it was true, i.e. the opposite of what is intended.
+        // NOTE(review): the GTF_FILTER script must interpolate task.ext.args for this to take effect - confirm.
+        ext.args   = { params.skip_gtf_transcript_filter ? '--skip_transcript_id_check' : '' }
         publishDir = [
             path: { "${params.outdir}/genome" },
             mode: params.publish_dir_mode,
@@ -164,13 +165,15 @@ process {
         ext.args   = '--record-count 1000000 --seed 1'
         ext.prefix = { "${meta.id}.subsampled" }
         publishDir = [
+            mode: params.publish_dir_mode,
             enabled: false
         ]
     }
 
     withName: '.*:FASTQ_SUBSAMPLE_FQ_SALMON:SALMON_QUANT' {
         ext.args   = '--skipQuant'
         publishDir = [
+            mode: params.publish_dir_mode,
             enabled: false
         ]
     }

diff --git a/docs/usage.md b/docs/usage.md
@@ -198,6 +198,10 @@ Notes:
 
 - As of v3.7 of the pipeline, if you are using a genome downloaded from AWS iGenomes and using `--aligner star_salmon` (default) the version of STAR to use for the alignment will be auto-detected (see [#808](https://github.com/nf-core/rnaseq/issues/808)).
 
+### GTF filtering
+
+By default, the input GTF file will be filtered to ensure that sequence names correspond to those in the genome, and to remove rows with empty transcript identifiers. Filtering can be bypassed completely where you are confident it is not necessary, using the `--skip_gtf_filter` flag. The transcript identifier filter can be disabled specifically using the `--skip_gtf_transcript_filter` flag.
+
 ## Running the pipeline
 
 The typical command for running the pipeline is as follows:

diff --git a/modules.json b/modules.json
@@ -71,8 +71,8 @@
                         "installed_by": ["modules"]
                     },
                     "kallisto/quant": {
-                        "branch": "kallisto_updates",
-                        "git_sha": "bc4719dcd079fcdb650ddeac05739c2f7dd58c84",
+                        "branch": "master",
+                        "git_sha": "bdc2a97ced7adc423acfa390742db83cab98c1ad",
                         "installed_by": ["modules"]
                     },
                     "picard/markduplicates": {

diff --git a/modules/local/gtf_gene_filter/main.nf → modules/local/gtf_filter/main.nf b/modules/local/gtf_gene_filter/main.nf → modules/local/gtf_filter/main.nf
@@ -1,4 +1,4 @@
-process GTF_GENE_FILTER {
+process GTF_FILTER {
     tag "$fasta"
 
     conda "conda-forge::python=3.9.5"
@@ -11,18 +11,18 @@ process GTF_GENE_FILTER {
     path gtf
 
     output:
-    path "*.gtf"       , emit: gtf
-    path "versions.yml", emit: versions
+    path "*.filtered.gtf"           , emit: genome_gtf
+    path "versions.yml"              , emit: versions
 
     when:
     task.ext.when == null || task.ext.when
 
-    script: // filter_gtf_for_genes_in_genome.py is bundled with the pipeline, in nf-core/rnaseq/bin/
+    script: // filter_gtf.py is bundled with the pipeline, in nf-core/rnaseq/bin/
     """
-    filter_gtf_for_genes_in_genome.py \\
+    filter_gtf.py \\
         --gtf $gtf \\
         --fasta $fasta \\
-        -o ${fasta.baseName}_genes.gtf
+        --prefix ${fasta.baseName}
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":

diff --git a/modules/local/multiqc/main.nf b/modules/local/multiqc/main.nf
@@ -3,8 +3,8 @@ process MULTIQC {
 
     conda "bioconda::multiqc=1.17"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_0' :
-        'biocontainers/multiqc:1.17--pyhdfd78af_0' }"
+        'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_1' :
+        'biocontainers/multiqc:1.17--pyhdfd78af_1' }"
 
     input:
     path multiqc_config

diff --git a/modules/nf-core/kallisto/quant/main.nf b/modules/nf-core/kallisto/quant/main.nf
diff --git a/nextflow.config b/nextflow.config
@@ -17,6 +17,8 @@ params {
     splicesites                = null
     gtf_extra_attributes       = 'gene_name'
     gtf_group_features         = 'gene_id'
+    skip_gtf_filter            = false
+    skip_gtf_transcript_filter = false
     featurecounts_feature_type = 'exon'
     featurecounts_group_type   = 'gene_biotype'
     gencode                    = false

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -85,6 +85,17 @@
                     "description": "Path to GFF3 annotation file.",
                     "help_text": "This parameter must be specified if `--genome` or `--gtf` are not specified."
                 },
+                "skip_gtf_filter": {
+                    "type": "boolean",
+                    "fa_icon": "fas fa-forward",
+                    "description": "Skip filtering of GTF for valid scaffolds and/or transcript IDs.",
+                    "help_text": "If you're confident in the validity of the GTF with respect to the genome file, or wish to disregard failures triggered by the filtering module, activate this option."
+                },
+                "skip_gtf_transcript_filter": {
+                    "type": "boolean",
+                    "fa_icon": "fas fa-forward",
+                    "description": "Skip just the transcript_id check of GTF filtering."
+                },
                 "gene_bed": {
                     "type": "string",
                     "format": "file-path",