Skip to content

Commit

Permalink
Merge branch 'mk-model-parse' into 'dev'
Browse files Browse the repository at this point in the history
Parse model names from guppy/minknow files

See merge request research/medaka!570
  • Loading branch information
cjw85 committed Nov 29, 2023
2 parents 78d47ef + f20cb1a commit 159ada1
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 6 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [unreleased]
## [v1.11.2]
### Added
- Parsing model information from fastq headers output by Guppy and MinKNOW.
### Changed
- Additional explanatory information in VCF INFO fields concerning depth calculations.

Expand Down
2 changes: 1 addition & 1 deletion medaka/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import subprocess
import sys

__version__ = "1.11.1"
__version__ = "1.11.2"

try:
import pyabpoa as abpoa
Expand Down
16 changes: 12 additions & 4 deletions medaka/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,16 +173,24 @@ def _model_from_fastq(fname):
models = set()
with pysam.FastxFile(fname, 'r') as fastq:
for rec in itertools.islice(fastq, 100):
# model is embedded in RG:Z: tag of comment as
# <run_id>_<model>_<barcode>, but model has _
# characters in also so search for known models
try:
# dorado SAM converted to FASTQ with e.g. samtools fastq
# model is embedded in RG:Z: tag of comment as
# <run_id>_<model>_<barcode>, but model has _
# characters in also so search for known models
read_group = rec.comment.split("RG:Z:")[1].split()[0]
for model in known_models:
if model in read_group:
models.add(model)
except Exception:
pass
# minknow/guppy
# basecall_model_version_id=<model>
try:
model = rec.comment.split(
"basecall_model_version_id=")[1].split()[0]
models.add(model)
except Exception:
pass
if len(models) > 1:
# filter out any models without an `@`. These are likely FPs of
# the search above (there are unversioned models whose name
Expand Down
Binary file added medaka/test/data/bc_model_scrape_minknow.fastq.gz
Binary file not shown.
8 changes: 8 additions & 0 deletions medaka/test/test_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ class TestScrapBasecaller(unittest.TestCase):
root_dir = os.path.abspath(os.path.dirname(__file__))
bam = os.path.join(root_dir, 'data/bc_model_scrape.bam')
fastq = os.path.join(root_dir, 'data/bc_model_scrape.fastq.gz')
fastq_minknow = os.path.join(root_dir, 'data/bc_model_scrape_minknow.fastq.gz')

def test_000_from_bam_consensus(self):
model = models.model_from_basecaller(self.bam, variant=False)
Expand All @@ -88,6 +89,13 @@ def test_011_from_fastq_variant(self):
model = models.model_from_basecaller(self.fastq, variant=True)
self.assertEqual(model, "r1041_e82_400bps_hac_variant_v4.2.0")

def test_020_from_fastq_minknow(self):
model = models.model_from_basecaller(self.fastq_minknow, variant=False)
self.assertEqual(model, "r1041_e82_400bps_sup_v4.2.0")

def test_021_from_fastq_minknow_variant(self):
model = models.model_from_basecaller(self.fastq_minknow, variant=True)
self.assertEqual(model, "r1041_e82_400bps_sup_variant_v4.2.0")

class TestBuildModel(unittest.TestCase):

Expand Down

0 comments on commit 159ada1

Please sign in to comment.