Skip to content

Commit

Permalink
Merge pull request #9 from ShawHahnLab/release-0.1.1
Browse files Browse the repository at this point in the history
Version 0.1.1
  • Loading branch information
ressy authored Dec 10, 2021
2 parents 8369b0f + 682669b commit 3e121fe
Show file tree
Hide file tree
Showing 37 changed files with 674 additions and 66 deletions.
2 changes: 1 addition & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
command: |
cd igseq
source ~/miniconda3/bin/activate igseq
TEST_IGSEQ_LIVE=no python -m unittest -v
TEST_IGSEQ_LIVE=yes python -m unittest -v
- run:
name: Install python package
command: |
Expand Down
29 changes: 27 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,30 @@
## Changelog
# Changelog

# 0.1.0 - 2021-11-19
## 0.1.1 - 2021-12-07

### Changed

* Show a warning for zero file matches in list and show commands ([#8])

### Fixed

* IgBLAST output is now shown even if it crashes ([#12])
* Header-only CSV/TSV files no longer crash the show command ([#8])
* summarize command now works with multiple references ([#7])
* vdj-match and summarize commands now work as intended via igblast, and all
have basic automated tests ([#5])
* igblastn arguments can now be given with a two-dash prefix to ensure they
aren't interpreted as igseq arguments ([#4])
* Duplicate FASTA paths found in vdj-gather will no longer result in
duplicated output sequences ([#2])

[#12]: https://github.com/ShawHahnLab/igseq/pull/12
[#8]: https://github.com/ShawHahnLab/igseq/pull/8
[#7]: https://github.com/ShawHahnLab/igseq/pull/7
[#5]: https://github.com/ShawHahnLab/igseq/pull/5
[#4]: https://github.com/ShawHahnLab/igseq/pull/4
[#2]: https://github.com/ShawHahnLab/igseq/pull/2

## 0.1.0 - 2021-11-19

First beta release
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@ sequences. Your mileage may vary.

## Install

Install [Miniconda](https://docs.conda.io/en/latest/miniconda.html)
First, install [Miniconda](https://docs.conda.io/en/latest/miniconda.html).

Install from the latest version via <https://anaconda.org/ShawHahnLab/igseq>:
Then install from the latest version via <https://anaconda.org/ShawHahnLab/igseq>:

conda create --name igseq -c conda-forge -c bioconda -c ShawHahnLab igseq
conda activate igseq

Or, install from the latest source here:

Expand Down
2 changes: 1 addition & 1 deletion conda/meta.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# https://docs.conda.io/projects/conda-build/en/latest/resources/define-metadata.html
{% set version = "0.1.0" %}
{% set version = "0.1.1" %}
{% set build = "0" %}

package:
Expand Down
22 changes: 16 additions & 6 deletions igseq/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from . import vdj_gather
from . import vdj_match
from . import tab2seq
from .show import show_files, list_files
from . import show
from .util import IgSeqError
from .version import __version__

Expand All @@ -30,7 +30,11 @@ def rewrap(txt):
# if the terminal width was supposedly 0, we'll just use 80
if not width:
width = 80
wrap = lambda txt: "\n".join(textwrap.wrap(txt, width=width))
def wrap(txt):
if txt.startswith(" "):
return txt
return "\n".join(textwrap.wrap(txt, width=width))
#wrap = lambda txt: "\n".join(textwrap.wrap(txt, width=width))
chunks = txt.strip().split("\n\n")
return "\n\n".join([wrap(chunk) for chunk in chunks])

Expand Down Expand Up @@ -128,10 +132,10 @@ def _main_merge(args):
threads=args.threads)

def _main_show(args):
show_files(text_items=args.text, force=args.force)
show.show_files(text_items=args.text, force=args.force)

def _main_list(args):
list_files(text_items=args.text)
show.list_files(text_items=args.text)

def _main_igblast(args, extra_igblastn_args=None):
igblast.igblast(
Expand Down Expand Up @@ -240,8 +244,14 @@ def __setup_arg_parser():
help="Convert CSV/TSV to FASTA/FASTQ",
description=rewrap(tab2seq.__doc__),
formatter_class=argparse.RawDescriptionHelpFormatter)
p_show = subps.add_parser("show", help="show builtin reference data")
p_list = subps.add_parser("list", help="list builtin reference data files")
p_show = subps.add_parser("show",
help="show file contents",
description=rewrap(show.__doc__),
formatter_class=argparse.RawDescriptionHelpFormatter)
p_list = subps.add_parser("list",
help="list builtin reference data files",
description=rewrap(show.__doc__),
formatter_class=argparse.RawDescriptionHelpFormatter)

__add_common_args(p_get)
p_get.add_argument("input", help="one Illumina run directory")
Expand Down
32 changes: 32 additions & 0 deletions igseq/data/examples/igblast.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/usr/bin/env bash

[ -v EXAMPLES ] || EXAMPLES=$(python -c 'import igseq.util; print(igseq.util.DATA)')/examples

# An arbitrary antibody sequence pulled from one of our datasets that looks
# complete and in-frame
QUERY=$EXAMPLES/inputs/igblast/query.fasta

# using the built-in Rhesus germline reference from IMGT and using the default
# text output
igseq igblast -r rhesus/imgt -Q $QUERY

# Trying with AIRR output, saved to a file
igseq igblast -r rhesus/imgt -Q $QUERY -outfmt 19 -out igblast.tsv

# With the AIRR output, attributes are stored in columns with one row for each
# query. For example, the V call and % identity:
cut -f 10,62 igblast.tsv

# or we can include all rhesus references, but igseq will append suffixes to
# the names for any overlapping loci/segments from the reference FASTA files.
# (IgBLAST can't handle repeated sequence IDs for the V/D/J databases.)
# Here we have two equivalent possible V genes, one from IMGT and one from
# Bernat et al. 2021 (https://doi.org/10.1016/j.immuni.2020.12.018)
igseq igblast -r rhesus -Q $QUERY -outfmt 19 | cut -f 10,62

# The AIRR format is one row per query sequence, but there's also IgBLAST's own
# tabular output format that can give more information on separate lines, like
# additional gene assignments.
# The -num_alignments_V argument clashes with iseq's -n, so we need to use --
# to clarify. igseq will remove the extra - when calling igblastn.
igseq igblast -r rhesus -Q $QUERY -outfmt 7 --num_alignments_V 5
8 changes: 8 additions & 0 deletions igseq/data/examples/inputs/igblast/query.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
>query
CAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTC
ACCTGCGCTGTCTCTGGTGGCTCCATCAGCAGTAACTACTGGAGCTGGATCCGCCAGCCC
CCAGGGAAGGGACTGGAGTGGATTGGACGTATCTCTGGTAGTGGTGGGAGCCCCGACTAC
AACCCCTCCCTCAAGAGTCGAGTCACCATTTCAACAGACACGTCCAAGAGCCAGTTCTCC
CTGAAGCTGAGCTCTGTGACCGCCGCGGACACGGCCGTGTATTACTGTGCGAGAGATACT
TACAGTAACATCCCACCCAACTTTGACTACTGGGGCCAGGGAGTCCTGGTCACCGTCTCC
TCAG
2 changes: 2 additions & 0 deletions igseq/data/examples/outputs/igblast/igblast.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
sequence_id sequence locus stop_codon vj_in_frame v_frameshift productive rev_comp complete_vdj v_call d_call j_call sequence_alignment germline_alignment sequence_alignment_aa germline_alignment_aa v_alignment_start v_alignment_end d_alignment_start d_alignment_end j_alignment_start j_alignment_end v_sequence_alignment v_sequence_alignment_aa v_germline_alignment v_germline_alignment_aa d_sequence_alignment d_sequence_alignment_aa d_germline_alignment d_germline_alignment_aa j_sequence_alignment j_sequence_alignment_aa j_germline_alignment j_germline_alignment_aa fwr1 fwr1_aa cdr1 cdr1_aa fwr2 fwr2_aa cdr2 cdr2_aa fwr3 fwr3_aa fwr4 fwr4_aa cdr3 cdr3_aa junction junction_length junction_aa junction_aa_length v_score d_score j_score v_cigar d_cigar j_cigar v_support d_support j_support v_identity d_identity j_identity v_sequence_start v_sequence_end v_germline_start v_germline_end d_sequence_start d_sequence_end d_germline_start d_germline_end j_sequence_start j_sequence_end j_germline_start j_germline_end fwr1_start fwr1_end cdr1_start cdr1_end fwr2_start fwr2_end cdr2_start cdr2_end fwr3_start fwr3_end fwr4_start fwr4_end cdr3_start cdr3_end np1 np1_length np2 np2_length
query CAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCGCTGTCTCTGGTGGCTCCATCAGCAGTAACTACTGGAGCTGGATCCGCCAGCCCCCAGGGAAGGGACTGGAGTGGATTGGACGTATCTCTGGTAGTGGTGGGAGCCCCGACTACAACCCCTCCCTCAAGAGTCGAGTCACCATTTCAACAGACACGTCCAAGAGCCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCCGCGGACACGGCCGTGTATTACTGTGCGAGAGATACTTACAGTAACATCCCACCCAACTTTGACTACTGGGGCCAGGGAGTCCTGGTCACCGTCTCCTCAG IGH F T F T F T IGHV4-173*01 IGHD4-23*01,IGHD4-41*01 IGHJ4*01 CAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCGCTGTCTCTGGTGGCTCCATCAGCAGTAACTACTGGAGCTGGATCCGCCAGCCCCCAGGGAAGGGACTGGAGTGGATTGGACGTATCTCTGGTAGTGGTGGGAGCCCCGACTACAACCCCTCCCTCAAGAGTCGAGTCACCATTTCAACAGACACGTCCAAGAGCCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCCGCGGACACGGCCGTGTATTACTGTGCGAGAGATACTTACAGTAACATCCCACCCAACTTTGACTACTGGGGCCAGGGAGTCCTGGTCACCGTCTCCTCAG CAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCGCTGTCTCTGGTGGCTCCATCAGCAGTAACTACTGGAGCTGGATCCGCCAGCCCCCAGGGAAGGGACTGGAGTGGATTGGACGTATCTCTGGTAGTGGTGGGAGCACCGACTACAACCCCTCCCTCAAGAGTCGAGTCACCATTTCAACAGACACGTCCAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCCGCGGACACGGCCGTGTATTACTGTGCGAGAGANNNNTACAGTAACNNNNNNNNNNACTTTGACTACTGGGGCCAGGGAGTCCTGGTCACCGTCTCCTCAG QLQLQESGPGLVKPSETLSLTCAVSGGSISSNYWSWIRQPPGKGLEWIGRISGSGGSPDYNPSLKSRVTISTDTSKSQFSLKLSSVTAADTAVYYCARDTYSNIPPNFDYWGQGVLVTVSS QLQLQESGPGLVKPSETLSLTCAVSGGSISSNYWSWIRQPPGKGLEWIGRISGSGGSTDYNPSLKSRVTISTDTSKNQFSLKLSSVTAADTAVYYCARXXYSNXXXXFDYWGQGVLVTVSS 1 296 301 309 320 364 CAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCGCTGTCTCTGGTGGCTCCATCAGCAGTAACTACTGGAGCTGGATCCGCCAGCCCCCAGGGAAGGGACTGGAGTGGATTGGACGTATCTCTGGTAGTGGTGGGAGCCCCGACTACAACCCCTCCCTCAAGAGTCGAGTCACCATTTCAACAGACACGTCCAAGAGCCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCCGCGGACACGGCCGTGTATTACTGTGCGAGAGA QLQLQESGPGLVKPSETLSLTCAVSGGSISSNYWSWIRQPPGKGLEWIGRISGSGGSPDYNPSLKSRVTISTDTSKSQFSLKLSSVTAADTAVYYCAR CAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCGCTGTCTCTGGTGGCTCCATCAGCAGTAACTACTGGAGCTGGATCCGCCAGCCCCCAGGGAAGGGACTGGAGTGGATTGGACGTATCTCTGGTAGTGGTGGGAGCACCGACTACAACCCCTCCCTCAAGAGTCGAGTCACCATTTCAACAGACACGTCCAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCCGCGGACACGGCCGTGTATTACTGTGCGAGAGA QLQLQESGPGLVKPSETLSLTCAVSGGSISSNYWSWIRQPPGKGLEWIGRISGSGGSTDYNPSLKSRVTISTDTSKNQFSLKLSSVTAADTAVYYCAR TACAGTAAC YSN TACAGTAAC YSN ACTTTGACTACTGGGGCCAGGGAGTCCTGGTCACCGTCTCCTCAG FDYWGQGVLVTVSS ACTTTGACTACTGGGGCCAGGGAGTCCTGGTCACCGTCTCCTCAG FDYWGQGVLVTVSS CAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCGCTGTCTCT QLQLQESGPGLVKPSETLSLTCAVS GGTGGCTCCATCAGCAGTAACTAC GGSISSNY TGGAGCTGGATCCGCCAGCCCCCAGGGAAGGGACTGGAGTGGATTGGACGT WSWIRQPPGKGLEWIGR ATCTCTGGTAGTGGTGGGAGCCCC ISGSGGSP GACTACAACCCCTCCCTCAAGAGTCGAGTCACCATTTCAACAGACACGTCCAAGAGCCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCCGCGGACACGGCCGTGTATTACTGT DYNPSLKSRVTISTDTSKSQFSLKLSSVTAADTAVYYC TGGGGCCAGGGAGTCCTGGTCACCGTCTCCTCA WGQGVLVTVSS GCGAGAGATACTTACAGTAACATCCCACCCAACTTTGACTAC ARDTYSNIPPNFDY TGTGCGAGAGATACTTACAGTAACATCCCACCCAACTTTGACTACTGG 48 CARDTYSNIPPNFDYW 16 456.805 17.992 87.208 296M68S 300S4N9M55S3N 319S3N45M 1.262e-130 7.987e-01 1.596e-21 99.324 100.000 100.000 1 296 1 296 301 309 5 13 320 364 4 48 1 75 76 99 100 150 151 174 175 288 331 363 289 330 TACT 4 ATCCCACCCA 10
2 changes: 1 addition & 1 deletion igseq/data/examples/readproc.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
# output directory is left implicit, but given as the input for the next
# command.

EXAMPLES=$(python -c 'import igseq.util; print(igseq.util.DATA)')/examples
[ -v EXAMPLES ] || EXAMPLES=$(python -c 'import igseq.util; print(igseq.util.DATA)')/examples

# Extract an example Illumina run directory with just a handful of reads
tar xzf $EXAMPLES/runs/YYMMDD_M05588_0232_000000000-CLL8M.tgz
Expand Down
45 changes: 31 additions & 14 deletions igseq/igblast.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,14 @@
Any command-line arguments not recognized here are passed as-is to the igblastn
command, so you can configure things like the output format and file path. See
igblastn -help for those options.
igblastn -help for those options. Any igblastn argument can be given with two
dashes if needed to force igseq to handle it correctly (for example,
-num_alignments_V will be interprted as -n um_alignments_V, but
--num_alignments_V will work).
"""

import re
import sys
import logging
from pathlib import Path
import subprocess
Expand Down Expand Up @@ -61,23 +65,31 @@ def igblast(
for attrs in attrs_list:
LOGGER.info("detected ref path: %s", attrs["path"])
LOGGER.info("detected ref type: %s", attrs["type"])
organism = detect_organism(attrs_list, species)
species_det = {attrs.get("species") for attrs in attrs_list}
species_det = {s for s in species_det if s}
organism = detect_organism(species_det, species)
if not dry_run:
setup_db_dir_and_igblast(
[attrs["path"] for attrs in attrs_list],
organism, query_path, db_path, threads, extra_args)

def detect_organism(attrs_list, species=None):
try:
proc, _ = setup_db_dir_and_igblast(
[attrs["path"] for attrs in attrs_list],
organism, query_path, db_path, threads, extra_args,
capture_output=True, text=True, check=True)
except subprocess.CalledProcessError as err:
sys.stdout.write(err.stdout)
sys.stderr.write(err.stderr)
raise util.IgSeqError("IgBLAST crashed") from err
sys.stdout.write(proc.stdout)
sys.stderr.write(proc.stderr)

def detect_organism(species_det, species=None):
"""Determine IgBLAST organism name from multiple species name inputs
attrs_list: output from vdj.parse_vdj_paths
species_det: set of possible species names to use
species: optional overriding species name
This includes some fuzzy matching so things like "rhesus_monkey", "RHESUS",
"rhesus" will all map to "rhesus_monkey" for IgBLAST.
"""
species_det = {attrs.get("species") for attrs in attrs_list}
species_det = {s for s in species_det if s}
if not species and not species_det:
raise util.IgSeqError(
"species not detected from input. specify a species manually.")
Expand All @@ -98,7 +110,7 @@ def detect_organism(attrs_list, species=None):
organism = SPECIESMAP[species]
except KeyError as err:
keys = str(SPECIESMAP.keys())
raise util.IgSeqError("species not recognized. should be one of: %s" % keys) from err
raise util.IgSeqError(f"species not recognized. should be one of: {keys}") from err
LOGGER.info("detected IgBLAST organism: %s", organism)
return organism

Expand All @@ -117,7 +129,7 @@ def setup_db_dir_and_igblast(vdj_ref_paths, organism, query_path,
for segment, attrs in vdj.group(attrs_list).items():
LOGGER.info("detected %s references: %d", segment, len(attrs))
if len(attrs) == 0:
raise util.IgSeqError("No references for segment %s" % segment)
raise util.IgSeqError(f"No references for segment {segment}")
aux.make_aux_file(db_dir/"J.fasta", db_dir/f"{organism}_gl.aux")
makeblastdbs(db_dir)
args = [
Expand All @@ -130,6 +142,8 @@ def setup_db_dir_and_igblast(vdj_ref_paths, organism, query_path,
"-ig_seqtype", "Ig",
"-num_threads", threads]
if extra_args:
# remove any extra - at the start
extra_args = [re.sub("^--", "-", arg) for arg in extra_args]
# make sure none of the extra arguments, if there are any, clash
# with the ones we've defined above.
args_dashes = {arg for arg in args if str(arg).startswith("-")}
Expand All @@ -138,9 +152,11 @@ def setup_db_dir_and_igblast(vdj_ref_paths, organism, query_path,
if shared:
raise util.IgSeqError(f"igblastn arg collision from extra arguments: {shared}")
args += extra_args
return _run_igblastn(args, **runargs)
proc = _run_igblastn(args, **runargs)
return proc, attrs_list

def makeblastdbs(dir_path):
"""Run makeblastdb for existing V.fasta, D.fasta, J.fasta in a directory."""
for segment in ["V", "D", "J"]:
_run_makeblastdb(f"{dir_path}/{segment}.fasta")

Expand All @@ -162,4 +178,5 @@ def _run_igblastn(args, **runargs):
"""
args = [IGBLASTN] + [str(arg) for arg in args]
LOGGER.info("igblastn command: %s", args)
return subprocess.run(args, check=True, **runargs)
LOGGER.info("igblastn subprocess.run args: %s", runargs)
return subprocess.run(args, **runargs) # pylint: disable=subprocess-run-check
40 changes: 37 additions & 3 deletions igseq/show.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,59 @@
"""
Show contents of files built into the igseq package.
List or show contents of files.
The list and show commands can display files built into the igseq package, plus
some recognized file types from outside the package. Give one or more filename
fragments (for builtin files) or complete filenames (for all others). Builtin
files will be filtered to those matching ALL fragments, and external files will
include exact matches only.
igseq list list all data files built into the package
igseq list germ list germline reference files
igseq list germ V list germline ref files with V in their names
igseq show primers show igseq's primers.csv with nice formatting
igseq show IGHV.fasta show all builtin IGHV FASTAs concatenated
igseq show some/file.csv show an arbitrary CSV file
"""

import os
import sys
import logging
from pathlib import Path
from csv import DictReader
from .util import FILES

LOGGER = logging.getLogger(__name__)

def show_files(text_items, force=False):
"""Format and print matching files to stdout."""
if not text_items or not [item for item in text_items if item]:
LOGGER.warning("No items to show for %s", text_items)
return
text_items = [str(item) for item in text_items]
other_paths = []
for item in text_items:
if Path(item).is_file():
other_paths.append(item)
show_file(item, force)
text_items = [item for item in text_items if item not in other_paths]
internal_paths = []
if text_items:
for path in FILES:
if all([text in str(path) for text in text_items]):
if all(text in str(path) for text in text_items):
show_file(path, force)
internal_paths.append(path)
if not other_paths and not internal_paths:
LOGGER.warning("No files found for %s", text_items)

def list_files(text_items):
"""List all matching builtin data files on stdout."""
text_items = [str(item) for item in text_items]
for path in FILES:
if all([text in str(path) for text in text_items]):
if all(text in str(path) for text in text_items):
print(path)

def show_file(path, force=False):
"""Infer filetype from extention and pretty-print to stdout."""
path = Path(path)
if path.suffix in [".csv"]:
show_csv(path)
Expand All @@ -46,11 +74,15 @@ def show_file(path, force=False):
"Use --force if you're sure.\n")

def show_csv(path, **kwargs):
"""Pretty-print CSV file to stdout."""
with open(path, encoding="UTF8") as f_in:
reader = DictReader(f_in, **kwargs)
show_grid(list(reader))

def show_grid(grid):
"""Pretty-print list of dictionaries to stdout."""
if not grid:
return
fieldnames = grid[0].keys()
widths = {}
for key in fieldnames:
Expand All @@ -67,10 +99,12 @@ def show_grid(grid):
print("")

def show_text(path):
"""Print a plaintext file to stdout."""
with open(path, encoding="UTF8") as f_in:
sys.stdout.write(f_in.read())

def show_raw(path):
"""Print a binary file to stdout."""
# https://stackoverflow.com/a/54073813/4499968
with os.fdopen(sys.stdout.fileno(), "wb", closefd=False) as stdout, open(path, "rb") as f_in:
stdout.write(f_in.read())
Expand Down
Loading

0 comments on commit 3e121fe

Please sign in to comment.