Skip to content

Commit

Permalink
Merge pull request #46 from nextstrain/nextclade_v3
Browse files Browse the repository at this point in the history
Nextclade v3
  • Loading branch information
rneher authored Dec 14, 2023
2 parents 6f84388 + b260651 commit 65667c1
Show file tree
Hide file tree
Showing 18 changed files with 139 additions and 1,608 deletions.
5 changes: 0 additions & 5 deletions config/auspice_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,6 @@
"build_url": "https://github.com/nextstrain/rsv",

"colorings": [
{
"key": "genome_clade",
"title": "Genome Clade",
"type": "categorical"
},
{
"key": "gt",
"title": "Genotype",
Expand Down
76 changes: 76 additions & 0 deletions ingest/bin/extend-metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
from Bio import SeqIO
import numpy as np
import pandas as pd
from Bio import SeqIO
from collections import defaultdict

# Column of the nextclade TSV that is used as its index when joining
# with the metadata table.
NEXTCLADE_JOIN_COLUMN_NAME = 'seqName'

# Placeholder string for missing values.
# NOTE(review): not referenced anywhere in the visible script -- confirm
# whether it is still needed.
VALUE_MISSING_DATA = '?'

# Mapping from nextclade output column names to the column names written
# to the extended metadata (applied via DataFrame.rename).
column_map = {
    "clade": "clade",
    "lineage": "lineage",
    "coverage": "genome_coverage",
    "totalMissing": "missing_data",
    "totalSubstitutions": "divergence",
    "totalNonACGTNs": "nonACGTN",
}

# Per virus type ('a' / 'b'): [start, end] interval of each gene, used as
# the target interval when computing per-gene coverage.
# NOTE(review): presumably positions on the respective reference genome --
# confirm which reference and whether the ends are inclusive.
coordinates = {
    'a': {
        'G': [4652, 5617],
        'F': [5697, 7421],
    },
    'b': {
        'G': [4646, 5578],
        'F': [5676, 7400],
    },
}

def coverage(target, total):
    """Return the fraction of the ``target`` interval covered by ``total``.

    Parameters
    ----------
    target : sequence of two numbers
        ``[start, end]`` of the region of interest (e.g. a gene).
    total : sequence of two numbers
        ``[start, end]`` of the region that is actually present
        (e.g. the aligned part of a sequence).

    Returns
    -------
    int or float
        ``0`` if the intervals do not overlap, ``1`` if ``total`` spans all
        of ``target``, otherwise the length of the intersection divided by
        the length of ``target``.
    """
    target_start, target_end = target[0], target[1]
    total_start, total_end = total[0], total[1]

    if total_start > target_end or total_end < target_start:
        # no overlap
        return 0
    if total_start <= target_start and total_end >= target_end:
        # target is completely covered
        return 1

    # Partial overlap: measure the intersection directly. This handles all
    # partial cases uniformly, including the one where ``total`` starts inside
    # ``target`` and ends exactly at ``target_end`` (which a chain of strict
    # comparisons would misclassify as an overlap from the left).
    overlap = min(total_end, target_end) - max(total_start, target_start)
    return overlap / (target_end - target_start)


if __name__ == "__main__":
    # Command-line entry point: join the metadata TSV with the nextclade TSV
    # and append one "<gene>_coverage" column per gene of the chosen virus
    # type, then write the result as TSV.
    import argparse
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument("--metadata")
    parser.add_argument("--nextclade")
    parser.add_argument("--id-field")
    # The value is used as a key into `coordinates`, so reject anything else
    # up front with a readable error instead of a KeyError further down.
    parser.add_argument("--virus-type", choices=sorted(coordinates))
    parser.add_argument("--output", default=sys.stdout)
    args = parser.parse_args()

    # na_filter=False keeps empty fields as empty strings instead of NaN.
    metadata = pd.read_csv(
        args.metadata,
        index_col=args.id_field,
        sep='\t',
        low_memory=False,
        na_filter=False,
    )

    # Read and rename clade column to be more descriptive
    clades = pd.read_csv(
        args.nextclade,
        index_col=NEXTCLADE_JOIN_COLUMN_NAME,
        sep='\t',
        low_memory=False,
        na_filter=False,
    ).rename(columns=column_map)

    # Left join on the index: every metadata row is kept, rows without a
    # nextclade record get missing values in the nextclade columns.
    result = pd.merge(
        metadata, clades,
        left_index=True,
        right_index=True,
        how='left'
    )

    for gene, gene_range in coordinates[args.virus_type].items():
        # Bind the range as a default argument so the helper does not depend
        # on the loop variable's value at call time.
        def get_coverage(d, gene_range=gene_range):
            try:
                aligned = [int(d.alignmentStart), int(d.alignmentEnd)]
            except (AttributeError, TypeError, ValueError):
                # Missing column, empty string or NaN from the left join.
                # Report on stderr: stdout may be the output TSV itself
                # (default of --output), which must not be polluted.
                print('missing alignment for ', d.name, file=sys.stderr)
                return np.nan
            return coverage(gene_range, aligned)

        result[f"{gene}_coverage"] = result.apply(get_coverage, axis=1)

    result.to_csv(args.output, index_label=args.id_field, sep='\t')
71 changes: 0 additions & 71 deletions ingest/bin/gene-coverage.py

This file was deleted.

19 changes: 0 additions & 19 deletions ingest/bin/metadata_dedup.py

This file was deleted.

31 changes: 0 additions & 31 deletions ingest/bin/sequencesandmetadata.py

This file was deleted.

75 changes: 0 additions & 75 deletions ingest/bin/sort.py

This file was deleted.

Loading

0 comments on commit 65667c1

Please sign in to comment.