generated from nextstrain/ncov-tutorial
-
Notifications
You must be signed in to change notification settings - Fork 0
/
genomic-surveillance.yaml
99 lines (96 loc) · 4.26 KB
/
genomic-surveillance.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# Data sources for this analysis. Each entry is named, and that name is
# what the subsampling queries use to tell the sources apart.
inputs:
  # Reference metadata and sequences hosted on data.nextstrain.org.
  - name: reference_data
    metadata: 'https://data.nextstrain.org/files/ncov/open/reference/metadata.tsv.xz'
    sequences: 'https://data.nextstrain.org/files/ncov/open/reference/sequences.fasta.xz'
  # Focal data set. A single local archive supplies both the metadata
  # and the sequences.
  - name: corvaseq
    metadata: 'gisaid_auspice_omicron_corvaseq/omicron_corvaseq_augur.tar.gz'
    sequences: 'gisaid_auspice_omicron_corvaseq/omicron_corvaseq_augur.tar.gz'
  # Contextual data set. As above, one local archive supplies both the
  # metadata and the sequences.
  - name: background_data
    metadata: 'data/ncov_north-america.tar.gz'
    sequences: 'data/ncov_north-america.tar.gz'
# Root the tree on "Wuhan-Hu-1/2019", a strain that is included in the
# GenBank data.
refine:
  root: 'Wuhan-Hu-1/2019'
# Declare one build for the state of interest, North Carolina.
# The build name is "northcarolina" and it uses the custom
# subsampling scheme "northcarolina_scheme".
builds:
  northcarolina:
    title: 'North Carolina-specific genomic surveillance build'
    subsampling_scheme: northcarolina_scheme
    # Auspice configuration that defines colorings for the input data
    # sources (e.g. "background_data" is "yes" or "no").
    auspice_config: my-ncov-analyses/auspice-config-custom-data.json
# Define a single subsampling scheme for the state of North Carolina.
# This analysis covers a specific date range, so every contextual
# section below uses the same minimum and maximum collection dates.
subsampling:
  northcarolina_scheme:
    # Include all records from the "corvaseq" input.
    # When the workflow merges metadata from multiple inputs, it
    # creates a yes/no column for each input to indicate which input
    # each record came from. A record from the "corvaseq" input will
    # have a value of "yes" in a column named "corvaseq". The same
    # record will have a column for the "background_data" input with
    # a value of "no".
    custom_sample:
      query: --query "(corvaseq == 'yes')"
      # No "max_sequences" is set for this section. Add one here, and
      # tune it along with the other "max_sequences" values in the
      # sections below, if you need to keep your final build to
      # <10,000 records.

    # Local context: North Carolina records that did not come from
    # the "corvaseq" input, with priority given to strains that are
    # genetically similar to the strains in the "custom_sample"
    # subsampling set defined above.
    nc_context:
      # The input column must be "corvaseq": no input in this file is
      # named "custom_data", so no such column exists to filter on.
      query: --query "(corvaseq != 'yes') & (country == 'USA' & division == 'North Carolina')"
      # This value sets a hard upper limit on how many strains
      # make it into the analysis. Tune this value, based on
      # your needs for the resulting tree.
      max_sequences: 1000
      # Restrict this section to the analysis date range.
      min_date: "--min-date 2021-10-31"
      max_date: "--max-date 2022-03-05"
      # These group-by columns attempt to evenly sample across
      # year and month. Sequences in each group are prioritized by
      # genetic proximity to "custom_sample".
      group_by: year month
      priorities:
        type: proximity
        focus: custom_sample

    # To understand transmission patterns within the US that led to
    # introductions to North Carolina, select a subset of USA data
    # from states other than North Carolina, with priority given to
    # strains that are genetically similar to the strains in the
    # "custom_sample" subsampling set defined above.
    usa_context:
      query: --query "(corvaseq != 'yes') & (country == 'USA' & division != 'North Carolina')"
      # This value sets a hard upper limit on how many strains
      # make it into the analysis. Tune this value, based on
      # your needs for the resulting tree.
      max_sequences: 100
      # Restrict this section to the analysis date range.
      min_date: "--min-date 2021-10-31"
      max_date: "--max-date 2022-03-05"
      # These group-by columns attempt to evenly sample across
      # US states by year and month. Sequences in each group
      # of state, year, and month are prioritized by genetic
      # proximity.
      group_by: division year month
      priorities:
        type: proximity
        focus: custom_sample

    # Select a subset of the remaining data for broader context.
    # The query excludes only "corvaseq" records, so it is not limited
    # to the USA. This example prioritizes strains that are
    # genetically related to the "custom_sample" subsampling set, but
    # you can remove the "priorities" block to get a random context
    # instead.
    northamerica_context:
      query: --query "(corvaseq != 'yes')"
      # As with the contextual data from the USA above, tune
      # this value to get a reasonable number of strains in
      # your build.
      max_sequences: 10
      # Restrict this section to the analysis date range.
      min_date: "--min-date 2021-10-31"
      max_date: "--max-date 2022-03-05"
      priorities:
        type: proximity
        focus: custom_sample