# genomic-surveillance.yaml

# Data sources for this analysis. Each entry has a name plus the
# locations of its metadata and sequences.
inputs:
  # Reference files hosted on data.nextstrain.org.
  - name: 'reference_data'
    metadata: 'https://data.nextstrain.org/files/ncov/open/reference/metadata.tsv.xz'
    sequences: 'https://data.nextstrain.org/files/ncov/open/reference/sequences.fasta.xz'
  # Local archive; metadata and sequences are both read from the
  # same .tar.gz file.
  - name: 'corvaseq'
    metadata: 'gisaid_auspice_omicron_corvaseq/omicron_corvaseq_augur.tar.gz'
    sequences: 'gisaid_auspice_omicron_corvaseq/omicron_corvaseq_augur.tar.gz'
  # Local North America archive; again a single .tar.gz file supplies
  # both metadata and sequences.
  - name: 'background_data'
    metadata: 'data/ncov_north-america.tar.gz'
    sequences: 'data/ncov_north-america.tar.gz'

# Root the tree on "Wuhan-Hu-1/2019". This strain is included in the
# GenBank data used by this build.
refine:
  root: 'Wuhan-Hu-1/2019'

# Define a single build for the state of interest, North Carolina.
# The build name is "northcarolina" and it uses the custom
# subsampling scheme named "northcarolina_scheme".
builds:
  northcarolina:
    title: 'North Carolina-specific genomic surveillance build'
    subsampling_scheme: 'northcarolina_scheme'
    # Auspice configuration that defines colorings for the input data
    # sources (e.g. "background_data" is "yes" or "no").
    auspice_config: 'my-ncov-analyses/auspice-config-custom-data.json'

# Define a single subsampling scheme for the North Carolina build.
# This analysis is for a specific date range, so every context
# section of the scheme below uses the same minimum and maximum
# collection dates.
subsampling:
  northcarolina_scheme:
    # Include all data from the "corvaseq" input.
    # When the workflow merges metadata from multiple
    # inputs, it creates a boolean column for each input to
    # indicate which input each record came from. A record
    # from the "corvaseq" input will have a value of "yes" in a
    # column named "corvaseq". The same record will have a column
    # for the "background_data" input with a value of "no".
    #
    # No "max_sequences" limit is set for this sample. If the build
    # grows too large, add one here and tune it along with the other
    # "max_sequences" values in the sections below to keep your
    # final build to <10,000 records.
    custom_sample:
      query: --query "(corvaseq == 'yes')"
    # To understand transmission within North Carolina, select a
    # subset of North Carolina records that did not come from the
    # "corvaseq" input, with priority given to strains that are
    # genetically similar to the strains in the "custom_sample"
    # subsampling set defined above.
    nc_context:
      # The per-input column is named after the input ("corvaseq");
      # no input named "custom_data" exists in this configuration.
      query: --query "(corvaseq != 'yes') & (country == 'USA' & division == 'North Carolina')"
      # This value sets a hard upper limit on how many strains
      # make it into the analysis. Tune this value, based on
      # your needs for the resulting tree.
      max_sequences: 1000
      # Restrict records to the collection date window of interest.
      min_date: "--min-date 2021-10-31"
      max_date: "--max-date 2022-03-05"
      # These group-by columns attempt to evenly sample across
      # year and month. Sequences in each group are prioritized by
      # genetic proximity to "custom_sample".
      group_by: year month
      priorities:
        type: proximity
        focus: custom_sample
    # To understand transmission patterns within the US that
    # led to introductions to North Carolina, select a subset of USA
    # data from states other than North Carolina with priority given
    # to strains that are genetically similar to the strains in
    # the "custom_sample" subsampling set.
    usa_context:
      query: --query "(corvaseq != 'yes') & (country == 'USA' & division != 'North Carolina')"
      # This value sets a hard upper limit on how many strains
      # make it into the analysis. Tune this value, based on
      # your needs for the resulting tree.
      max_sequences: 100
      # Restrict records to the collection date window of interest.
      min_date: "--min-date 2021-10-31"
      max_date: "--max-date 2022-03-05"
      # These group-by columns attempt to evenly sample across
      # US states by year and month. Sequences in each group
      # of state, year, and month are prioritized by genetic
      # proximity to "custom_sample".
      group_by: division year month
      priorities:
        type: proximity
        focus: custom_sample
    # Select a small subset of all records that did not come from
    # the "corvaseq" input for broader context. This example
    # prioritizes strains that are genetically related to the
    # "custom_sample" subsampling set, but you can remove the
    # "priorities" block to get a random context instead.
    northamerica_context:
      query: --query "(corvaseq != 'yes')"
      # As with the contextual data from the USA above, tune
      # this value to get a reasonable number of strains in
      # your build.
      max_sequences: 10
      # Restrict records to the collection date window of interest.
      min_date: "--min-date 2021-10-31"
      max_date: "--max-date 2022-03-05"
      priorities:
        type: proximity
        focus: custom_sample