nextstrain · joverlee521 · Sep 13, 2023 · Sep 12, 2023 · Sep 13, 2023 · Sep 13, 2023
diff --git a/.github/workflows/fetch-and-ingest.yaml b/.github/workflows/fetch-and-ingest.yaml
@@ -26,36 +26,66 @@ on:
 
   # Manually triggered using GitHub's UI
   workflow_dispatch:
+    inputs:
+      trial_name:
+        description: 'Short name for a trial run. WARNING: without this we will overwrite files in s3://nextstrain-data/files/workflows/rsv'
 
 jobs:
-  fetch-and-ingest:
+  set_config_overrides:
     runs-on: ubuntu-latest
-    env:
-      GITHUB_RUN_ID: ${{ github.run_id }}
     steps:
-    - uses: actions/checkout@v3
-    - uses: nextstrain/.github/actions/setup-nextstrain-cli@master
+      - id: s3_dst
+        run: |
+          S3_DST=s3://nextstrain-data/files/workflows/rsv
+
+          if [[ -n "$TRIAL_NAME" ]]; then
+            S3_DST+=/trial/"$TRIAL_NAME"
+          fi
+
+          echo "s3_dst=$S3_DST" >> "$GITHUB_OUTPUT"
+        env:
+          TRIAL_NAME: ${{ inputs.trial_name }}
+      # Sets trigger_rebuild to false when a trial name is given. `env` is scoped
+      # per step, so TRIAL_NAME must be mapped here too or the check never fires.
+      - id: trigger_rebuild
+        run: |
+          TRIGGER_REBUILD=true
 
-    - name: run_pipeline
+          if [[ -n "$TRIAL_NAME" ]]; then
+            TRIGGER_REBUILD=false
+          fi
+
+          echo "trigger_rebuild=$TRIGGER_REBUILD" >> "$GITHUB_OUTPUT"
+        env:
+          TRIAL_NAME: ${{ inputs.trial_name }}
+    outputs:
+      s3_dst: ${{ steps.s3_dst.outputs.s3_dst }}
+      trigger_rebuild: ${{ steps.trigger_rebuild.outputs.trigger_rebuild }}
+
+  fetch-and-ingest:
+    needs: [set_config_overrides]
+    permissions:
+      id-token: write
+    uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master
+    secrets: inherit
+    with:
+      runtime: aws-batch
       run: |
-        ./bin/write-envdir ingest/env.d \
-          AWS_DEFAULT_REGION \
-          GITHUB_RUN_ID \
-          PAT_GITHUB_DISPATCH
         nextstrain build \
           --aws-batch \
           --detach \
           --no-download \
           --cpus 8 \
           --memory 32gib \
-          --exec env \
+          --env AWS_ACCESS_KEY_ID \
+          --env AWS_SECRET_ACCESS_KEY \
+          --env PAT_GITHUB_DISPATCH="$GH_TOKEN_NEXTSTRAIN_BOT_WORKFLOW_DISPATCH" \
+          --env S3_DST \
+          --env TRIGGER_REBUILD \
           ingest \
-            envdir env.d snakemake \
-              --configfiles config/config.yaml config/optional.yaml \
-              --cores 8 \
-              --printshellcmds
-      env:
-        AWS_DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }}
-        AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-        AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-        PAT_GITHUB_DISPATCH: ${{ secrets.GH_TOKEN_NEXTSTRAIN_BOT_WORKFLOW_DISPATCH }}
+            --configfiles config/config.yaml config/optional.yaml \
+            --config s3_dst="$S3_DST" trigger_rebuild="$TRIGGER_REBUILD" \
+            --printshellcmds
+      env: |
+        S3_DST: ${{ needs.set_config_overrides.outputs.s3_dst }}
+        TRIGGER_REBUILD: ${{ needs.set_config_overrides.outputs.trigger_rebuild }}
diff --git a/.github/workflows/rebuild.yaml b/.github/workflows/rebuild.yaml
@@ -28,32 +28,23 @@ on:
 
 jobs:
   rebuild_rsv:
-    runs-on: ubuntu-latest
-    env:
-      GITHUB_RUN_ID: ${{ github.run_id }}
-    steps:
-    - uses: actions/checkout@v3
-    - uses: nextstrain/.github/actions/setup-nextstrain-cli@master
-
-    - name: launch_build
+    permissions:
+      id-token: write
+    uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master
+    secrets: inherit
+    with:
+      runtime: aws-batch
+      # Keep the trailing backslash after `.` so `deploy` and its options remain part of the `nextstrain build` command.
       run: |
-        ./bin/write-envdir env.d \
-          AWS_DEFAULT_REGION \
-          GITHUB_RUN_ID
-
         nextstrain build \
           --aws-batch \
           --detach \
           --no-download \
           --cpus 16 \
           --memory 64gib \
-          --exec env \
-          . \
-            envdir env.d snakemake deploy \
-              --configfiles config/configfile.yaml config/nextstrain_automation.yaml \
-              --cores 16 \
-              --printshellcmds
-      env:
-        AWS_DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }}
-        AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-        AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          --env AWS_ACCESS_KEY_ID \
+          --env AWS_SECRET_ACCESS_KEY \
+          . \
+          deploy \
+            --configfiles config/configfile.yaml config/nextstrain_automation.yaml \
+            --printshellcmds
diff --git a/bin/set-branch-ingest-config b/bin/set-branch-ingest-config
diff --git a/bin/write-envdir b/bin/write-envdir
diff --git a/ingest/Snakefile b/ingest/Snakefile
@@ -25,7 +25,7 @@ def _get_all_targets(wildcards):
             )
         elif len(remote_file_names) != len(set(remote_file_names)):
             print(f"Skipping file upload for {target!r} because there are duplicate remote file names.")
-        elif not params.get("dst"):
+        elif not config.get("s3_dst"):
             print(f"Skipping file upload for {target!r} because the destintion was not defined.")
         else:
             all_targets.extend(
@@ -37,10 +37,10 @@ def _get_all_targets(wildcards):
                 )
             )
     return all_targets
-    
+
 rule all:
     input: _get_all_targets
-        
+
 
 
 include: "workflow/snakemake_rules/fetch_sequences.smk"

diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml
@@ -1,11 +1,11 @@
 # Sources of sequences to include in the ingest run
 sources: ['genbank']
 conda_environment: "workflow/envs/nextstrain.yaml"
-fetch:
-  genbank_url:
-    a: "https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/?fq=%7B%21tag%3DSeqType_s%7DSeqType_s%3A%28%22Nucleotide%22%29&fq=VirusLineageId_ss%3A%28208893%29&q=%2A%3A%2A&cmd=download&dlfmt=csv&fl=genbank_accession%3Aid%2Cgenbank_accession_rev%3AAccVer_s%2Cdatabase%3ASourceDB_s%2Cstrain%3AIsolate_s%2Cregion%3ARegion_s%2Clocation%3ACountryFull_s%2Ccollected%3ACollectionDate_s%2Csubmitted%3ACreateDate_dt%2Clength%3ASLen_i%2Chost%3AHost_s%2Cisolation_source%3AIsolation_csv%2Cbioproject_accession%3ABioProject_s%2Cbiosample_accession%3ABioSample_s%2Csra_accession%3ASRALink_csv%2Ctitle%3ADefinition_s%2Cauthors%3AAuthors_csv%2Cpublications%3APubMed_csv%2Csequence%3ANucleotide_seq&sort=SourceDB_s+desc%2C+CollectionDate_s+asc%2C+id+asc&email=hello%40nextstrain.org"
-    b: "https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/?fq=%7B%21tag%3DSeqType_s%7DSeqType_s%3A%28%22Nucleotide%22%29&fq=VirusLineageId_ss%3A%28208895%29&q=%2A%3A%2A&cmd=download&dlfmt=csv&fl=genbank_accession%3Aid%2Cgenbank_accession_rev%3AAccVer_s%2Cdatabase%3ASourceDB_s%2Cstrain%3AIsolate_s%2Cregion%3ARegion_s%2Clocation%3ACountryFull_s%2Ccollected%3ACollectionDate_s%2Csubmitted%3ACreateDate_dt%2Clength%3ASLen_i%2Chost%3AHost_s%2Cisolation_source%3AIsolation_csv%2Cbioproject_accession%3ABioProject_s%2Cbiosample_accession%3ABioSample_s%2Csra_accession%3ASRALink_csv%2Ctitle%3ADefinition_s%2Cauthors%3AAuthors_csv%2Cpublications%3APubMed_csv%2Csequence%3ANucleotide_seq&sort=SourceDB_s+desc%2C+CollectionDate_s+asc%2C+id+asc&email=hello%40nextstrain.org"
-    general: "https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/?fq=%7B%21tag%3DSeqType_s%7DSeqType_s%3A%28%22Nucleotide%22%29&fq=VirusLineageId_ss%3A%2811250%29&q=%2A%3A%2A&cmd=download&dlfmt=csv&fl=genbank_accession%3Aid%2Cgenbank_accession_rev%3AAccVer_s%2Cdatabase%3ASourceDB_s%2Cstrain%3AIsolate_s%2Cregion%3ARegion_s%2Clocation%3ACountryFull_s%2Ccollected%3ACollectionDate_s%2Csubmitted%3ACreateDate_dt%2Clength%3ASLen_i%2Chost%3AHost_s%2Cisolation_source%3AIsolation_csv%2Cbioproject_accession%3ABioProject_s%2Cbiosample_accession%3ABioSample_s%2Csra_accession%3ASRALink_csv%2Ctitle%3ADefinition_s%2Cauthors%3AAuthors_csv%2Cpublications%3APubMed_csv%2Csequence%3ANucleotide_seq&sort=SourceDB_s+desc%2C+CollectionDate_s+asc%2C+id+asc&email=hello%40nextstrain.org"
+
+ncbi_taxon_id:
+  a: "208893"
+  b: "208895"
+  general: "11250"
 
 # Params for the transform rulegeneral
 transform:
@@ -72,23 +72,3 @@ transform:
     'authors',
     'institution'
   ]
-
-upload:
-  s3:
-    # AWS S3 Bucket with prefix
-    dst: 's3://nextstrain-data/files/workflows/rsv'
-    # Files to upload to S3 that are in the `data` directory
-    files_to_upload: [
-      'a/metadata.tsv',
-      'a/sequences.fasta',
-      'b/metadata.tsv',
-      'b/sequences.fasta'
-    ]
-    # Remote file names for the files to upload, must be in the same order as local files above
-    remote_file_names: [
-      'a/metadata.tsv.gz',
-      'a/sequences.fasta.xz',
-      'b/metadata.tsv.gz',
-      'b/sequences.fasta.xz'
-    ]
-    cloudfront_domain: 'data.nextstrain.org'
diff --git a/ingest/config/optional.yaml b/ingest/config/optional.yaml
@@ -1,10 +1,10 @@
 # Optional configs used by Nextstrain team
 # Params for uploads
+# AWS S3 Bucket with prefix
+s3_dst: 's3://nextstrain-data/files/workflows/rsv'
 upload:
   # Upload params for AWS S3
   s3:
-    # AWS S3 Bucket with prefix
-    dst: 's3://nextstrain-data/files/workflows/rsv'
     # Files to upload to S3 that are in the `data` directory
     files_to_upload: [
       'genbank.ndjson',

diff --git a/ingest/source-data/ncbi-dataset-field-map.tsv b/ingest/source-data/ncbi-dataset-field-map.tsv
@@ -0,0 +1,17 @@
+key	value
+Accession	genbank_accession_rev
+Source database	database
+Isolate Lineage	strain
+Geographic Region	region
+Geographic Location	location
+Isolate Collection date	collected
+Release date	submitted
+Update date	updated
+Length	length
+Host Name	host
+Isolate Lineage source	isolation_source
+BioProjects	bioproject_accession
+BioSample accession	biosample_accession
+SRA Accessions	sra_accession
+Submitter Names	authors
+Submitter Affiliation	submitting_organization