Skip to content

Commit

Permalink
Set segment/filename structure for ingest workflow
Browse files Browse the repository at this point in the history
Across the pipeline and upload steps, organize the files into a hierarchical structure:

* <segment>/metadata.tsv.zst
* <segment>/sequences.fasta.zst
  • Loading branch information
j23414 committed Jul 29, 2024
1 parent bd5b432 commit a3c9794
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 22 deletions.
10 changes: 5 additions & 5 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@ wildcard_constraints:
# TODO: Add link to centralized docs on standard Nextstrain metadata fields
rule all:
input:
- sequences=expand("results/sequences_{segment}.fasta", segment=segments),
- metadata=expand("results/metadata_{segment}.tsv", segment=segments),
- metadata_all="results/metadata_all.tsv",
+ sequences=expand("results/{segment}/sequences.fasta", segment=segments),
+ metadata=expand("results/{segment}/metadata.tsv", segment=segments),
+ metadata_all="results/all/metadata.tsv",


# Note that only PATHOGEN-level customizations should be added to these
Expand All @@ -40,10 +40,10 @@ rule create_final_metadata:
input:
metadata="data/subset_metadata.tsv"
output:
- metadata="results/metadata_all.tsv"
+ metadata="results/all/metadata.tsv"
shell:
"""
- mv {input.metadata} {output.metadata}
+ cp {input.metadata} {output.metadata}
"""

# Allow users to import custom rules provided via the config.
Expand Down
12 changes: 6 additions & 6 deletions ingest/build-configs/nextstrain-automation/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@ s3_dst: "s3://nextstrain-data/files/workflows/lassa"
# Mapping of files to upload
files_to_upload:
ncbi.ndjson.zst: data/ncbi.ndjson
- metadata_all.tsv.zst: results/metadata_all.tsv
- sequences_all.fasta.zst: results/sequences_all.fasta
- metadata_L.tsv.zst: results/metadata_L.tsv
- sequences_L.fasta.zst: results/sequences_L.fasta
- metadata_S.tsv.zst: results/metadata_S.tsv
- sequences_S.fasta.zst: results/sequences_S.fasta
+ all/metadata.tsv.zst: results/all/metadata.tsv
+ all/sequences.fasta.zst: results/all/sequences.fasta
+ L/metadata.tsv.zst: results/L/metadata.tsv
+ L/sequences.fasta.zst: results/L/sequences.fasta
+ S/metadata.tsv.zst: results/S/metadata.tsv
+ S/sequences.fasta.zst: results/S/sequences.fasta
6 changes: 3 additions & 3 deletions ingest/rules/curate.smk
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ rule curate:
annotations=config["curate"]["annotations"],
output:
metadata="data/all_metadata.tsv",
- sequences="results/sequences_all.fasta",
+ sequences="results/all/sequences.fasta",
log:
"logs/curate.txt",
benchmark:
Expand Down Expand Up @@ -121,11 +121,11 @@ rule subset_metadata:
input:
metadata="data/all_metadata.tsv",
output:
- subset_metadata="data/subset_metadata.tsv",
+ metadata="data/subset_metadata.tsv",
params:
metadata_fields=",".join(config["curate"]["metadata_columns"]),
shell:
"""
tsv-select -H -f {params.metadata_fields} \
- {input.metadata} > {output.subset_metadata}
+ {input.metadata} > {output.metadata}
"""
16 changes: 8 additions & 8 deletions ingest/rules/nextclade.smk
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@ and sequences to split the sequences into L and S segments.
REQUIRED INPUTS:
metadata = data/subset_metadata.tsv
- sequences = results/sequences_all.fasta
+ sequences = results/all/sequences.fasta
OUTPUTS:
- metadata = results/metadata_{segment}.tsv
- sequences = results/sequences_{segment}.fasta
+ metadata = results/{segment}/metadata.tsv
+ sequences = results/{segment}/sequences.fasta
See Nextclade docs for more details on usage, inputs, and outputs if you would
like to customize the rules:
Expand All @@ -20,10 +20,10 @@ https://docs.nextstrain.org/projects/nextclade/page/user/nextclade-cli.html
rule run_nextclade_to_identify_segment:
input:
metadata = "data/subset_metadata.tsv",
- sequences = "results/sequences_all.fasta",
+ sequences = "results/all/sequences.fasta",
segment_reference = config["nextclade"]["segment_reference"],
output:
- sequences = "results/sequences_{segment}.fasta",
+ sequences = "results/{segment}/sequences.fasta",
params:
min_seed_cover = config["nextclade"]["min_seed_cover"],
shell:
Expand All @@ -38,10 +38,10 @@ rule run_nextclade_to_identify_segment:

rule subset_metadata_by_segment:
input:
- metadata = "data/subset_metadata.tsv",
- sequences = "results/sequences_{segment}.fasta",
+ metadata = "results/all/metadata.tsv",
+ sequences = "results/{segment}/sequences.fasta",
output:
- metadata = "results/metadata_{segment}.tsv",
+ metadata = "results/{segment}/metadata.tsv",
params:
strain_id_field = config["curate"]["output_id_field"],
shell:
Expand Down

0 comments on commit a3c9794

Please sign in to comment.