diff --git a/ingest/Snakefile b/ingest/Snakefile index 6411df9..f1d2404 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -22,9 +22,9 @@ wildcard_constraints: # TODO: Add link to centralized docs on standard Nextstrain metadata fields rule all: input: - sequences=expand("results/sequences_{segment}.fasta", segment=segments), - metadata=expand("results/metadata_{segment}.tsv", segment=segments), - metadata_all="results/metadata_all.tsv", + sequences=expand("results/{segment}/sequences.fasta", segment=segments), + metadata=expand("results/{segment}/metadata.tsv", segment=segments), + metadata_all="results/all/metadata.tsv", # Note that only PATHOGEN-level customizations should be added to these @@ -40,10 +40,10 @@ rule create_final_metadata: input: metadata="data/subset_metadata.tsv" output: - metadata="results/metadata_all.tsv" + metadata="results/all/metadata.tsv" shell: """ - mv {input.metadata} {output.metadata} + cp {input.metadata} {output.metadata} """ # Allow users to import custom rules provided via the config. diff --git a/ingest/build-configs/nextstrain-automation/config.yaml b/ingest/build-configs/nextstrain-automation/config.yaml index 5c26ed0..0ebdc26 100644 --- a/ingest/build-configs/nextstrain-automation/config.yaml +++ b/ingest/build-configs/nextstrain-automation/config.yaml @@ -17,9 +17,9 @@ s3_dst: "s3://nextstrain-data/files/workflows/lassa" # Mapping of files to upload files_to_upload: ncbi.ndjson.zst: data/ncbi.ndjson - metadata_all.tsv.zst: results/metadata_all.tsv - sequences_all.fasta.zst: results/sequences_all.fasta - metadata_L.tsv.zst: results/metadata_L.tsv - sequences_L.fasta.zst: results/sequences_L.fasta - metadata_S.tsv.zst: results/metadata_S.tsv - sequences_S.fasta.zst: results/sequences_S.fasta + all/metadata.tsv.zst: results/all/metadata.tsv + all/sequences.fasta.zst: results/all/sequences.fasta + L/metadata.tsv.zst: results/L/metadata.tsv + L/sequences.fasta.zst: results/L/sequences.fasta + S/metadata.tsv.zst: results/S/metadata.tsv + S/sequences.fasta.zst: results/S/sequences.fasta diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk index cf9fa92..29f2520 100644 --- a/ingest/rules/curate.smk +++ b/ingest/rules/curate.smk @@ -62,7 +62,7 @@ rule curate: annotations=config["curate"]["annotations"], output: metadata="data/all_metadata.tsv", - sequences="results/sequences_all.fasta", + sequences="results/all/sequences.fasta", log: "logs/curate.txt", benchmark: @@ -121,11 +121,11 @@ rule subset_metadata: input: metadata="data/all_metadata.tsv", output: - subset_metadata="data/subset_metadata.tsv", + metadata="data/subset_metadata.tsv", params: metadata_fields=",".join(config["curate"]["metadata_columns"]), shell: """ tsv-select -H -f {params.metadata_fields} \ - {input.metadata} > {output.subset_metadata} + {input.metadata} > {output.metadata} """ diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk index d6595e5..fb044a2 100644 --- a/ingest/rules/nextclade.smk +++ b/ingest/rules/nextclade.smk @@ -5,12 +5,12 @@ and sequences to split the sequences into L and S segments. REQUIRED INPUTS: metadata = data/subset_metadata.tsv - sequences = results/sequences_all.fasta + sequences = results/all/sequences.fasta OUTPUTS: - metadata = results/metadata_{segment}.tsv - sequences = results/sequences_{segment}.fasta + metadata = results/{segment}/metadata.tsv + sequences = results/{segment}/sequences.fasta See Nextclade docs for more details on usage, inputs, and outputs if you would like to customize the rules: @@ -20,10 +20,10 @@ https://docs.nextstrain.org/projects/nextclade/page/user/nextclade-cli.html rule run_nextclade_to_identify_segment: input: metadata = "data/subset_metadata.tsv", - sequences = "results/sequences_all.fasta", + sequences = "results/all/sequences.fasta", segment_reference = config["nextclade"]["segment_reference"], output: - sequences = "results/sequences_{segment}.fasta", + sequences = "results/{segment}/sequences.fasta", params: min_seed_cover = config["nextclade"]["min_seed_cover"], shell: @@ -38,10 +38,10 @@ rule run_nextclade_to_identify_segment: rule subset_metadata_by_segment: input: - metadata = "data/subset_metadata.tsv", - sequences = "results/sequences_{segment}.fasta", + metadata = "results/all/metadata.tsv", + sequences = "results/{segment}/sequences.fasta", output: - metadata = "results/metadata_{segment}.tsv", + metadata = "results/{segment}/metadata.tsv", params: strain_id_field = config["curate"]["output_id_field"], shell: