From 9ca57883c03447bd602e87891505831628941a28 Mon Sep 17 00:00:00 2001 From: April Shen Date: Fri, 29 Sep 2023 11:54:27 +0100 Subject: [PATCH 1/4] update installation instructions and move nextflow pipelines to single directory --- README.md | 37 ++++++++++++++----- .../step1-fetch-clinvar-data.md | 2 +- docs/manual-curation/step3-export-results.md | 2 +- .../open-targets/generate-evidence-strings.md | 2 +- .../annotation_pipeline.nf | 21 ++++++----- .../export_curation_spreadsheet.nf | 5 ++- .../generate_curation_spreadsheet.nf | 7 ++-- pipelines/nextflow.config | 3 ++ setup.py | 2 +- tests/output_generation/test_pipeline.sh | 1 - 10 files changed, 52 insertions(+), 30 deletions(-) rename cmat/output_generation/pipeline.nf => pipelines/annotation_pipeline.nf (92%) rename cmat/trait_mapping/export.nf => pipelines/export_curation_spreadsheet.nf (97%) rename cmat/trait_mapping/generate.nf => pipelines/generate_curation_spreadsheet.nf (94%) create mode 100644 pipelines/nextflow.config diff --git a/README.md b/README.md index eb085d35..7e9e5864 100644 --- a/README.md +++ b/README.md @@ -10,26 +10,43 @@ For instructions on how to process ClinVar data for the Open Targets platform, s ## Install -The code requires Python 3.8+. You can install the library and its dependencies as follows (e.g. in a virtual environment): +The code requires Python 3.8+, and you will also need Nextflow 21.10+ to run the pipelines. Refer to [Nextflow documentation](https://www.nextflow.io/docs/latest/getstarted.html) for specifics on installing Nextflow on your system. +To install CMAT, first either clone the repository or download the latest released version from [here](https://github.com/EBIvariation/CMAT/releases): ```bash git clone git@github.com:EBIvariation/CMAT.git +# OR +wget -O CMAT.zip https://github.com/EBIvariation/CMAT/archive/refs/tags/v3.0.3.zip +unzip CMAT.zip +``` + +Then install the library and its dependencies as follows (e.g. 
in a virtual environment): +```bash cd CMAT pip install -r requirements.txt python setup.py install ``` -Running the pipelines also requires Nextflow 21.10+. Refer to [Nextflow documentation](https://www.nextflow.io/docs/latest/getstarted.html) for specifics on installing Nextflow on your system. +You then need to set the `PYTHON_BIN` variable in the [Nextflow config](pipelines/nextflow.config), which will allow the +Nextflow processes to access the correct Python executable. -Finally, the pipelines currently require that the following environment variables be set: +Finally, the instructions in this readme use the following environment variables as a convenience; they are not needed for the pipelines to run. ```bash -# Path to directory where this repo is cloned +# Path to directory where source code is downloaded export CODE_ROOT= -# Path to python executable (allows nextflow processes to access python) -export PYTHON_BIN= # Path to ontology mapping file (the provided path points to the version included in this repo) export LATEST_MAPPINGS=${CODE_ROOT}/mappings/latest_mappings.tsv -```` +``` + +To confirm everything is set up properly, you can run the annotation pipeline on the small dataset included with the tests. +It should take a couple of minutes to run and generate a file `annotated_clinvar.xml.gz` in the test directory. +```bash +mkdir testdir && cd testdir +nextflow run ${CODE_ROOT}/pipelines/annotation_pipeline.nf \ + --output_dir . \ + --clinvar ${CODE_ROOT}/tests/output_generation/resources/end2end/input.xml.gz \ + --mappings ${LATEST_MAPPINGS} +``` ## Run @@ -51,7 +68,7 @@ cd ${ANNOTATION_ROOT} mkdir -p gene_mapping logs # Run the nextflow pipeline, resuming execution of previous attempt if possible. 
-nextflow run ${CODE_ROOT}/cmat/output_generation/pipeline.nf \ +nextflow run ${CODE_ROOT}/pipelines/annotation_pipeline.nf \ --output_dir ${ANNOTATION_ROOT} \ --mappings ${LATEST_MAPPINGS} \ -resume @@ -81,7 +98,7 @@ mkdir -p ${CURATION_ROOT} cd ${CURATION_ROOT} # Run the nextflow pipeline, resuming execution of previous attempt if possible. -nextflow run ${CODE_ROOT}/cmat/trait_mapping/generate.nf \ +nextflow run ${CODE_ROOT}/pipelines/generate_curation_spreadsheet.nf \ --curation_root ${CURATION_ROOT} \ --mappings ${LATEST_MAPPINGS} \ --comments ${CURATOR_COMMENTS} \ @@ -108,7 +125,7 @@ Download the spreadsheet as a CSV file, making sure that all the data is visible cd ${CURATION_ROOT} # Run the nextflow pipeline, resuming execution of previous attempt if possible. -nextflow run ${CODE_ROOT}/cmat/trait_mapping/export.nf \ +nextflow run ${CODE_ROOT}/pipelines/export_curation_spreadsheet.nf \ --input_csv ${CURATION_ROOT}/finished_curation_spreadsheet.csv \ --curation_root ${CURATION_ROOT} \ --mappings ${LATEST_MAPPINGS} \ diff --git a/docs/manual-curation/step1-fetch-clinvar-data.md b/docs/manual-curation/step1-fetch-clinvar-data.md index 7c13f379..6a2de020 100644 --- a/docs/manual-curation/step1-fetch-clinvar-data.md +++ b/docs/manual-curation/step1-fetch-clinvar-data.md @@ -11,7 +11,7 @@ mkdir -p ${CURATION_RELEASE_ROOT} cd ${CURATION_RELEASE_ROOT} # Run the nextflow pipeline, resuming execution of previous attempt if possible. 
-nextflow run ${CODE_ROOT}/cmat/trait_mapping/generate.nf \ +nextflow run ${CODE_ROOT}/pipelines/generate_curation_spreadsheet.nf \ --curation_root ${CURATION_RELEASE_ROOT} \ -resume ``` diff --git a/docs/manual-curation/step3-export-results.md b/docs/manual-curation/step3-export-results.md index f5a5edd5..ecaa08ad 100644 --- a/docs/manual-curation/step3-export-results.md +++ b/docs/manual-curation/step3-export-results.md @@ -14,7 +14,7 @@ Once the manual curation is completed, download the spreadsheet as a CSV file, m cd ${CURATION_RELEASE_ROOT} # Run the nextflow pipeline, resuming execution of previous attempt if possible. -nextflow run ${CODE_ROOT}/cmat/trait_mapping/export.nf \ +nextflow run ${CODE_ROOT}/pipelines/export_curation_spreadsheet.nf \ --input_csv ${CURATION_RELEASE_ROOT}/finished_curation_spreadsheet.csv \ --curation_root ${CURATION_RELEASE_ROOT} \ --with_feedback \ diff --git a/docs/open-targets/generate-evidence-strings.md b/docs/open-targets/generate-evidence-strings.md index 43b34c79..1a235426 100644 --- a/docs/open-targets/generate-evidence-strings.md +++ b/docs/open-targets/generate-evidence-strings.md @@ -32,7 +32,7 @@ cd ${BATCH_ROOT} mkdir -p clinvar gene_mapping evidence_strings logs # Run the nextflow pipeline, resuming execution of previous attempt if possible. -nextflow run ${CODE_ROOT}/cmat/output_generation/pipeline.nf \ +nextflow run ${CODE_ROOT}/pipelines/annotation_pipeline.nf \ --output_dir ${BATCH_ROOT} \ --schema ${OT_SCHEMA_VERSION} \ -resume diff --git a/cmat/output_generation/pipeline.nf b/pipelines/annotation_pipeline.nf similarity index 92% rename from cmat/output_generation/pipeline.nf rename to pipelines/annotation_pipeline.nf index 4b82ed85..325b7db7 100644 --- a/cmat/output_generation/pipeline.nf +++ b/pipelines/annotation_pipeline.nf @@ -30,6 +30,7 @@ if (!params.output_dir) { exit 1, helpMessage() } batchRoot = params.output_dir +codeRoot = "${baseDir}/.." 
/* @@ -125,7 +126,7 @@ process runSnpIndel { script: """ - \${PYTHON_BIN} "\${CODE_ROOT}/bin/consequence_prediction/extract_variants_for_vep.py" --clinvar-xml ${clinvarXml} \ + \${PYTHON_BIN} "${codeRoot}/bin/consequence_prediction/extract_variants_for_vep.py" --clinvar-xml ${clinvarXml} \ | sort -u \ | parallel \ --halt now,fail=1 `# If any job fails, kill the remaining ones immediately and report failure` \ @@ -133,7 +134,7 @@ process runSnpIndel { -j 20 `# Number of concurrent workers` \ -N 200 `# Number of records (lines) per worker` \ --tmpdir . `# Store temporary files in the current directory to avoid /tmp overflow` \ - \${PYTHON_BIN} "\${CODE_ROOT}/cmat/consequence_prediction/snp_indel_variants/pipeline.py" \ + \${PYTHON_BIN} "${codeRoot}/cmat/consequence_prediction/snp_indel_variants/pipeline.py" \ | sort -u > consequences_snp.tsv """ } @@ -158,7 +159,7 @@ process runRepeat { script: """ - \${PYTHON_BIN} \${CODE_ROOT}/bin/consequence_prediction/run_repeat_expansion_variants.py \ + \${PYTHON_BIN} ${codeRoot}/bin/consequence_prediction/run_repeat_expansion_variants.py \ --clinvar-xml ${clinvarXml} \ --output-consequences consequences_repeat.tsv @@ -188,7 +189,7 @@ process runStructural { script: """ - \${PYTHON_BIN} \${CODE_ROOT}/bin/consequence_prediction/run_structural_variants.py \ + \${PYTHON_BIN} ${codeRoot}/bin/consequence_prediction/run_structural_variants.py \ --clinvar-xml ${clinvarXml} \ --output-consequences consequences_structural.tsv @@ -227,7 +228,7 @@ process mapGenes { script: """ - \${PYTHON_BIN} \${CODE_ROOT}/bin/evaluation/map_genes.py \ + \${PYTHON_BIN} ${codeRoot}/bin/evaluation/map_genes.py \ --clinvar-xml ${clinvarXml} \ --output-file output_gene_mappings.tsv """ @@ -245,7 +246,7 @@ process mapXrefs { script: """ - \${PYTHON_BIN} \${CODE_ROOT}/bin/evaluation/map_xrefs.py \ + \${PYTHON_BIN} ${codeRoot}/bin/evaluation/map_xrefs.py \ --clinvar-xml ${clinvarXml} \ --output-file output_xref_mappings.tsv """ @@ -260,7 +261,7 @@ process 
checkLatestMappings { script: """ - \${PYTHON_BIN} \${CODE_ROOT}/bin/evaluation/check_latest_mappings.py \ + \${PYTHON_BIN} ${codeRoot}/bin/evaluation/check_latest_mappings.py \ --latest-mappings ${params.mappings} \ --output-file output_eval_latest.tsv """ @@ -293,7 +294,7 @@ process generateAnnotatedXml { def evalXrefFlag = evalXrefMapping != file("empty2")? "--eval-xref-file ${evalXrefMapping}" : "" def evalLatestFlag = evalLatest != file("empty3")? "--eval-latest-file ${evalLatest}" : "" """ - \${PYTHON_BIN} \${CODE_ROOT}/bin/generate_annotated_xml.py \ + \${PYTHON_BIN} ${codeRoot}/bin/generate_annotated_xml.py \ --clinvar-xml ${clinvarXml} \ --efo-mapping ${params.mappings} \ --gene-mapping ${consequenceMappings} \ @@ -324,7 +325,7 @@ process generateEvidence { script: """ - \${PYTHON_BIN} \${CODE_ROOT}/bin/evidence_string_generation.py \ + \${PYTHON_BIN} ${codeRoot}/bin/evidence_string_generation.py \ --clinvar-xml ${clinvarXml} \ --efo-mapping ${params.mappings} \ --gene-mapping ${consequenceMappings} \ @@ -368,7 +369,7 @@ process convertXrefs { path "clinvar_xrefs.txt", emit: clinvarXrefs """ - \${PYTHON_BIN} \${CODE_ROOT}/bin/traits_to_zooma_format.py \ + \${PYTHON_BIN} ${codeRoot}/bin/traits_to_zooma_format.py \ --clinvar-xml ${clinvarXml} \ --zooma-feedback clinvar_xrefs.txt """ diff --git a/cmat/trait_mapping/export.nf b/pipelines/export_curation_spreadsheet.nf similarity index 97% rename from cmat/trait_mapping/export.nf rename to pipelines/export_curation_spreadsheet.nf index 80847622..f677eb07 100644 --- a/cmat/trait_mapping/export.nf +++ b/pipelines/export_curation_spreadsheet.nf @@ -28,6 +28,7 @@ if (!params.curation_root or !params.input_csv) { exit 1, helpMessage() } curationRoot = params.curation_root +codeRoot = "${baseDir}/.." 
/* @@ -63,7 +64,7 @@ process exportTable { script: """ - \${PYTHON_BIN} \${CODE_ROOT}/bin/trait_mapping/export_curation_table.py \ + \${PYTHON_BIN} ${codeRoot}/bin/trait_mapping/export_curation_table.py \ -i ${params.input_csv} \ -d finished_mappings_curation.tsv \ -m terms_for_efo_import.txt \ @@ -143,7 +144,7 @@ process createEfoTable { script: """ - \${PYTHON_BIN} \${CODE_ROOT}/bin/trait_mapping/create_efo_table.py \ + \${PYTHON_BIN} ${codeRoot}/bin/trait_mapping/create_efo_table.py \ -i ${importTerms} \ -o efo_import_table.tsv """ diff --git a/cmat/trait_mapping/generate.nf b/pipelines/generate_curation_spreadsheet.nf similarity index 94% rename from cmat/trait_mapping/generate.nf rename to pipelines/generate_curation_spreadsheet.nf index 6cdc373f..115543df 100644 --- a/cmat/trait_mapping/generate.nf +++ b/pipelines/generate_curation_spreadsheet.nf @@ -32,6 +32,7 @@ if (!params.curation_root) { exit 1, helpMessage() } curationRoot = params.curation_root +codeRoot = "${baseDir}/.." /* @@ -78,7 +79,7 @@ process parseTraits { script: """ - \${PYTHON_BIN} \${CODE_ROOT}/bin/trait_mapping/parse_traits.py \ + \${PYTHON_BIN} ${codeRoot}/bin/trait_mapping/parse_traits.py \ -i ${clinvarXml} \ -o parsed_traits.csv """ @@ -116,7 +117,7 @@ process processTraits { script: """ - \${PYTHON_BIN} \${CODE_ROOT}/bin/trait_mapping/process_traits.py \ + \${PYTHON_BIN} ${codeRoot}/bin/trait_mapping/process_traits.py \ -i ${traitChunk} \ -o automated_traits_${traitChunk}.tsv \ -c curation_traits_${traitChunk}.tsv @@ -178,7 +179,7 @@ process createCurationTable { script: """ - \${PYTHON_BIN} \${CODE_ROOT}/bin/trait_mapping/create_table_for_manual_curation.py \ + \${PYTHON_BIN} ${codeRoot}/bin/trait_mapping/create_table_for_manual_curation.py \ --traits-for-curation ${curationTraits} \ --previous-mappings ${params.mappings} \ --previous-comments ${params.comments} \ diff --git a/pipelines/nextflow.config b/pipelines/nextflow.config new file mode 100644 index 00000000..66384b73 --- 
/dev/null +++ b/pipelines/nextflow.config @@ -0,0 +1,3 @@ +env { + PYTHON_BIN = 'python' +} \ No newline at end of file diff --git a/setup.py b/setup.py index 946d27a2..58b5e6ce 100644 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ def get_requires(): long_description = fh.read() setup(name='cmat', - version='3.0.3', + version='3.0.4.dev0', author_email='opentargets-clinvar@ebi.ac.uk', url='https://github.com/EBIvariation/eva-opentargets', packages=find_packages(), diff --git a/tests/output_generation/test_pipeline.sh b/tests/output_generation/test_pipeline.sh index 5cb318d0..ef7242ff 100644 --- a/tests/output_generation/test_pipeline.sh +++ b/tests/output_generation/test_pipeline.sh @@ -6,7 +6,6 @@ export LC_COLLATE=C SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" export CODE_ROOT="$(dirname $(dirname "${SCRIPT_DIR}"))" -export PYTHON_BIN=python export BATCH_ROOT_BASE=${SCRIPT_DIR}/resources/end2end CWD=${PWD} From 3146ce0e6d9a7bc9ea02ed60d15238fad5afae1c Mon Sep 17 00:00:00 2001 From: April Shen Date: Fri, 29 Sep 2023 12:02:30 +0100 Subject: [PATCH 2/4] fix tests --- tests/output_generation/evaluation/test_ols_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/output_generation/evaluation/test_ols_utils.py b/tests/output_generation/evaluation/test_ols_utils.py index 20e7cb9c..93e2759a 100644 --- a/tests/output_generation/evaluation/test_ols_utils.py +++ b/tests/output_generation/evaluation/test_ols_utils.py @@ -2,12 +2,12 @@ def test_fetch_eval_data(): - expected = ('MONDO:0004975', False, {'MONDO:0004975', 'Orphanet:238616'}) + expected = ('MONDO:0004975', False, {'MONDO:0004975'}) assert fetch_eval_data(db_iden=('MONDO', 'MONDO:0004975')) == expected assert fetch_eval_data(uri='http://purl.obolibrary.org/obo/MONDO_0004975') == expected def test_fetch_eval_data_include_neighbors(): - expected = ('MONDO:0004975', False, {'MONDO:0004975', 'Orphanet:238616'}, + expected = ('MONDO:0004975', False, 
{'MONDO:0004975'}, {'EFO:0005815', 'MONDO:0001627'}, {'MONDO:0100087', 'EFO:1001870'}) assert fetch_eval_data(db_iden=('MONDO', 'MONDO:0004975'), include_neighbors=True) == expected From bdb50ba325ed4786457b17213a8c06d54a5d37b0 Mon Sep 17 00:00:00 2001 From: April Shen Date: Fri, 29 Sep 2023 12:36:42 +0100 Subject: [PATCH 3/4] fix e2e test --- tests/output_generation/test_pipeline.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/output_generation/test_pipeline.sh b/tests/output_generation/test_pipeline.sh index ef7242ff..f1fa8120 100644 --- a/tests/output_generation/test_pipeline.sh +++ b/tests/output_generation/test_pipeline.sh @@ -13,7 +13,7 @@ BATCH_ROOT=${BATCH_ROOT_BASE}/test_batch mkdir -p ${BATCH_ROOT} cd ${BATCH_ROOT} -nextflow run ${CODE_ROOT}/cmat/output_generation/pipeline.nf \ +nextflow run ${CODE_ROOT}/pipelines/annotation_pipeline.nf \ --output_dir ${BATCH_ROOT} \ --schema $(cat "${CODE_ROOT}/OT_SCHEMA_VERSION") \ --clinvar ${BATCH_ROOT_BASE}/input.xml.gz \ From a891e6f22d423a48eda01cc0cef5a8ba5d9bc0cb Mon Sep 17 00:00:00 2001 From: April Shen Date: Mon, 2 Oct 2023 12:48:46 +0100 Subject: [PATCH 4/4] replace baseDir with projectDir --- pipelines/annotation_pipeline.nf | 2 +- pipelines/export_curation_spreadsheet.nf | 2 +- pipelines/generate_curation_spreadsheet.nf | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pipelines/annotation_pipeline.nf b/pipelines/annotation_pipeline.nf index 325b7db7..60d38f85 100644 --- a/pipelines/annotation_pipeline.nf +++ b/pipelines/annotation_pipeline.nf @@ -30,7 +30,7 @@ if (!params.output_dir) { exit 1, helpMessage() } batchRoot = params.output_dir -codeRoot = "${baseDir}/.." +codeRoot = "${projectDir}/.." 
/* diff --git a/pipelines/export_curation_spreadsheet.nf b/pipelines/export_curation_spreadsheet.nf index f677eb07..2082411c 100644 --- a/pipelines/export_curation_spreadsheet.nf +++ b/pipelines/export_curation_spreadsheet.nf @@ -28,7 +28,7 @@ if (!params.curation_root or !params.input_csv) { exit 1, helpMessage() } curationRoot = params.curation_root -codeRoot = "${baseDir}/.." +codeRoot = "${projectDir}/.." /* diff --git a/pipelines/generate_curation_spreadsheet.nf b/pipelines/generate_curation_spreadsheet.nf index 115543df..1b24b5ec 100644 --- a/pipelines/generate_curation_spreadsheet.nf +++ b/pipelines/generate_curation_spreadsheet.nf @@ -32,7 +32,7 @@ if (!params.curation_root) { exit 1, helpMessage() } curationRoot = params.curation_root -codeRoot = "${baseDir}/.." +codeRoot = "${projectDir}/.." /*