From 9ca57883c03447bd602e87891505831628941a28 Mon Sep 17 00:00:00 2001
From: April Shen <ashen@ebi.ac.uk>
Date: Fri, 29 Sep 2023 11:54:27 +0100
Subject: [PATCH 1/4] update installation instructions and move nextflow
 pipelines to single directory

---
 README.md                                     | 37 ++++++++++++++-----
 .../step1-fetch-clinvar-data.md               |  2 +-
 docs/manual-curation/step3-export-results.md  |  2 +-
 .../open-targets/generate-evidence-strings.md |  2 +-
 .../annotation_pipeline.nf                    | 21 ++++++-----
 .../export_curation_spreadsheet.nf            |  5 ++-
 .../generate_curation_spreadsheet.nf          |  7 ++--
 pipelines/nextflow.config                     |  3 ++
 setup.py                                      |  2 +-
 tests/output_generation/test_pipeline.sh      |  1 -
 10 files changed, 52 insertions(+), 30 deletions(-)
 rename cmat/output_generation/pipeline.nf => pipelines/annotation_pipeline.nf (92%)
 rename cmat/trait_mapping/export.nf => pipelines/export_curation_spreadsheet.nf (97%)
 rename cmat/trait_mapping/generate.nf => pipelines/generate_curation_spreadsheet.nf (94%)
 create mode 100644 pipelines/nextflow.config

diff --git a/README.md b/README.md
index eb085d35..7e9e5864 100644
--- a/README.md
+++ b/README.md
@@ -10,26 +10,43 @@ For instructions on how to process ClinVar data for the Open Targets platform, s
 
 ## Install
 
-The code requires Python 3.8+. You can install the library and its dependencies as follows (e.g. in a virtual environment):
+The code requires Python 3.8+, and you will also need Nextflow 21.10+ to run the pipelines. Refer to [Nextflow documentation](https://www.nextflow.io/docs/latest/getstarted.html) for specifics on installing Nextflow on your system.
 
+To install CMAT, first either clone the repository or download the latest released version from [here](https://github.com/EBIvariation/CMAT/releases):
 ```bash
 git clone git@github.com:EBIvariation/CMAT.git
+# OR (replace v3.0.3 below with the tag of the latest release)
+wget -O CMAT.zip https://github.com/EBIvariation/CMAT/archive/refs/tags/v3.0.3.zip
+unzip CMAT.zip
+```
+
+Then install the library and its dependencies as follows (e.g. in a virtual environment):
+```bash
 cd CMAT
 pip install -r requirements.txt
 python setup.py install
 ```
 
-Running the pipelines also requires Nextflow 21.10+. Refer to [Nextflow documentation](https://www.nextflow.io/docs/latest/getstarted.html) for specifics on installing Nextflow on your system.
+You then need to set the `PYTHON_BIN` variable in the [Nextflow config](pipelines/nextflow.config), which will allow the
+Nextflow processes to access the correct Python executable.
 
-Finally, the pipelines currently require that the following environment variables be set:
+Finally, the instructions in this readme use the following environment variables as a convenience; they are not needed for the pipelines to run.
 ```bash
-# Path to directory where this repo is cloned
+# Path to directory where source code is downloaded
 export CODE_ROOT=
-# Path to python executable (allows nextflow processes to access python)
-export PYTHON_BIN=
 # Path to ontology mapping file (the provided path points to the version included in this repo)
 export LATEST_MAPPINGS=${CODE_ROOT}/mappings/latest_mappings.tsv
-````
+```
+
+To confirm everything is set up properly, you can run the annotation pipeline on the small dataset included with the tests.
+It should take a couple of minutes to run and generate a file `annotated_clinvar.xml.gz` in the test directory.
+```bash
+mkdir testdir && cd testdir
+nextflow run ${CODE_ROOT}/pipelines/annotation_pipeline.nf \
+  --output_dir . \
+  --clinvar ${CODE_ROOT}/tests/output_generation/resources/end2end/input.xml.gz \
+  --mappings ${LATEST_MAPPINGS}
+```
 
 ## Run
 
@@ -51,7 +68,7 @@ cd ${ANNOTATION_ROOT}
 mkdir -p gene_mapping logs
 
 # Run the nextflow pipeline, resuming execution of previous attempt if possible.
-nextflow run ${CODE_ROOT}/cmat/output_generation/pipeline.nf \
+nextflow run ${CODE_ROOT}/pipelines/annotation_pipeline.nf \
   --output_dir ${ANNOTATION_ROOT} \
   --mappings ${LATEST_MAPPINGS} \
   -resume
@@ -81,7 +98,7 @@ mkdir -p ${CURATION_ROOT}
 cd ${CURATION_ROOT}
 
 # Run the nextflow pipeline, resuming execution of previous attempt if possible.
-nextflow run ${CODE_ROOT}/cmat/trait_mapping/generate.nf \
+nextflow run ${CODE_ROOT}/pipelines/generate_curation_spreadsheet.nf \
   --curation_root ${CURATION_ROOT} \
   --mappings ${LATEST_MAPPINGS} \
   --comments ${CURATOR_COMMENTS} \
@@ -108,7 +125,7 @@ Download the spreadsheet as a CSV file, making sure that all the data is visible
 cd ${CURATION_ROOT}
 
 # Run the nextflow pipeline, resuming execution of previous attempt if possible.
-nextflow run ${CODE_ROOT}/cmat/trait_mapping/export.nf \
+nextflow run ${CODE_ROOT}/pipelines/export_curation_spreadsheet.nf \
   --input_csv ${CURATION_ROOT}/finished_curation_spreadsheet.csv \
   --curation_root ${CURATION_ROOT} \
   --mappings ${LATEST_MAPPINGS} \
diff --git a/docs/manual-curation/step1-fetch-clinvar-data.md b/docs/manual-curation/step1-fetch-clinvar-data.md
index 7c13f379..6a2de020 100644
--- a/docs/manual-curation/step1-fetch-clinvar-data.md
+++ b/docs/manual-curation/step1-fetch-clinvar-data.md
@@ -11,7 +11,7 @@ mkdir -p ${CURATION_RELEASE_ROOT}
 cd ${CURATION_RELEASE_ROOT}
 
 # Run the nextflow pipeline, resuming execution of previous attempt if possible.
-nextflow run ${CODE_ROOT}/cmat/trait_mapping/generate.nf \
+nextflow run ${CODE_ROOT}/pipelines/generate_curation_spreadsheet.nf \
   --curation_root ${CURATION_RELEASE_ROOT} \
   -resume
 ```
diff --git a/docs/manual-curation/step3-export-results.md b/docs/manual-curation/step3-export-results.md
index f5a5edd5..ecaa08ad 100644
--- a/docs/manual-curation/step3-export-results.md
+++ b/docs/manual-curation/step3-export-results.md
@@ -14,7 +14,7 @@ Once the manual curation is completed, download the spreadsheet as a CSV file, m
 cd ${CURATION_RELEASE_ROOT}
 
 # Run the nextflow pipeline, resuming execution of previous attempt if possible.
-nextflow run ${CODE_ROOT}/cmat/trait_mapping/export.nf \
+nextflow run ${CODE_ROOT}/pipelines/export_curation_spreadsheet.nf \
   --input_csv ${CURATION_RELEASE_ROOT}/finished_curation_spreadsheet.csv \
   --curation_root ${CURATION_RELEASE_ROOT} \
   --with_feedback \
diff --git a/docs/open-targets/generate-evidence-strings.md b/docs/open-targets/generate-evidence-strings.md
index 43b34c79..1a235426 100644
--- a/docs/open-targets/generate-evidence-strings.md
+++ b/docs/open-targets/generate-evidence-strings.md
@@ -32,7 +32,7 @@ cd ${BATCH_ROOT}
 mkdir -p clinvar gene_mapping evidence_strings logs
 
 # Run the nextflow pipeline, resuming execution of previous attempt if possible.
-nextflow run ${CODE_ROOT}/cmat/output_generation/pipeline.nf \
+nextflow run ${CODE_ROOT}/pipelines/annotation_pipeline.nf \
   --output_dir ${BATCH_ROOT} \
   --schema ${OT_SCHEMA_VERSION} \
   -resume
diff --git a/cmat/output_generation/pipeline.nf b/pipelines/annotation_pipeline.nf
similarity index 92%
rename from cmat/output_generation/pipeline.nf
rename to pipelines/annotation_pipeline.nf
index 4b82ed85..325b7db7 100644
--- a/cmat/output_generation/pipeline.nf
+++ b/pipelines/annotation_pipeline.nf
@@ -30,6 +30,7 @@ if (!params.output_dir) {
     exit 1, helpMessage()
 }
 batchRoot = params.output_dir
+codeRoot = "${baseDir}/.."
 
 
 /*
@@ -125,7 +126,7 @@ process runSnpIndel {
 
     script:
     """
-    \${PYTHON_BIN} "\${CODE_ROOT}/bin/consequence_prediction/extract_variants_for_vep.py" --clinvar-xml ${clinvarXml} \
+    \${PYTHON_BIN} "${codeRoot}/bin/consequence_prediction/extract_variants_for_vep.py" --clinvar-xml ${clinvarXml} \
     | sort -u \
     | parallel \
         --halt now,fail=1    `# If any job fails, kill the remaining ones immediately and report failure` \
@@ -133,7 +134,7 @@ process runSnpIndel {
         -j 20                `# Number of concurrent workers`                                             \
         -N 200               `# Number of records (lines) per worker`                                     \
         --tmpdir .           `# Store temporary files in the current directory to avoid /tmp overflow`    \
-        \${PYTHON_BIN} "\${CODE_ROOT}/cmat/consequence_prediction/snp_indel_variants/pipeline.py" \
+        \${PYTHON_BIN} "${codeRoot}/cmat/consequence_prediction/snp_indel_variants/pipeline.py" \
     | sort -u > consequences_snp.tsv
     """
 }
@@ -158,7 +159,7 @@ process runRepeat {
 
    script:
    """
-   \${PYTHON_BIN} \${CODE_ROOT}/bin/consequence_prediction/run_repeat_expansion_variants.py \
+   \${PYTHON_BIN} ${codeRoot}/bin/consequence_prediction/run_repeat_expansion_variants.py \
         --clinvar-xml ${clinvarXml} \
         --output-consequences consequences_repeat.tsv
 
@@ -188,7 +189,7 @@ process runStructural {
 
    script:
    """
-   \${PYTHON_BIN} \${CODE_ROOT}/bin/consequence_prediction/run_structural_variants.py \
+   \${PYTHON_BIN} ${codeRoot}/bin/consequence_prediction/run_structural_variants.py \
         --clinvar-xml ${clinvarXml} \
         --output-consequences consequences_structural.tsv
 
@@ -227,7 +228,7 @@ process mapGenes {
 
     script:
     """
-    \${PYTHON_BIN} \${CODE_ROOT}/bin/evaluation/map_genes.py \
+    \${PYTHON_BIN} ${codeRoot}/bin/evaluation/map_genes.py \
         --clinvar-xml ${clinvarXml} \
         --output-file output_gene_mappings.tsv
     """
@@ -245,7 +246,7 @@ process mapXrefs {
 
     script:
     """
-    \${PYTHON_BIN} \${CODE_ROOT}/bin/evaluation/map_xrefs.py \
+    \${PYTHON_BIN} ${codeRoot}/bin/evaluation/map_xrefs.py \
         --clinvar-xml ${clinvarXml} \
         --output-file output_xref_mappings.tsv
     """
@@ -260,7 +261,7 @@ process checkLatestMappings {
 
     script:
     """
-    \${PYTHON_BIN} \${CODE_ROOT}/bin/evaluation/check_latest_mappings.py \
+    \${PYTHON_BIN} ${codeRoot}/bin/evaluation/check_latest_mappings.py \
         --latest-mappings ${params.mappings} \
         --output-file output_eval_latest.tsv
     """
@@ -293,7 +294,7 @@ process generateAnnotatedXml {
     def evalXrefFlag = evalXrefMapping != file("empty2")? "--eval-xref-file ${evalXrefMapping}" : ""
     def evalLatestFlag = evalLatest != file("empty3")? "--eval-latest-file ${evalLatest}" : ""
     """
-    \${PYTHON_BIN} \${CODE_ROOT}/bin/generate_annotated_xml.py \
+    \${PYTHON_BIN} ${codeRoot}/bin/generate_annotated_xml.py \
         --clinvar-xml ${clinvarXml} \
         --efo-mapping ${params.mappings} \
         --gene-mapping ${consequenceMappings} \
@@ -324,7 +325,7 @@ process generateEvidence {
 
     script:
     """
-    \${PYTHON_BIN} \${CODE_ROOT}/bin/evidence_string_generation.py \
+    \${PYTHON_BIN} ${codeRoot}/bin/evidence_string_generation.py \
         --clinvar-xml ${clinvarXml} \
         --efo-mapping ${params.mappings} \
         --gene-mapping ${consequenceMappings} \
@@ -368,7 +369,7 @@ process convertXrefs {
     path "clinvar_xrefs.txt", emit: clinvarXrefs
 
     """
-    \${PYTHON_BIN} \${CODE_ROOT}/bin/traits_to_zooma_format.py \
+    \${PYTHON_BIN} ${codeRoot}/bin/traits_to_zooma_format.py \
         --clinvar-xml ${clinvarXml} \
         --zooma-feedback clinvar_xrefs.txt
     """
diff --git a/cmat/trait_mapping/export.nf b/pipelines/export_curation_spreadsheet.nf
similarity index 97%
rename from cmat/trait_mapping/export.nf
rename to pipelines/export_curation_spreadsheet.nf
index 80847622..f677eb07 100644
--- a/cmat/trait_mapping/export.nf
+++ b/pipelines/export_curation_spreadsheet.nf
@@ -28,6 +28,7 @@ if (!params.curation_root or !params.input_csv) {
     exit 1, helpMessage()
 }
 curationRoot = params.curation_root
+codeRoot = "${baseDir}/.."
 
 
 /*
@@ -63,7 +64,7 @@ process exportTable {
 
     script:
     """
-    \${PYTHON_BIN} \${CODE_ROOT}/bin/trait_mapping/export_curation_table.py \
+    \${PYTHON_BIN} ${codeRoot}/bin/trait_mapping/export_curation_table.py \
         -i ${params.input_csv} \
         -d finished_mappings_curation.tsv \
         -m terms_for_efo_import.txt \
@@ -143,7 +144,7 @@ process createEfoTable {
 
     script:
     """
-    \${PYTHON_BIN} \${CODE_ROOT}/bin/trait_mapping/create_efo_table.py \
+    \${PYTHON_BIN} ${codeRoot}/bin/trait_mapping/create_efo_table.py \
         -i ${importTerms} \
         -o efo_import_table.tsv
     """
diff --git a/cmat/trait_mapping/generate.nf b/pipelines/generate_curation_spreadsheet.nf
similarity index 94%
rename from cmat/trait_mapping/generate.nf
rename to pipelines/generate_curation_spreadsheet.nf
index 6cdc373f..115543df 100644
--- a/cmat/trait_mapping/generate.nf
+++ b/pipelines/generate_curation_spreadsheet.nf
@@ -32,6 +32,7 @@ if (!params.curation_root) {
     exit 1, helpMessage()
 }
 curationRoot = params.curation_root
+codeRoot = "${baseDir}/.."
 
 
 /*
@@ -78,7 +79,7 @@ process parseTraits {
 
     script:
     """
-    \${PYTHON_BIN} \${CODE_ROOT}/bin/trait_mapping/parse_traits.py \
+    \${PYTHON_BIN} ${codeRoot}/bin/trait_mapping/parse_traits.py \
         -i ${clinvarXml} \
         -o parsed_traits.csv
     """
@@ -116,7 +117,7 @@ process processTraits {
 
     script:
     """
-    \${PYTHON_BIN} \${CODE_ROOT}/bin/trait_mapping/process_traits.py \
+    \${PYTHON_BIN} ${codeRoot}/bin/trait_mapping/process_traits.py \
         -i ${traitChunk} \
         -o automated_traits_${traitChunk}.tsv \
         -c curation_traits_${traitChunk}.tsv
@@ -178,7 +179,7 @@ process createCurationTable {
 
     script:
     """
-    \${PYTHON_BIN} \${CODE_ROOT}/bin/trait_mapping/create_table_for_manual_curation.py \
+    \${PYTHON_BIN} ${codeRoot}/bin/trait_mapping/create_table_for_manual_curation.py \
         --traits-for-curation ${curationTraits} \
         --previous-mappings ${params.mappings} \
         --previous-comments ${params.comments} \
diff --git a/pipelines/nextflow.config b/pipelines/nextflow.config
new file mode 100644
index 00000000..66384b73
--- /dev/null
+++ b/pipelines/nextflow.config
@@ -0,0 +1,3 @@
+env {
+    PYTHON_BIN = 'python'
+}
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 946d27a2..58b5e6ce 100644
--- a/setup.py
+++ b/setup.py
@@ -27,7 +27,7 @@ def get_requires():
     long_description = fh.read()
 
 setup(name='cmat',
-      version='3.0.3',
+      version='3.0.4.dev0',
       author_email='opentargets-clinvar@ebi.ac.uk',
       url='https://github.com/EBIvariation/eva-opentargets',
       packages=find_packages(),
diff --git a/tests/output_generation/test_pipeline.sh b/tests/output_generation/test_pipeline.sh
index 5cb318d0..ef7242ff 100644
--- a/tests/output_generation/test_pipeline.sh
+++ b/tests/output_generation/test_pipeline.sh
@@ -6,7 +6,6 @@ export LC_COLLATE=C
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 export CODE_ROOT="$(dirname $(dirname "${SCRIPT_DIR}"))"
 
-export PYTHON_BIN=python
 export BATCH_ROOT_BASE=${SCRIPT_DIR}/resources/end2end
 
 CWD=${PWD}

From 3146ce0e6d9a7bc9ea02ed60d15238fad5afae1c Mon Sep 17 00:00:00 2001
From: April Shen <ashen@ebi.ac.uk>
Date: Fri, 29 Sep 2023 12:02:30 +0100
Subject: [PATCH 2/4] fix tests

---
 tests/output_generation/evaluation/test_ols_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/output_generation/evaluation/test_ols_utils.py b/tests/output_generation/evaluation/test_ols_utils.py
index 20e7cb9c..93e2759a 100644
--- a/tests/output_generation/evaluation/test_ols_utils.py
+++ b/tests/output_generation/evaluation/test_ols_utils.py
@@ -2,12 +2,12 @@
 
 
 def test_fetch_eval_data():
-    expected = ('MONDO:0004975', False, {'MONDO:0004975', 'Orphanet:238616'})
+    expected = ('MONDO:0004975', False, {'MONDO:0004975'})
     assert fetch_eval_data(db_iden=('MONDO', 'MONDO:0004975')) == expected
     assert fetch_eval_data(uri='http://purl.obolibrary.org/obo/MONDO_0004975') == expected
 
 
 def test_fetch_eval_data_include_neighbors():
-    expected = ('MONDO:0004975', False, {'MONDO:0004975', 'Orphanet:238616'},
+    expected = ('MONDO:0004975', False, {'MONDO:0004975'},
                 {'EFO:0005815', 'MONDO:0001627'}, {'MONDO:0100087', 'EFO:1001870'})
     assert fetch_eval_data(db_iden=('MONDO', 'MONDO:0004975'), include_neighbors=True) == expected

From bdb50ba325ed4786457b17213a8c06d54a5d37b0 Mon Sep 17 00:00:00 2001
From: April Shen <ashen@ebi.ac.uk>
Date: Fri, 29 Sep 2023 12:36:42 +0100
Subject: [PATCH 3/4] fix e2e test

---
 tests/output_generation/test_pipeline.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/output_generation/test_pipeline.sh b/tests/output_generation/test_pipeline.sh
index ef7242ff..f1fa8120 100644
--- a/tests/output_generation/test_pipeline.sh
+++ b/tests/output_generation/test_pipeline.sh
@@ -13,7 +13,7 @@ BATCH_ROOT=${BATCH_ROOT_BASE}/test_batch
 mkdir -p ${BATCH_ROOT}
 cd ${BATCH_ROOT}
 
-nextflow run ${CODE_ROOT}/cmat/output_generation/pipeline.nf \
+nextflow run ${CODE_ROOT}/pipelines/annotation_pipeline.nf \
   --output_dir ${BATCH_ROOT} \
   --schema $(cat "${CODE_ROOT}/OT_SCHEMA_VERSION") \
   --clinvar ${BATCH_ROOT_BASE}/input.xml.gz \

From a891e6f22d423a48eda01cc0cef5a8ba5d9bc0cb Mon Sep 17 00:00:00 2001
From: April Shen <ashen@ebi.ac.uk>
Date: Mon, 2 Oct 2023 12:48:46 +0100
Subject: [PATCH 4/4] replace baseDir with projectDir

---
 pipelines/annotation_pipeline.nf           | 2 +-
 pipelines/export_curation_spreadsheet.nf   | 2 +-
 pipelines/generate_curation_spreadsheet.nf | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pipelines/annotation_pipeline.nf b/pipelines/annotation_pipeline.nf
index 325b7db7..60d38f85 100644
--- a/pipelines/annotation_pipeline.nf
+++ b/pipelines/annotation_pipeline.nf
@@ -30,7 +30,7 @@ if (!params.output_dir) {
     exit 1, helpMessage()
 }
 batchRoot = params.output_dir
-codeRoot = "${baseDir}/.."
+codeRoot = "${projectDir}/.."
 
 
 /*
diff --git a/pipelines/export_curation_spreadsheet.nf b/pipelines/export_curation_spreadsheet.nf
index f677eb07..2082411c 100644
--- a/pipelines/export_curation_spreadsheet.nf
+++ b/pipelines/export_curation_spreadsheet.nf
@@ -28,7 +28,7 @@ if (!params.curation_root or !params.input_csv) {
     exit 1, helpMessage()
 }
 curationRoot = params.curation_root
-codeRoot = "${baseDir}/.."
+codeRoot = "${projectDir}/.."
 
 
 /*
diff --git a/pipelines/generate_curation_spreadsheet.nf b/pipelines/generate_curation_spreadsheet.nf
index 115543df..1b24b5ec 100644
--- a/pipelines/generate_curation_spreadsheet.nf
+++ b/pipelines/generate_curation_spreadsheet.nf
@@ -32,7 +32,7 @@ if (!params.curation_root) {
     exit 1, helpMessage()
 }
 curationRoot = params.curation_root
-codeRoot = "${baseDir}/.."
+codeRoot = "${projectDir}/.."
 
 
 /*