EBIvariation · apriltuesday · May 16, 2024 · May 13, 2024 · May 14, 2024 · May 14, 2024
diff --git a/bin/trait_mapping/create_efo_table.py b/bin/trait_mapping/create_efo_table.py
diff --git a/bin/trait_mapping/export_curation_table.py b/bin/trait_mapping/export_curation_table.py
@@ -5,19 +5,14 @@
 import pandas as pd
 
 
-def export_table(input_filepath, done_filepath, import_filepath, comments_filepath):
+def export_table(input_filepath, done_filepath, comments_filepath):
     curation_table = pd.read_csv(input_filepath, skiprows=1, header=0)
 
     # Finished mappings
     done_rows = curation_table[curation_table['Status'] == 'DONE']
     done_rows = done_rows[['ClinVar label', 'URI of selected mapping', 'Label of selected mapping']]
     done_rows.to_csv(done_filepath, sep='\t', header=False, index=False)
 
-    # Terms for import
-    import_rows = curation_table[curation_table['Status'] == 'IMPORT']
-    import_rows = import_rows['URI of selected mapping']
-    import_rows.to_csv(import_filepath, header=False, index=False)
-
     # Comments column
     comment_rows = curation_table[curation_table['Comment'].notna() & curation_table['Status'].notna()]
     comment_rows = comment_rows[['ClinVar label', 'Comment']].astype(str)
@@ -32,9 +27,7 @@ def export_table(input_filepath, done_filepath, import_filepath, comments_filepa
                         help="path to input csv file")
     parser.add_argument("-d", dest="done_filepath", required=True,
                         help="path to output file for terms that are done")
-    parser.add_argument("-m", dest="import_filepath", required=True,
-                        help="path to output file for terms to import")
     parser.add_argument("-c", dest="comments_filepath", required=True,
                         help="path to output file for curator comments")
     args = parser.parse_args()
-    export_table(args.input_filepath, args.done_filepath, args.import_filepath, args.comments_filepath)
+    export_table(args.input_filepath, args.done_filepath, args.comments_filepath)
diff --git a/docs/manual-curation/README.md b/docs/manual-curation/README.md
@@ -9,7 +9,6 @@ The protocol consists of four parts which are done in sequence by different peop
 1. [**Fetch data**](step1-fetch-clinvar-data.md) (technical). The latest ClinVar data is downloaded and the trait names are extracted. They are attempted to be automatically mapped to ontology terms using ZOOMA. The traits which cannot be mapped automatically are output as a separate file, which is loaded into a Google spreadsheet.
 1. [**Curate**](step2-manual-curation.md) (biological). The curator goes through the spreadsheet and fills it in, performing manual curation. Other people review the results and comment on them.
 1. [**Extract results**](step3-export-results.md) (technical). Curation results are extracted from the spreadsheet into a TSV file. Some accompanying data is prepared for providing feedback to EFO.
-1. [**Provide feedback**](step4-submit-efo-feedback.md) (biological). The curator, using the data generated on the previous steps, submits feedback to EFO and follows up on this.
 
 ## Setting up environment
 To follow the technical steps of the protocol, you will need to set up the environment.
@@ -31,5 +30,6 @@ export CURATION_RELEASE_ROOT=${BATCH_ROOT_BASE}/manual_curation/${CURATION_RELEA
 * The number of traits in the `finished_mappings_curation.tsv` file is the same as in the spreadsheet after applying all relevant filters
 * _Important:_ spreadsheet does not contain line endings, or extraneous space symbols, in trait names (can be checked by a regexp search)
 * For submitting terms to EFO
-  + Cross-references has been populated for as many traits as possible
+  + Terms for import all have associated HP or MONDO IDs
+  + Information for new terms has been populated for as many traits as possible
   + GitHub issue has been created and linked in the issue
diff --git a/docs/manual-curation/step1-fetch-clinvar-data.md b/docs/manual-curation/step1-fetch-clinvar-data.md
@@ -18,4 +18,4 @@ nextflow run ${CODE_ROOT}/pipelines/generate_curation_spreadsheet.nf \
 
 ## Create a Google spreadsheet for curation
 
-Duplicate a [template](https://docs.google.com/spreadsheets/d/1PyDzRs3bO1klvvSv9XuHmx-x7nqZ0UAGeS6aV2SQ2Yg/edit?usp=sharing). Paste the contents of `${CURATION_RELEASE_ROOT}/google_sheets_table.tsv` file into it, starting with column H “ClinVar label”. Example of a table fully populated with data can be found [here](https://docs.google.com/spreadsheets/d/1HQ08UQTpS-0sE9MyzdUPO7EihMxDb2e8N14s1BknjVo/edit?usp=sharing).
+Duplicate a [template](https://docs.google.com/spreadsheets/d/1GWAQAZjOpzsIkdCu0CSRDoehZEUB3VjZYYiHWp9Tn7Q/edit?usp=sharing). Paste the contents of `${CURATION_RELEASE_ROOT}/google_sheets_table.tsv` file into it, starting with column H “ClinVar label”. Example of a table fully populated with data can be found [here](https://docs.google.com/spreadsheets/d/1HQ08UQTpS-0sE9MyzdUPO7EihMxDb2e8N14s1BknjVo/edit?usp=sharing).
diff --git a/docs/manual-curation/step2-manual-curation.md b/docs/manual-curation/step2-manual-curation.md
@@ -73,8 +73,6 @@ The “Status” column has the following acceptable values:
 * **SKIP** — trait is going to be skipped in this iteration, due to being too non-specific, or just having a low frequency
 * **UNSURE** — temporary status; traits to be discussed with reviewers/the team
 
-Note that IMPORT and NEW terms are processed in Step 4, for now you should ignore the `Add EFO disease` tab within the manual curation spreadsheet and simply mark the status appropriately.
-
 ### Comment field for curation review
 The "Comment" field can be used to enter arbitrary additional information which will be used by reviewers. Precede any text with initials e.g. "BK - example comment". Comments should be ordered chronologically in reverse: most recent ones at the top.
 Any comments will become available in the Notes field within the next iteration.
@@ -89,3 +87,13 @@ This provision does _not_ apply to cases where the source string contains additi
 
 ### Note on spaces and line breaks
 Sometimes, especially when copy-pasting information from external sources, a mapping label or URL can contain an additional space symbol (at the beginning or end) or an accidental line break. This causes problems in the downstream processing and must be manually removed. To minimise the occurences of this, Google Sheets template includes a validation formula for the first two columns (“URI of selected mapping” and “Label of selected mapping”). If it detects an extra space symbol or a line break, the cell will be highlighted in red.
+
+## New terms
+Once a term has been marked as IMPORT or NEW, it will automatically show up in the corresponding "Add to EFO" worksheet.
+Terms for import do not require any additional manual intervention, but new terms require some additional information, in particular:
+* **Parent term** - Suggested parent term within EFO. This is required but does not need to be exact as it will be reviewed by EFO maintainers - a rough idea of the term hierarchy is acceptable.
+* **Child terms** - Suggested children within EFO (if any); these should be added if possible.
+* **Description, synonyms, PubMed IDs** - Should be added if possible, for example taken from OMIM or MedGen, but can be skipped if the information cannot be found.
+* **MedGen, OMIM** - Links to the corresponding resource; these serve as useful references if any of the above cannot be found. They are often present in the "Suggested exact mapping" column.
+
+Any additional comments can be left in the final column; they will be passed on to EFO.
diff --git a/docs/manual-curation/step3-export-results.md b/docs/manual-curation/step3-export-results.md
@@ -27,8 +27,10 @@ The automated pipeline checks for complete duplicates in the list of text-to-ont
 ## Check and correct known problematic mappings
 There is a [spreadsheet](https://docs.google.com/spreadsheets/d/1m4ld3y3Pfust5JSOJOX9ZmImRCKRGi-fGYj_dExoGj8/edit) which was created to track trait-to-ontology mappings which were especially problematic in the past to users of Open Targets platform. Prior to running subsequent steps, make sure that all traits mentioned in that spreadsheet are mapped to the correct ontology terms in `${BATCH_ROOT_BASE}/manual_curation/latest_mappings.tsv`.
 
-## Copy the table for EFO import
-The file `${CURATION_RELEASE_ROOT}/efo_import_table.tsv` will contain a partially ready table for EFO import. Copy its contents into the “Add EFO disease” sheet in the curation spreadsheet.
+## Submit feedback to EFO
+Tables for IMPORT and NEW terms will be created by curators during [step 2](step2-manual-curation.md).
+Open a new GitHub issue with EFO to review and import these novel trait names, e.g. [https://github.com/EBISPOT/efo/issues/1898](https://github.com/EBISPOT/efo/issues/1898).
+
 
 ## Submit feedback to ZOOMA
 See more details on ZOOMA feedback in the [evidence string generation protocol](../generate-evidence-strings.md#submit-feedback-to-zooma). At this stage, only the **eva_clinvar** dataset is being submitted; clinvar_xrefs is submitted during evidence string generation.

diff --git a/docs/manual-curation/step4-submit-efo-feedback.md b/docs/manual-curation/step4-submit-efo-feedback.md
diff --git a/pipelines/export_curation_spreadsheet.nf b/pipelines/export_curation_spreadsheet.nf
@@ -45,7 +45,6 @@ workflow {
     checkDuplicates(mergeWithLatestMappings.out.newMappings)
     addMappingsHeader(checkDuplicates.out.duplicatesOk, mergeWithLatestMappings.out.newMappings, getTargetOntology.out.targetOntology)
     if (params.with_feedback) {
-        createEfoTable(exportTable.out.importTerms)
         generateZoomaFeedback(mergeWithLatestMappings.out.newMappings)
         updateLinks(addMappingsHeader.out.finalMappings, generateZoomaFeedback.out.zoomaFeedback)
     }
@@ -65,15 +64,13 @@ process exportTable {
 
     output:
     path "finished_mappings_curation.tsv", emit: finishedMappings
-    path "terms_for_efo_import.txt", emit: importTerms
     path "curator_comments.tsv", emit: curatorComments
 
     script:
     """
     \${PYTHON_BIN} ${codeRoot}/bin/trait_mapping/export_curation_table.py \
         -i ${params.input_csv} \
         -d finished_mappings_curation.tsv \
-        -m terms_for_efo_import.txt \
         -c curator_comments.tsv
     """
 }
@@ -142,31 +139,6 @@ process mergeWithLatestMappings {
     """
 }
 
-/*
- * Prepare the table for EFO import.
- */
-process createEfoTable {
-    label 'short_time'
-    label 'small_mem'
-    publishDir "${curationRoot}",
-        overwrite: true,
-        mode: "copy",
-        pattern: "*.tsv"
-
-    input:
-    path importTerms
-
-    output:
-    path "efo_import_table.tsv", emit: efoImportTable
-
-    script:
-    """
-    \${PYTHON_BIN} ${codeRoot}/bin/trait_mapping/create_efo_table.py \
-        -i ${importTerms} \
-        -o efo_import_table.tsv
-    """
-}
-
 /*
  * Generate ZOOMA feedback.
  */

diff --git a/tests/consequence_prediction/structural_variants/test_structural_pipeline.py b/tests/consequence_prediction/structural_variants/test_structural_pipeline.py
@@ -26,10 +26,10 @@ def run_pipeline(resource_name, include_transcripts=False):
 
 def test_successful_run():
     assert sorted(run_pipeline('precise_genomic.xml.gz')) == sorted([
-        ['NC_000016.10:g.72059151_72063259del', 'ENSG00000140830', 'TXNL4B', 'feature_truncation'],
+        ['NC_000016.10:g.72059151_72063259del', 'ENSG00000140830', 'TXNL4B', 'intron_variant'],
         ['NC_000016.10:g.72059151_72063259del', 'ENSG00000257017', 'HP', 'stop_lost'],
         ['NC_000016.10:g.72059151_72063259del', 'ENSG00000261701', 'HPR', 'feature_truncation'],
-        ['NC_000001.11:g.25271785_25329047del', 'ENSG00000117616', 'RSRP1', 'feature_truncation'],
+        ['NC_000001.11:g.25271785_25329047del', 'ENSG00000117616', 'RSRP1', 'intron_variant'],
         ['NC_000001.11:g.25271785_25329047del', 'ENSG00000187010', 'RHD', 'stop_lost'],
         ['NC_000011.10:g.5226797_5226798insGCC', 'ENSG00000244734', 'HBB', 'feature_elongation']
     ])