Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EVA-3378 read scientific name from evapro #171

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions eva_submission/ENA_submission/upload_to_ENA.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,10 @@ def _post_xml_file_to_ena(self, url, file_dict):
)
return response

def upload_xml_files_to_ena(self, dry_ena_upload=False):
def upload_xml_files_to_ena(self, private_config_xml_file, profile, dry_ena_upload=False):
"""Upload the xml files to the webin submission endpoint and parse the receipt."""
submission_file, project_file, analysis_file = self.converter.create_submission_files(self.eload)
submission_file, project_file, analysis_file = self.converter.create_submission_files(self.eload,
private_config_xml_file, profile)
file_dict = {
'SUBMISSION': (os.path.basename(submission_file), get_file_content(submission_file), 'application/xml'),
'ANALYSIS': (os.path.basename(analysis_file), get_file_content(analysis_file), 'application/xml')
Expand Down Expand Up @@ -114,10 +115,10 @@ def parse_ena_receipt(self, ena_xml_receipt):

class ENAUploaderAsync(ENAUploader):

def upload_xml_files_to_ena(self, dry_ena_upload=False):
def upload_xml_files_to_ena(self, private_config_xml_file, profile, dry_ena_upload=False):
"""Upload the xml file to the asynchronous endpoint and monitor the results from the poll endpoint."""

webin_file = self.converter.create_single_submission_file(self.eload)
webin_file = self.converter.create_single_submission_file(self.eload, private_config_xml_file, profile)
file_dict = {
'file': (os.path.basename(webin_file), get_file_content(webin_file), 'application/xml'),
}
Expand Down
16 changes: 9 additions & 7 deletions eva_submission/ENA_submission/xlsx_to_ENA_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from xml.etree.ElementTree import Element, ElementTree

from ebi_eva_common_pyutils.logger import AppLogger
from ebi_eva_common_pyutils.taxonomy.taxonomy import get_scientific_name_from_ensembl
from ebi_eva_common_pyutils.taxonomy.taxonomy import get_scientific_name_from_taxonomy

from eva_submission.eload_utils import check_existing_project_in_ena
from eva_submission.xlsx.xlsx_parser_eva import EvaXlsxReader
Expand Down Expand Up @@ -109,7 +109,7 @@ def existing_project(self):
return prj_title
return None

def _create_project_xml(self):
def _create_project_xml(self, private_config_xml_file, profile):
"""
This function reads the project row from the XLS parser, then creates and populates an XML element following the ENA
data model.
Expand Down Expand Up @@ -151,7 +151,9 @@ def _create_project_xml(self):
if 'Tax ID' in project_row:
org_elemt = add_element(sub_project_elemt, 'ORGANISM')
add_element(org_elemt, 'TAXON_ID', element_text=str(project_row.get('Tax ID')).strip())
scientific_name = get_scientific_name_from_ensembl(str(project_row.get('Tax ID')).strip())
scientific_name = get_scientific_name_from_taxonomy(str(project_row.get('Tax ID')).strip(),
private_config_xml_file=private_config_xml_file,
profile=profile)
add_element(org_elemt, 'SCIENTIFIC_NAME', element_text=scientific_name)

add_element(org_elemt, 'STRAIN', element_text=project_row.get('Strain', ''), content_required=True)
Expand Down Expand Up @@ -339,13 +341,13 @@ def write_xml_to_file(xml_element, output_file):
with open(output_file, 'bw') as open_file:
open_file.write(prettify(etree))

def create_submission_files(self, eload):
def create_submission_files(self, eload, private_config_xml_file, profile):
files_to_submit = []
if not self.is_existing_project:
files_to_submit.append(
{'file_name': os.path.basename(self.project_file), 'schema': 'project'}
)
projects_elemt = self._create_project_xml()
projects_elemt = self._create_project_xml(private_config_xml_file, profile)
self.write_xml_to_file(projects_elemt, self.project_file)
project_file = self.project_file
else:
Expand All @@ -363,7 +365,7 @@ def create_submission_files(self, eload):

return self.submission_file, project_file, self.analysis_file

def create_single_submission_file(self, eload):
def create_single_submission_file(self, eload, private_config_xml_file, profile):
root = Element('WEBIN')
# Submission ELEMENT
action = 'ADD'
Expand All @@ -372,7 +374,7 @@ def create_single_submission_file(self, eload):

# Project ELEMENT
if not self.is_existing_project:
projects_elemt = self._create_project_xml()
projects_elemt = self._create_project_xml(private_config_xml_file, profile)
root.append(projects_elemt)

# Analysis ELEMENT
Expand Down
3 changes: 2 additions & 1 deletion eva_submission/eload_brokering.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,8 @@ def broker_to_ena(self, force=False, existing_project=None, async_upload=False,
else:
ena_uploader.upload_vcf_files_to_ena_ftp(files_to_upload)
# Upload XML to ENA
ena_uploader.upload_xml_files_to_ena(dry_ena_upload)
ena_uploader.upload_xml_files_to_ena(cfg['maven']['settings_file'], cfg['maven']['environment'],
dry_ena_upload)
if not dry_ena_upload:
# Update the project accession in case we're working with existing project
# We should not be uploading additional analysis in the same ELOAD so no need to update
Expand Down
6 changes: 4 additions & 2 deletions eva_submission/eload_preparation.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from retry import retry

from ebi_eva_common_pyutils.taxonomy.taxonomy import get_scientific_name_from_ensembl
from ebi_eva_common_pyutils.taxonomy.taxonomy import get_scientific_name_from_taxonomy
from ebi_eva_common_pyutils.config import cfg
from ebi_eva_common_pyutils.config_utils import get_contig_alias_db_creds_for_profile

Expand Down Expand Up @@ -151,7 +151,9 @@ def detect_metadata_attributes(self):
taxonomy_id = eva_metadata.project.get('Tax ID')
if taxonomy_id and (isinstance(taxonomy_id, int) or taxonomy_id.isdigit()):
self.eload_cfg.set('submission', 'taxonomy_id', value=int(taxonomy_id))
scientific_name = get_scientific_name_from_ensembl(taxonomy_id)
scientific_name = get_scientific_name_from_taxonomy(taxonomy_id,
private_config_xml_file=cfg['maven']['settings_file'],
profile=cfg['maven']['environment'])
self.eload_cfg.set('submission', 'scientific_name', value=scientific_name)
else:
if taxonomy_id:
Expand Down
2 changes: 1 addition & 1 deletion eva_submission/eload_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def _get_valid_vcf_files_by_analysis(self):

def _validate_metadata_format(self):
validator = EvaXlsxValidator(self.eload_cfg['submission']['metadata_spreadsheet'])
validator.validate()
validator.validate(cfg['maven']['settings_file'], cfg['maven']['environment'])
self.eload_cfg['validation']['metadata_check']['metadata_spreadsheet'] = self.eload_cfg['submission']['metadata_spreadsheet']
self.eload_cfg['validation']['metadata_check']['errors'] = validator.error_list
self.eload_cfg['validation']['metadata_check']['pass'] = len(validator.error_list) == 0
Expand Down
12 changes: 7 additions & 5 deletions eva_submission/xlsx/xlsx_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import yaml
from cerberus import Validator
from ebi_eva_common_pyutils.logger import AppLogger
from ebi_eva_common_pyutils.taxonomy.taxonomy import get_scientific_name_from_ensembl
from ebi_eva_common_pyutils.taxonomy.taxonomy import get_scientific_name_from_taxonomy
from ebi_eva_common_pyutils.variation.assembly_utils import retrieve_genbank_assembly_accessions_from_ncbi
from requests import HTTPError

Expand All @@ -25,10 +25,10 @@ def __init__(self, metadata_file):

self.error_list = []

def validate(self):
def validate(self, private_config_xml_file, profile):
self.cerberus_validation()
self.complex_validation()
self.semantic_validation()
self.semantic_validation(private_config_xml_file, profile)

def cerberus_validation(self):
"""
Expand Down Expand Up @@ -79,7 +79,7 @@ def complex_validation(self):
)
self.check_date(row, 'collection_date', required=True)

def semantic_validation(self):
def semantic_validation(self, private_config_xml_file, profile):
"""
Validation of the data that involves checking its meaning
This function adds error statements to the errors attribute
Expand All @@ -98,7 +98,9 @@ def semantic_validation(self):
taxid_and_species_list = set([(row['Tax Id'], row['Scientific Name']) for row in self.metadata['Sample'] if row['Tax Id']])
for taxid, species in taxid_and_species_list:
try:
scientific_name = get_scientific_name_from_ensembl(int(taxid))
scientific_name = get_scientific_name_from_taxonomy(int(taxid),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given the possible latency across the different stages (metadata validation, brokering, etc.), I think we should nail down the scientific name just once at the outset (we can decide when this should be #devmeeting), add it to the EVA, and use the metadata from EVA from there on.

private_config_xml_file=private_config_xml_file,
profile=profile)
if species != scientific_name:
if species.lower() == scientific_name.lower():
correct_taxid_sc_name[taxid] = scientific_name
Expand Down
7 changes: 6 additions & 1 deletion tests/test_eload_preparation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import shutil
from unittest import TestCase, mock
from unittest.mock import patch

from ebi_eva_common_pyutils.config import cfg

Expand Down Expand Up @@ -79,7 +80,11 @@ def test_detect_metadata_attributes(self):
self.create_vcfs()
metadata = self.create_metadata()
self.eload.eload_cfg.set('submission', 'metadata_spreadsheet', value=metadata)
self.eload.detect_metadata_attributes()
cfg.content['maven']['settings_file'] = None
cfg.content['maven']['environment'] = None
with patch('eva_submission.eload_preparation.get_scientific_name_from_taxonomy') as m_sci_name:
m_sci_name.return_value = 'Homo sapiens'
self.eload.detect_metadata_attributes()

assert self.eload.eload_cfg.query('submission', 'project_title') == 'Greatest project ever'
assert self.eload.eload_cfg.query('submission', 'taxonomy_id') == 9606
Expand Down
17 changes: 12 additions & 5 deletions tests/test_upload_to_ENA.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,13 +95,15 @@ def test_parse_ena_receipt_multiple_analyses(self):
}

def test_single_upload_xml_files_to_ena(self):
with patch.object(ENAUploader, '_post_xml_file_to_ena') as mock_post,\
patch('eva_submission.ENA_submission.upload_to_ENA.requests.get') as mock_get:
with patch.object(ENAUploader, '_post_xml_file_to_ena') as mock_post, \
patch('eva_submission.ENA_submission.xlsx_to_ENA_xml.get_scientific_name_from_taxonomy') as m_sci_name, \
patch('eva_submission.ENA_submission.upload_to_ENA.requests.get') as mock_get:
m_sci_name.return_value = 'Homo Sapiens'
json_data = {'submissionId': 'ERA123456', '_links': [{'rel': 'poll-xml', 'href': 'https://example.com/link'}]}
mock_post.return_value = Mock(status_code=200, json=Mock(return_value=json_data))
mock_get.return_value = Mock(status_code=200, text=self.receipt)
self.assertFalse(os.path.isfile(self.uploader_async.converter.single_submission_file))
self.uploader_async.upload_xml_files_to_ena()
self.uploader_async.upload_xml_files_to_ena(None, None)
self.assertTrue(os.path.isfile(self.uploader_async.converter.single_submission_file))
mock_post.assert_called_with(
'https://wwwdev.ebi.ac.uk/ena/submit/webin-v2/submit/queue',
Expand All @@ -119,16 +121,21 @@ def test_single_upload_xml_files_to_ena(self):

def test_single_upload_xml_files_to_ena_failed(self):
self.assertFalse(os.path.isfile(self.uploader_async.converter.single_submission_file))
self.uploader_async.upload_xml_files_to_ena()
with patch('eva_submission.ENA_submission.xlsx_to_ENA_xml.get_scientific_name_from_taxonomy') as m_sci_name:
m_sci_name.return_value = 'Homo Sapiens'
self.uploader_async.upload_xml_files_to_ena(None, None)
self.assertTrue(os.path.isfile(self.uploader_async.converter.single_submission_file))
self.assertEqual(self.uploader_async.results, {'errors': ['403']})


def test_single_dry_upload_xml_files_to_ena(self):
with patch.object(ENAUploader, '_post_xml_file_to_ena') as mock_post,\
patch('eva_submission.ENA_submission.upload_to_ENA.requests.get') as mock_get, \
patch('eva_submission.ENA_submission.xlsx_to_ENA_xml.get_scientific_name_from_taxonomy') as m_sci_name, \
patch.object(ENAUploaderAsync, 'info') as mock_info:
m_sci_name.return_value = 'Homo Sapiens'
self.assertFalse(os.path.isfile(self.uploader_async.converter.single_submission_file))
self.uploader_async.upload_xml_files_to_ena(dry_ena_upload=True)
self.uploader_async.upload_xml_files_to_ena(None, None, dry_ena_upload=True)
self.assertTrue(os.path.isfile(self.uploader_async.converter.single_submission_file))
mock_info.assert_any_call('Would have uploaded the following XML files to ENA asynchronous submission '
'endpoint:')
Expand Down
24 changes: 16 additions & 8 deletions tests/test_xlsx_to_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,9 +158,9 @@ def test_create_project(self):
</PROJECT_SET>
'''
self.converter.reader = Mock(project=self.project_row)
with patch('eva_submission.ENA_submission.xlsx_to_ENA_xml.get_scientific_name_from_ensembl') as m_sci_name:
with patch('eva_submission.ENA_submission.xlsx_to_ENA_xml.get_scientific_name_from_taxonomy') as m_sci_name:
m_sci_name.return_value = 'Oncorhynchus mykiss'
root = self.converter._create_project_xml()
root = self.converter._create_project_xml(None, None)
expected_root = ET.fromstring(expected_project)
assert elements_equal(root, expected_root)

Expand All @@ -179,9 +179,9 @@ def test_add_analysis_to_existing_project(self):
))

def test_process_metadata_spreadsheet(self):
with patch('eva_submission.ENA_submission.xlsx_to_ENA_xml.get_scientific_name_from_ensembl') as m_sci_name:
with patch('eva_submission.ENA_submission.xlsx_to_ENA_xml.get_scientific_name_from_taxonomy') as m_sci_name:
m_sci_name.return_value = 'Oncorhynchus mykiss'
self.converter.create_submission_files('TEST1')
self.converter.create_submission_files('TEST1', None, None)
assert os.path.isfile(os.path.join(self.brokering_folder, 'TEST1.Submission.xml'))
assert os.path.isfile(os.path.join(self.brokering_folder, 'TEST1.Project.xml'))
assert os.path.isfile(os.path.join(self.brokering_folder, 'TEST1.Analysis.xml'))
Expand Down Expand Up @@ -243,21 +243,29 @@ def test_create_submission_with_date(self):
assert elements_equal(root, expected_root)

def test_create_submission_files(self):
submission_file, project_file, analysis_file = self.converter.create_submission_files('ELOAD_1')
with patch('eva_submission.ENA_submission.xlsx_to_ENA_xml.get_scientific_name_from_taxonomy') as m_sci_name:
m_sci_name.return_value = 'Homo Sapiens'
submission_file, project_file, analysis_file = self.converter.create_submission_files('ELOAD_1',
None, None)
assert os.path.exists(submission_file)
assert os.path.exists(project_file)
assert os.path.exists(analysis_file)

def test_create_submission_files_for_existing_project(self):
# When the project already exists, no PROJECT XML will be generated
with patch.object(EnaXlsxConverter, 'is_existing_project', return_value=True):
submission_file, project_file, analysis_file = self.converter.create_submission_files('ELOAD_1')
with patch('eva_submission.ENA_submission.xlsx_to_ENA_xml.get_scientific_name_from_taxonomy') as m_sci_name, \
patch.object(EnaXlsxConverter, 'is_existing_project', return_value=True):
m_sci_name.return_value = 'Homo Sapiens'
submission_file, project_file, analysis_file = self.converter.create_submission_files('ELOAD_1',
None, None)
assert os.path.exists(submission_file)
assert project_file is None
assert os.path.exists(analysis_file)
assert not os.path.exists(self.converter.project_file)

def test_create_single_submission_files(self):
self.converter.create_single_submission_file('ELOAD_1')
with patch('eva_submission.ENA_submission.xlsx_to_ENA_xml.get_scientific_name_from_taxonomy') as m_sci_name:
m_sci_name.return_value = 'Homo Sapiens'
self.converter.create_single_submission_file('ELOAD_1', None, None)


8 changes: 4 additions & 4 deletions tests/test_xlsx_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,9 @@ def test_complex_validation_failure(self):
self.assertEqual(self.validator_fail.error_list, expected_errors)

def test_validate(self):
with patch('eva_submission.xlsx.xlsx_validation.get_scientific_name_from_ensembl') as m_sci_name:
with patch('eva_submission.xlsx.xlsx_validation.get_scientific_name_from_taxonomy') as m_sci_name:
m_sci_name.return_value = 'Homo sapiens'
self.validator.validate()
self.validator.validate(None, None)
assert self.validator.error_list == []

def test_correct_scientific_name_in_metadata(self):
Expand All @@ -62,9 +62,9 @@ def test_correct_scientific_name_in_metadata(self):
assert len([s for s in scientific_name_list if s == 'Homo Sapiens']) == 10
assert len([s for s in scientific_name_list if s == 'HS']) == 10

with patch('eva_submission.xlsx.xlsx_validation.get_scientific_name_from_ensembl') as m_sci_name:
with patch('eva_submission.xlsx.xlsx_validation.get_scientific_name_from_taxonomy') as m_sci_name:
m_sci_name.return_value = 'Homo sapiens'
self.validator_sc_name.validate()
self.validator_sc_name.validate(None, None)
assert self.validator_sc_name.error_list == ['In Samples, Taxonomy 9606 and scientific name HS are inconsistent']

reader_after_modification = EvaXlsxReader(self.metadata_file_wrong_sc_name_copy)
Expand Down
Loading