Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes from submissions #163

Merged
merged 9 commits into from
Jul 10, 2023
4 changes: 2 additions & 2 deletions bin/broker_submission.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,15 @@
from ebi_eva_common_pyutils.logger import logging_config as log_cfg

from eva_submission.eload_brokering import EloadBrokering
from eva_submission.eload_utils import check_existing_project
from eva_submission.eload_utils import check_existing_project_in_ena
from eva_submission.submission_config import load_config

logger = log_cfg.get_logger(__name__)


def ENA_Project(project):
    """Validate early that the project provided exists in ENA and is public.

    NOTE(review): naming/usage suggests this is an argparse ``type=`` callable —
    confirm against the argument parser definition.

    :param project: the candidate project accession
    :return: the project accession as a string
    :raises ValueError: if the project does not exist (or is not public) in ENA
    """
    if not check_existing_project_in_ena(str(project)):
        logger.warning(f'Project {project} provided does not exist in ENA.')
        # Carry a message so the failure is self-explanatory to the caller/user.
        raise ValueError(f'Project {project} provided does not exist in ENA.')
    return str(project)
Expand Down
11 changes: 7 additions & 4 deletions eva_submission/ENA_submission/upload_to_ENA.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,10 +129,13 @@ def upload_xml_files_to_ena(self, dry_ena_upload=False):
response = self._post_xml_file_to_ena(cfg.query('ena', 'submit_async'), file_dict)
if response.status_code == 200:
json_data = response.json()
xml_link = [link_dict['href'] for link_dict in json_data['links'] if link_dict['rel'] == 'poll-xml'][0]
self.results['submissionId'] = json_data['submissionId']
self.results['poll-links'] = xml_link
self.monitor_results()
if 'links' in json_data:
xml_link = [link_dict['href'] for link_dict in json_data['links'] if link_dict['rel'] == 'poll-xml'][0]
self.results['submissionId'] = json_data['submissionId']
self.results['poll-links'] = xml_link
self.monitor_results()
else:
self.results['errors'] = [f'No links present in json document: {json_data}']
else:
self.results['errors'] = [f'{response.status_code}']

Expand Down
6 changes: 3 additions & 3 deletions eva_submission/ENA_submission/xlsx_to_ENA_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from ebi_eva_common_pyutils.logger import AppLogger
from ebi_eva_common_pyutils.taxonomy.taxonomy import get_scientific_name_from_ensembl

from eva_submission.eload_utils import check_existing_project
from eva_submission.eload_utils import check_existing_project_in_ena
from eva_submission.xlsx.xlsx_parser_eva import EvaXlsxReader


Expand Down Expand Up @@ -103,9 +103,9 @@ def is_existing_project(self):
def existing_project(self):
    """Return the pre-existing project accession referenced in the metadata sheet, or None.

    Checks the 'Project Alias' then the 'Project Title' fields for a value that
    looks like an ENA/NCBI project accession (PRJEB... / PRJNA...) and confirms
    it actually exists in ENA before returning it.
    """
    prj_alias = self.reader.project.get('Project Alias', '')
    prj_title = self.reader.project.get('Project Title', '')
    if re.match(r'^PRJ(EB|NA)', prj_alias) and check_existing_project_in_ena(prj_alias):
        return prj_alias
    if re.match(r'^PRJ(EB|NA)', prj_title) and check_existing_project_in_ena(prj_title):
        return prj_title
    return None

Expand Down
13 changes: 7 additions & 6 deletions eva_submission/biosamples_submission.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,12 +404,13 @@ def map_metadata_to_bsd_data(self):
pass
if sample_row.get('Novel attribute(s)'):
for novel_attribute in sample_row.get('Novel attribute(s)').split(','):
attribute, value = novel_attribute.split(':')
self.apply_mapping(
bsd_sample_entry['characteristics'],
self.map_sample_key(attribute.lower()),
[{'text': self.serialize(value)}]
)
if ":" in novel_attribute:
attribute, value = novel_attribute.strip().split(':')
self.apply_mapping(
bsd_sample_entry['characteristics'],
self.map_sample_key(attribute.lower()),
[{'text': self.serialize(value)}]
)
# Apply defaults if the key doesn't already exist
for key in self.characteristic_defaults:
if key not in sample_row:
Expand Down
4 changes: 2 additions & 2 deletions eva_submission/eload_ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

from eva_submission import NEXTFLOW_DIR
from eva_submission.eload_submission import Eload
from eva_submission.eload_utils import provision_new_database_for_variant_warehouse
from eva_submission.eload_utils import provision_new_database_for_variant_warehouse, check_project_exists_in_evapro
from eva_submission.submission_config import EloadConfig
from eva_submission.vep_utils import get_vep_and_vep_cache_version

Expand Down Expand Up @@ -207,7 +207,7 @@ def check_variant_db(self):
provision_new_database_for_variant_warehouse(db_info['db_name'])

def load_from_ena(self):
if self.eload_cfg.query('brokering', 'ena', 'existing_project'):
if check_project_exists_in_evapro(self.project_accession):
analyses = self.eload_cfg.query('brokering', 'ena', 'ANALYSIS')
for analysis_accession in analyses.values():
self.load_from_ena_from_project_or_analysis(analysis_accession)
Expand Down
10 changes: 5 additions & 5 deletions eva_submission/eload_preparation.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def check_submitted_filenames(self):
f'{", ".join(set(submitted_vcfs).difference(set(spreadsheet_vcfs)))}')
analysis_alias = ''
if len(eva_xls_reader.analysis) == 1:
analysis_alias = eva_xls_reader.analysis[0].get('Analysis Alias') or ''
analysis_alias = self._unique_alias(eva_xls_reader.analysis[0].get('Analysis Alias')) or ''
elif len(eva_xls_reader.analysis) > 1:
self.error("Multiple analyses found, can't add submitted VCF to spreadsheet")
raise ValueError("Multiple analyses found, can't add submitted VCF to spreadsheet")
Expand All @@ -123,26 +123,26 @@ def detect_metadata_attributes(self):
analysis_reference = {}
for analysis in eva_metadata.analysis:
reference_txt = analysis.get('Reference')
analysis_alias = self._unique_alias(analysis.get('Analysis Alias'))
assembly_accessions = resolve_accession_from_text(reference_txt) if reference_txt else None
if not assembly_accessions:
assembly_accession = None
elif len(assembly_accessions) == 1:
assembly_accession = assembly_accessions[0]
else:
self.warning(f"Multiple assemblies found for {analysis.get('Analysis Alias')}: {', '.join(assembly_accessions)} ")
self.warning(f"Multiple assemblies found for {analysis_alias}: {', '.join(assembly_accessions)} ")
assembly_accession = sorted(assembly_accessions)[-1]
self.warning(f"Will use the most recent assembly: {assembly_accession}")

if assembly_accession:
analysis_reference[analysis.get('Analysis Alias')] = {'assembly_accession': assembly_accession,
'vcf_files': []}
analysis_reference[analysis_alias] = {'assembly_accession': assembly_accession, 'vcf_files': []}
else:
self.error(f"Reference is missing for Analysis {analysis.get('Analysis Alias')}")

for file in eva_metadata.files:
if file.get("File Type") == 'vcf':
file_full = os.path.join(self.eload_dir, directory_structure['vcf'], file.get("File Name"))
analysis_alias = file.get("Analysis Alias")
analysis_alias = self._unique_alias(file.get("Analysis Alias"))
analysis_reference[analysis_alias]['vcf_files'].append(file_full)
self.eload_cfg.set('submission', 'analyses', value=analysis_reference)

Expand Down
13 changes: 10 additions & 3 deletions eva_submission/eload_submission.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,11 @@ def _get_dir(self, key):
def now(self):
return datetime.now()

def _unique_alias(self, alias):
if alias and not alias.startswith(self.eload):
return f'{self.eload}_{alias}'
return alias

def create_log_file(self):
logfile_name = os.path.join(self.eload_dir, str(self.eload) + "_submission.log")
if logfile_name not in eload_logging_files:
Expand All @@ -100,14 +105,14 @@ def update_metadata_spreadsheet(self, input_spreadsheet, output_spreadsheet=None
reader = EvaXlsxReader(input_spreadsheet)
single_analysis_alias = None
if len(reader.analysis) == 1:
single_analysis_alias = reader.analysis[0].get('Analysis Alias')
single_analysis_alias = self._unique_alias(reader.analysis[0].get('Analysis Alias'))

sample_rows = []
for sample_row in reader.samples:
if self.eload_cfg.query('brokering', 'Biosamples', 'Samples', sample_row.get('Sample Name')):
sample_rows.append({
'row_num': sample_row.get('row_num'),
'Analysis Alias': sample_row.get('Analysis Alias') or single_analysis_alias,
'Analysis Alias': self._unique_alias(sample_row.get('Analysis Alias')) or single_analysis_alias,
'Sample ID': sample_row.get('Sample Name'),
'Sample Accession': self.eload_cfg['brokering']['Biosamples']['Samples'][sample_row.get('Sample Name')]
})
Expand Down Expand Up @@ -138,7 +143,9 @@ def update_metadata_spreadsheet(self, input_spreadsheet, output_spreadsheet=None
project_row = reader.project
if existing_project:
project_row['Project Alias'] = existing_project

elif self.eload not in project_row['Project Alias']:
# Add the eload id to ensure that the project alias is unique
project_row['Project Alias'] = self._unique_alias(project_row['Project Alias'])
if output_spreadsheet:
eva_xls_writer = EvaXlsxWriter(input_spreadsheet, output_spreadsheet)
else:
Expand Down
11 changes: 10 additions & 1 deletion eva_submission/eload_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,15 @@ def get_project_alias(project_accession):
return rows[0][0]


def check_project_exists_in_evapro(project_accession):
    """Return True when *project_accession* is registered in the EVAPRO metadata database.

    :param project_accession: project accession (e.g. PRJEB12345) to look up
    :return: True if exactly one matching row exists in evapro.project, False otherwise
    """
    with get_metadata_connection_handle(cfg['maven']['environment'], cfg['maven']['settings_file']) as conn:
        # NOTE(review): the accession is interpolated straight into the SQL text.
        # Acceptable for internally generated accessions, but a parameterised
        # query would be safer if this can ever receive external input.
        query = f"select alias from evapro.project where project_accession='{project_accession}';"
        # Exactly one row means the project is known; zero (or, defensively,
        # several) rows are treated as "not found".
        return len(get_all_results_for_query(conn, query)) == 1


def get_hold_date_from_ena(project_accession, project_alias=None):
"""Gets hold date from ENA"""
if not project_alias:
Expand Down Expand Up @@ -145,7 +154,7 @@ def download_file(url, dest):


@retry(tries=4, delay=2, backoff=1.2, jitter=(1, 3))
def check_existing_project(project_accession):
def check_existing_project_in_ena(project_accession):
"""
Check if a project accession exists and is public in ENA
:param project_accession:
Expand Down
21 changes: 13 additions & 8 deletions eva_submission/eload_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,12 @@ def mark_valid_files_and_metadata(self, merge_per_analysis):
self.eload_cfg.query('validation', validation_task, 'forced', ret_default=False)
for validation_task in self.all_validation_tasks
]):
self.eload_cfg.set('validation', 'valid', 'analyses',
value=copy.copy(self.eload_cfg.query('submission', 'analyses')))

for analysis_alias in self.eload_cfg.query('submission', 'analyses'):
u_analysis_alias = self._unique_alias(analysis_alias)
self.eload_cfg.set('validation', 'valid', 'analyses', u_analysis_alias,
value=self.eload_cfg.query('submission', 'analyses', analysis_alias))
self.eload_cfg.set(
'validation', 'valid', 'analyses', analysis_alias, 'vcf_files',
'validation', 'valid', 'analyses', u_analysis_alias, 'vcf_files',
value=self.eload_cfg.query('submission', 'analyses', analysis_alias, 'vcf_files')
)
self.eload_cfg.set('validation', 'valid', 'metadata_spreadsheet',
Expand All @@ -76,7 +76,7 @@ def mark_valid_files_and_metadata(self, merge_per_analysis):
def _get_vcf_files(self):
vcf_files = []
for analysis_alias in self.eload_cfg.query('submission', 'analyses'):
files = self.eload_cfg.query('submission', 'analyses', analysis_alias, 'vcf_files')
files = self.eload_cfg.query('submission', 'analyses', self._unique_alias(analysis_alias), 'vcf_files')
vcf_files.extend(files) if files else None
return vcf_files

Expand All @@ -85,7 +85,7 @@ def _get_valid_vcf_files_by_analysis(self):
valid_analysis_dict = self.eload_cfg.query('validation', 'valid', 'analyses')
if valid_analysis_dict:
for analysis_alias in valid_analysis_dict:
vcf_files[analysis_alias] = valid_analysis_dict[analysis_alias]['vcf_files']
vcf_files[self._unique_alias(analysis_alias)] = valid_analysis_dict[analysis_alias]['vcf_files']
return vcf_files

def _validate_metadata_format(self):
Expand All @@ -102,8 +102,8 @@ def _validate_sample_names(self):
)
for analysis_alias in results_per_analysis_alias:
has_difference, diff_submitted_file_submission, diff_submission_submitted_file = results_per_analysis_alias[analysis_alias]

self.eload_cfg.set('validation', 'sample_check', 'analysis', str(analysis_alias), value={
analysis_alias = self._unique_alias(analysis_alias)
self.eload_cfg.set('validation', 'sample_check', 'analysis', analysis_alias, value={
'difference_exists': has_difference,
'in_VCF_not_in_metadata': diff_submitted_file_submission,
'in_metadata_not_in_VCF': diff_submission_submitted_file
Expand All @@ -117,6 +117,7 @@ def _validate_genotype_aggregation(self):
detect_vcf_aggregation(vcf_file)
for vcf_file in self.eload_cfg.query('submission', 'analyses', analysis_alias, 'vcf_files')
]
analysis_alias = self._unique_alias(analysis_alias)
if len(set(aggregations)) == 1 and None not in aggregations:
aggregation = set(aggregations).pop()
self.eload_cfg.set('validation', 'aggregation_check', 'analyses', str(analysis_alias), value=aggregation)
Expand All @@ -139,6 +140,7 @@ def detect_and_optionally_merge(self, merge_per_analysis):
for analysis_alias, vcf_files in vcfs_by_analysis.items():
if len(vcf_files) < 2:
continue
analysis_alias = self._unique_alias(analysis_alias)
merge_type = detect_merge_type(vcf_files)
if merge_type:
self.eload_cfg.set('validation', 'merge_type', analysis_alias, value=merge_type.value)
Expand Down Expand Up @@ -468,6 +470,7 @@ def _sample_check_report(self):
reports = []
for analysis_alias in self.eload_cfg.query('validation', 'sample_check', 'analysis', ret_default=[]):
results = self.eload_cfg.query('validation', 'sample_check', 'analysis', analysis_alias)
analysis_alias = self._unique_alias(analysis_alias)
report_data = {
'analysis_alias': analysis_alias,
'pass': 'FAIL' if results.get('difference_exists') else 'PASS',
Expand All @@ -486,6 +489,7 @@ def _vcf_merge_report(self):
return ' No mergeable VCFs\n'
reports = [' Merge types:']
for analysis_alias, merge_type in analysis_merge_dict.items():
analysis_alias = self._unique_alias(analysis_alias)
reports.append(f' * {analysis_alias}: {merge_type}')

errors = self.eload_cfg.query('validation', 'merge_errors')
Expand All @@ -500,6 +504,7 @@ def _aggregation_report(self):
reports = []
if aggregation_dict:
for analysis_alias, aggregation in aggregation_dict.get('analyses', {}).items():
analysis_alias = self._unique_alias(analysis_alias)
reports.append(f" * {analysis_alias}: {aggregation}")
reports.append(" * Errors:")
for error in aggregation_dict.get('errors', []):
Expand Down
4 changes: 2 additions & 2 deletions eva_submission/submission_qc_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ def check_all_browsable_files_are_available_in_ftp(self, vcf_files):
no_ext_file, _ = os.path.splitext(file)
if file not in files_in_ftp:
missing_files.append(file)
if f'{file}.csi' not in files_in_ftp or f'{no_ext_file}.csi' not in files_in_ftp:
if f'{file}.csi' not in files_in_ftp and f'{no_ext_file}.csi' not in files_in_ftp:
missing_files.append(f'{file}.csi or {no_ext_file}.csi')

# accessioned files will not be present for human taxonomy
Expand All @@ -161,7 +161,7 @@ def check_all_browsable_files_are_available_in_ftp(self, vcf_files):
no_ext_accessioned_file, _ = os.path.splitext(accessioned_file)
if accessioned_file not in files_in_ftp:
missing_files.append(accessioned_file)
if f'{accessioned_file}.csi' not in files_in_ftp or f'{no_ext_accessioned_file}.csi' not in files_in_ftp:
if f'{accessioned_file}.csi' not in files_in_ftp and f'{no_ext_accessioned_file}.csi' not in files_in_ftp:
missing_files.append(f'{accessioned_file}.csi or {no_ext_accessioned_file}.csi')

self._ftp_check_result = "PASS" if not missing_files else "FAIL"
Expand Down
6 changes: 4 additions & 2 deletions tests/test_eload_ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,12 +138,14 @@ def test_check_variant_db_with_creation(self):
m_mongo.return_value.shard_collections.assert_called_once()

def test_load_from_ena(self):
    # Patch out the EVAPRO existence check and the external command so the
    # test exercises only load_from_ena's control flow.
    with patch('eva_submission.eload_ingestion.check_project_exists_in_evapro'), \
            patch('eva_submission.eload_ingestion.command_utils.run_command_with_output', autospec=True) as m_execute:
        self.eload.load_from_ena()
        m_execute.assert_called_once()

def test_load_from_ena_script_fails(self):
    # Patch out the EVAPRO check; make the external command fail to verify the
    # error is propagated to the caller.
    with patch('eva_submission.eload_ingestion.check_project_exists_in_evapro'), \
            patch('eva_submission.eload_ingestion.command_utils.run_command_with_output', autospec=True) as m_execute:
        m_execute.side_effect = subprocess.CalledProcessError(1, 'some command')
        with self.assertRaises(subprocess.CalledProcessError):
            self.eload.load_from_ena()
Expand Down
4 changes: 2 additions & 2 deletions tests/test_eload_preparation.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,8 @@ def test_detect_metadata_attributes(self):
assert self.eload.eload_cfg.query('submission', 'project_title') == 'Greatest project ever'
assert self.eload.eload_cfg.query('submission', 'taxonomy_id') == 9606
assert self.eload.eload_cfg.query('submission', 'scientific_name') == 'Homo sapiens'
assert self.eload.eload_cfg.query('submission', 'analyses', 'GAE', 'assembly_accession') == 'GCA_000001405.1'
vcf_files = self.eload.eload_cfg.query('submission', 'analyses', 'GAE', 'vcf_files')
assert self.eload.eload_cfg.query('submission', 'analyses', 'ELOAD_1_GAE', 'assembly_accession') == 'GCA_000001405.1'
vcf_files = self.eload.eload_cfg.query('submission', 'analyses', 'ELOAD_1_GAE', 'vcf_files')
assert len(vcf_files) == 1
assert '10_submitted/vcf_files/T100.vcf.gz' in vcf_files[0]

Expand Down
Loading