Fixes from submissions #163

Merged: 9 commits, Jul 10, 2023
11 changes: 7 additions & 4 deletions eva_submission/ENA_submission/upload_to_ENA.py
@@ -129,10 +129,13 @@ def upload_xml_files_to_ena(self, dry_ena_upload=False):
response = self._post_xml_file_to_ena(cfg.query('ena', 'submit_async'), file_dict)
if response.status_code == 200:
json_data = response.json()
xml_link = [link_dict['href'] for link_dict in json_data['links'] if link_dict['rel'] == 'poll-xml'][0]
self.results['submissionId'] = json_data['submissionId']
self.results['poll-links'] = xml_link
self.monitor_results()
if 'links' in json_data:
xml_link = [link_dict['href'] for link_dict in json_data['links'] if link_dict['rel'] == 'poll-xml'][0]
self.results['submissionId'] = json_data['submissionId']
self.results['poll-links'] = xml_link
self.monitor_results()
else:
self.results['errors'] = [f'No links present in json document: {json_data}']
else:
self.results['errors'] = [f'{response.status_code}']
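For context on the new guard: the async endpoint's JSON response is only polled when it actually carries a `links` array. A minimal sketch of that branch, using a made-up response payload (the values and URL are illustrative; only the key names come from the code above):

```python
# Hypothetical ENA async-submission response; values are illustrative.
json_data = {
    'submissionId': 'ERA0000001',
    'links': [
        {'rel': 'poll-xml', 'href': 'https://example.org/poll/ERA0000001'},
        {'rel': 'self', 'href': 'https://example.org/submissions/ERA0000001'},
    ],
}

results = {}
if 'links' in json_data:
    # Same selection as the patched code: first link whose 'rel' is 'poll-xml'.
    xml_link = [link['href'] for link in json_data['links'] if link['rel'] == 'poll-xml'][0]
    results['submissionId'] = json_data['submissionId']
    results['poll-links'] = xml_link
else:
    # Previously this case raised a KeyError; now it is recorded as an error.
    results['errors'] = [f'No links present in json document: {json_data}']
```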

13 changes: 7 additions & 6 deletions eva_submission/biosamples_submission.py
@@ -404,12 +404,13 @@ def map_metadata_to_bsd_data(self):
pass
if sample_row.get('Novel attribute(s)'):
for novel_attribute in sample_row.get('Novel attribute(s)').split(','):
attribute, value = novel_attribute.split(':')
self.apply_mapping(
bsd_sample_entry['characteristics'],
self.map_sample_key(attribute.lower()),
[{'text': self.serialize(value)}]
)
if ":" in novel_attribute:
attribute, value = novel_attribute.strip().split(':')
self.apply_mapping(
bsd_sample_entry['characteristics'],
self.map_sample_key(attribute.lower()),
[{'text': self.serialize(value)}]
)
# Apply defaults if the key doesn't already exist
for key in self.characteristic_defaults:
if key not in sample_row:
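A quick illustration of the stricter parsing above, with a made-up spreadsheet cell (the `map_sample_key`/`serialize` steps are simplified to a plain lowercase key):

```python
# Hypothetical value of the 'Novel attribute(s)' column for one sample row.
novel_attributes = 'Collection Site:river bank, malformed entry, Depth:10m'

characteristics = {}
for novel_attribute in novel_attributes.split(','):
    # Entries without a ':' are now skipped instead of raising
    # "ValueError: not enough values to unpack".
    if ':' in novel_attribute:
        attribute, value = novel_attribute.strip().split(':')
        characteristics[attribute.lower()] = [{'text': value}]

print(characteristics)
# {'collection site': [{'text': 'river bank'}], 'depth': [{'text': '10m'}]}
```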
10 changes: 5 additions & 5 deletions eva_submission/eload_preparation.py
@@ -103,7 +103,7 @@ def check_submitted_filenames(self):
f'{", ".join(set(submitted_vcfs).difference(set(spreadsheet_vcfs)))}')
analysis_alias = ''
if len(eva_xls_reader.analysis) == 1:
analysis_alias = eva_xls_reader.analysis[0].get('Analysis Alias') or ''
analysis_alias = self._unique_analysis_alias(eva_xls_reader.analysis[0].get('Analysis Alias')) or ''
elif len(eva_xls_reader.analysis) > 1:
self.error("Multiple analyses found, can't add submitted VCF to spreadsheet")
raise ValueError("Multiple analyses found, can't add submitted VCF to spreadsheet")
@@ -123,26 +123,26 @@ def detect_metadata_attributes(self):
analysis_reference = {}
for analysis in eva_metadata.analysis:
reference_txt = analysis.get('Reference')
analysis_alias = self._unique_analysis_alias(analysis.get('Analysis Alias'))
assembly_accessions = resolve_accession_from_text(reference_txt) if reference_txt else None
if not assembly_accessions:
assembly_accession = None
elif len(assembly_accessions) == 1:
assembly_accession = assembly_accessions[0]
else:
self.warning(f"Multiple assemblies found for {analysis.get('Analysis Alias')}: {', '.join(assembly_accessions)} ")
self.warning(f"Multiple assemblies found for {analysis_alias}: {', '.join(assembly_accessions)} ")
assembly_accession = sorted(assembly_accessions)[-1]
self.warning(f"Will use the most recent assembly: {assembly_accession}")

if assembly_accession:
analysis_reference[analysis.get('Analysis Alias')] = {'assembly_accession': assembly_accession,
'vcf_files': []}
analysis_reference[analysis_alias] = {'assembly_accession': assembly_accession, 'vcf_files': []}
else:
self.error(f"Reference is missing for Analysis {analysis.get('Analysis Alias')}")

for file in eva_metadata.files:
if file.get("File Type") == 'vcf':
file_full = os.path.join(self.eload_dir, directory_structure['vcf'], file.get("File Name"))
analysis_alias = file.get("Analysis Alias")
analysis_alias = self._unique_analysis_alias(file.get("Analysis Alias"))
analysis_reference[analysis_alias]['vcf_files'].append(file_full)
self.eload_cfg.set('submission', 'analyses', value=analysis_reference)
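Purely illustrative: the shape of the `submission.analyses` mapping this builds once the aliases are prefixed by the new `_unique_analysis_alias` helper (added in eload_submission.py below). The alias, accession and file name are taken from the updated test; the ELOAD directory path is made up:

```python
import os

eload_dir = '/path/to/ELOAD_1'          # hypothetical ELOAD directory
vcf_dir = '10_submitted/vcf_files'      # submitted-VCF subdirectory used in the tests

analysis_reference = {
    'ELOAD_1_GAE': {'assembly_accession': 'GCA_000001405.1', 'vcf_files': []}
}
# Each submitted VCF listed in the metadata is attached to its (prefixed) analysis alias.
for file_name in ['T100.vcf.gz']:
    analysis_reference['ELOAD_1_GAE']['vcf_files'].append(
        os.path.join(eload_dir, vcf_dir, file_name))

# The result stored under eload_cfg 'submission' -> 'analyses':
# {'ELOAD_1_GAE': {'assembly_accession': 'GCA_000001405.1',
#                  'vcf_files': ['/path/to/ELOAD_1/10_submitted/vcf_files/T100.vcf.gz']}}
```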

13 changes: 10 additions & 3 deletions eva_submission/eload_submission.py
@@ -74,6 +74,11 @@ def _get_dir(self, key):
def now(self):
return datetime.now()

def _unique_analysis_alias(self, analysis_alias):
if not analysis_alias.startswith(self.eload):
return f'{self.eload}_{analysis_alias}'
return analysis_alias

def create_log_file(self):
logfile_name = os.path.join(self.eload_dir, str(self.eload) + "_submission.log")
if logfile_name not in eload_logging_files:
@@ -100,14 +105,14 @@ def update_metadata_spreadsheet(self, input_spreadsheet, output_spreadsheet=None
reader = EvaXlsxReader(input_spreadsheet)
single_analysis_alias = None
if len(reader.analysis) == 1:
single_analysis_alias = reader.analysis[0].get('Analysis Alias')
single_analysis_alias = self._unique_analysis_alias(reader.analysis[0].get('Analysis Alias'))

sample_rows = []
for sample_row in reader.samples:
if self.eload_cfg.query('brokering', 'Biosamples', 'Samples', sample_row.get('Sample Name')):
sample_rows.append({
'row_num': sample_row.get('row_num'),
'Analysis Alias': sample_row.get('Analysis Alias') or single_analysis_alias,
'Analysis Alias': self._unique_analysis_alias(sample_row.get('Analysis Alias')) or single_analysis_alias,
'Sample ID': sample_row.get('Sample Name'),
'Sample Accession': self.eload_cfg['brokering']['Biosamples']['Samples'][sample_row.get('Sample Name')]
})
@@ -138,7 +143,9 @@ def update_metadata_spreadsheet(self, input_spreadsheet, output_spreadsheet=None
project_row = reader.project
if existing_project:
project_row['Project Alias'] = existing_project

elif self.eload not in project_row['Project Alias']:
# Add the eload id to ensure that the project alias is unique
project_row['Project Alias'] = self.eload + '_' + project_row['Project Alias']
if output_spreadsheet:
eva_xls_writer = EvaXlsxWriter(input_spreadsheet, output_spreadsheet)
else:
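A standalone sketch of what the new `_unique_analysis_alias` helper above guarantees: the alias is prefixed with the ELOAD id exactly once, so the prefix does not stack on repeated calls (`ELOAD_2` stands in for `self.eload`):

```python
eload = 'ELOAD_2'  # stands in for self.eload

def unique_analysis_alias(analysis_alias):
    # Prefix the alias with the ELOAD id unless it is already prefixed.
    if not analysis_alias.startswith(eload):
        return f'{eload}_{analysis_alias}'
    return analysis_alias

assert unique_analysis_alias('GAE') == 'ELOAD_2_GAE'
assert unique_analysis_alias('ELOAD_2_GAE') == 'ELOAD_2_GAE'  # idempotent
```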
21 changes: 13 additions & 8 deletions eva_submission/eload_validation.py
@@ -61,12 +61,12 @@ def mark_valid_files_and_metadata(self, merge_per_analysis):
self.eload_cfg.query('validation', validation_task, 'forced', ret_default=False)
for validation_task in self.all_validation_tasks
]):
self.eload_cfg.set('validation', 'valid', 'analyses',
value=copy.copy(self.eload_cfg.query('submission', 'analyses')))

for analysis_alias in self.eload_cfg.query('submission', 'analyses'):
u_analysis_alias = self._unique_analysis_alias(analysis_alias)
self.eload_cfg.set('validation', 'valid', 'analyses', u_analysis_alias,
value=self.eload_cfg.query('submission', 'analyses', analysis_alias))
self.eload_cfg.set(
'validation', 'valid', 'analyses', analysis_alias, 'vcf_files',
'validation', 'valid', 'analyses', u_analysis_alias, 'vcf_files',
value=self.eload_cfg.query('submission', 'analyses', analysis_alias, 'vcf_files')
)
self.eload_cfg.set('validation', 'valid', 'metadata_spreadsheet',
@@ -76,7 +76,7 @@ def _get_vcf_files(self):
def _get_vcf_files(self):
vcf_files = []
for analysis_alias in self.eload_cfg.query('submission', 'analyses'):
files = self.eload_cfg.query('submission', 'analyses', analysis_alias, 'vcf_files')
files = self.eload_cfg.query('submission', 'analyses', self._unique_analysis_alias(analysis_alias), 'vcf_files')
vcf_files.extend(files) if files else None
return vcf_files

@@ -85,7 +85,7 @@ def _get_valid_vcf_files_by_analysis(self):
valid_analysis_dict = self.eload_cfg.query('validation', 'valid', 'analyses')
if valid_analysis_dict:
for analysis_alias in valid_analysis_dict:
vcf_files[analysis_alias] = valid_analysis_dict[analysis_alias]['vcf_files']
vcf_files[self._unique_analysis_alias(analysis_alias)] = valid_analysis_dict[analysis_alias]['vcf_files']
return vcf_files

def _validate_metadata_format(self):
@@ -102,8 +102,8 @@ def _validate_sample_names(self):
)
for analysis_alias in results_per_analysis_alias:
has_difference, diff_submitted_file_submission, diff_submission_submitted_file = results_per_analysis_alias[analysis_alias]

self.eload_cfg.set('validation', 'sample_check', 'analysis', str(analysis_alias), value={
analysis_alias = self._unique_analysis_alias(analysis_alias)
self.eload_cfg.set('validation', 'sample_check', 'analysis', analysis_alias, value={
'difference_exists': has_difference,
'in_VCF_not_in_metadata': diff_submitted_file_submission,
'in_metadata_not_in_VCF': diff_submission_submitted_file
@@ -117,6 +117,7 @@ def _validate_genotype_aggregation(self):
detect_vcf_aggregation(vcf_file)
for vcf_file in self.eload_cfg.query('submission', 'analyses', analysis_alias, 'vcf_files')
]
analysis_alias = self._unique_analysis_alias(analysis_alias)
if len(set(aggregations)) == 1 and None not in aggregations:
aggregation = set(aggregations).pop()
self.eload_cfg.set('validation', 'aggregation_check', 'analyses', str(analysis_alias), value=aggregation)
@@ -139,6 +140,7 @@ def detect_and_optionally_merge(self, merge_per_analysis):
for analysis_alias, vcf_files in vcfs_by_analysis.items():
if len(vcf_files) < 2:
continue
analysis_alias = self._unique_analysis_alias(analysis_alias)
merge_type = detect_merge_type(vcf_files)
if merge_type:
self.eload_cfg.set('validation', 'merge_type', analysis_alias, value=merge_type.value)
@@ -468,6 +470,7 @@ def _sample_check_report(self):
reports = []
for analysis_alias in self.eload_cfg.query('validation', 'sample_check', 'analysis', ret_default=[]):
results = self.eload_cfg.query('validation', 'sample_check', 'analysis', analysis_alias)
analysis_alias = self._unique_analysis_alias(analysis_alias)
report_data = {
'analysis_alias': analysis_alias,
'pass': 'FAIL' if results.get('difference_exists') else 'PASS',
@@ -486,6 +489,7 @@ def _vcf_merge_report(self):
return ' No mergeable VCFs\n'
reports = [' Merge types:']
for analysis_alias, merge_type in analysis_merge_dict.items():
analysis_alias = self._unique_analysis_alias(analysis_alias)
reports.append(f' * {analysis_alias}: {merge_type}')

errors = self.eload_cfg.query('validation', 'merge_errors')
@@ -500,6 +504,7 @@ def _aggregation_report(self):
reports = []
if aggregation_dict:
for analysis_alias, aggregation in aggregation_dict.get('analyses', {}).items():
analysis_alias = self._unique_analysis_alias(analysis_alias)
reports.append(f" * {analysis_alias}: {aggregation}")
reports.append(" * Errors:")
for error in aggregation_dict.get('errors', []):
4 changes: 2 additions & 2 deletions eva_submission/submission_qc_checks.py
@@ -152,7 +152,7 @@ def check_all_browsable_files_are_available_in_ftp(self, vcf_files):
no_ext_file, _ = os.path.splitext(file)
if file not in files_in_ftp:
missing_files.append(file)
if f'{file}.csi' not in files_in_ftp or f'{no_ext_file}.csi' not in files_in_ftp:
if f'{file}.csi' not in files_in_ftp and f'{no_ext_file}.csi' not in files_in_ftp:
missing_files.append(f'{file}.csi or {no_ext_file}.csi')

# accessioned files will not be present for human taxonomy
@@ -161,7 +161,7 @@ def check_all_browsable_files_are_available_in_ftp(self, vcf_files):
no_ext_accessioned_file, _ = os.path.splitext(accessioned_file)
if accessioned_file not in files_in_ftp:
missing_files.append(accessioned_file)
if f'{accessioned_file}.csi' not in files_in_ftp or f'{no_ext_accessioned_file}.csi' not in files_in_ftp:
if f'{accessioned_file}.csi' not in files_in_ftp and f'{no_ext_accessioned_file}.csi' not in files_in_ftp:
missing_files.append(f'{accessioned_file}.csi or {no_ext_accessioned_file}.csi')

self._ftp_check_result = "PASS" if not missing_files else "FAIL"
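The `or` to `and` switch matters because an index file can be present under either naming convention. A small check with a made-up FTP listing:

```python
import os

# Hypothetical FTP listing: the index exists under the 'no extension' convention.
files_in_ftp = {'study1.vcf.gz', 'study1.vcf.csi'}
file = 'study1.vcf.gz'
no_ext_file, _ = os.path.splitext(file)   # 'study1.vcf'

missing_files = []
# With 'or' this flagged the index as missing whenever either name was absent;
# with 'and' it is only flagged when neither naming convention is found.
if f'{file}.csi' not in files_in_ftp and f'{no_ext_file}.csi' not in files_in_ftp:
    missing_files.append(f'{file}.csi or {no_ext_file}.csi')

print(missing_files)  # [] -- the .csi index is correctly treated as present
```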
4 changes: 2 additions & 2 deletions tests/test_eload_preparation.py
@@ -84,8 +84,8 @@ def test_detect_metadata_attributes(self):
assert self.eload.eload_cfg.query('submission', 'project_title') == 'Greatest project ever'
assert self.eload.eload_cfg.query('submission', 'taxonomy_id') == 9606
assert self.eload.eload_cfg.query('submission', 'scientific_name') == 'Homo sapiens'
assert self.eload.eload_cfg.query('submission', 'analyses', 'GAE', 'assembly_accession') == 'GCA_000001405.1'
vcf_files = self.eload.eload_cfg.query('submission', 'analyses', 'GAE', 'vcf_files')
assert self.eload.eload_cfg.query('submission', 'analyses', 'ELOAD_1_GAE', 'assembly_accession') == 'GCA_000001405.1'
vcf_files = self.eload.eload_cfg.query('submission', 'analyses', 'ELOAD_1_GAE', 'vcf_files')
assert len(vcf_files) == 1
assert '10_submitted/vcf_files/T100.vcf.gz' in vcf_files[0]

10 changes: 5 additions & 5 deletions tests/test_eload_validation.py
@@ -121,21 +121,21 @@ def test_report(self):
----------------------------------
Sample names check:
* a1: PASS
* ELOAD_2_a1: PASS
- Samples that appear in the VCF but not in the Metadata sheet:
- Samples that appear in the Metadata sheet but not in the VCF file(s):
----------------------------------
Aggregation:
* a1: none
* ELOAD_2_a1: none
* Errors:
----------------------------------
VCF merge:
Merge types:
* a1: horizontal
* ELOAD_2_a1: horizontal
----------------------------------
@@ -151,7 +151,7 @@ def test_report(self):

def test_detect_and_optionally_merge(self):
original_content = deepcopy(self.validation.eload_cfg.content)
analysis_alias = 'alias'
analysis_alias = 'ELOAD_2_alias'
valid_files = ['file1', 'file2']
merged_files = {analysis_alias: 'merged.vcf.gz'}
self.validation.eload_cfg.set('validation', 'valid', 'analyses', analysis_alias, 'vcf_files', value=valid_files)
@@ -232,6 +232,6 @@ def test_mark_valid_files_and_metadata(self):
assert self.validation.eload_cfg.query('validation', 'valid') is None
self.validation.mark_valid_files_and_metadata(merge_per_analysis=False)
# Check that the normalised file was picked up instead of the original file
expected = {'analyses': {'analysis_alias': {'vcf_files': ['test.vcf']}},
expected = {'analyses': {'ELOAD_2_analysis_alias': {'vcf_files': ['test.vcf']}},
'metadata_spreadsheet': '/path/to/the/spreadsheet'}
assert self.validation.eload_cfg.query('validation', 'valid') == expected