Skip to content

Commit

Permalink
Ensure Project and Analysis alias are unique with the Webin account by prefixing the ELOAD number to the alias
Browse files Browse the repository at this point in the history
  • Loading branch information
tcezard committed Jul 5, 2023
1 parent 0e4b405 commit 87277c6
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 23 deletions.
10 changes: 5 additions & 5 deletions eva_submission/eload_preparation.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def check_submitted_filenames(self):
f'{", ".join(set(submitted_vcfs).difference(set(spreadsheet_vcfs)))}')
analysis_alias = ''
if len(eva_xls_reader.analysis) == 1:
analysis_alias = eva_xls_reader.analysis[0].get('Analysis Alias') or ''
analysis_alias = self._unique_analysis_alias(eva_xls_reader.analysis[0].get('Analysis Alias')) or ''
elif len(eva_xls_reader.analysis) > 1:
self.error("Multiple analyses found, can't add submitted VCF to spreadsheet")
raise ValueError("Multiple analyses found, can't add submitted VCF to spreadsheet")
Expand All @@ -123,26 +123,26 @@ def detect_metadata_attributes(self):
analysis_reference = {}
for analysis in eva_metadata.analysis:
reference_txt = analysis.get('Reference')
analysis_alias = self._unique_analysis_alias(analysis.get('Analysis Alias'))
assembly_accessions = resolve_accession_from_text(reference_txt) if reference_txt else None
if not assembly_accessions:
assembly_accession = None
elif len(assembly_accessions) == 1:
assembly_accession = assembly_accessions[0]
else:
self.warning(f"Multiple assemblies found for {analysis.get('Analysis Alias')}: {', '.join(assembly_accessions)} ")
self.warning(f"Multiple assemblies found for {analysis_alias}: {', '.join(assembly_accessions)} ")
assembly_accession = sorted(assembly_accessions)[-1]
self.warning(f"Will use the most recent assembly: {assembly_accession}")

if assembly_accession:
analysis_reference[analysis.get('Analysis Alias')] = {'assembly_accession': assembly_accession,
'vcf_files': []}
analysis_reference[analysis_alias] = {'assembly_accession': assembly_accession, 'vcf_files': []}
else:
self.error(f"Reference is missing for Analysis {analysis.get('Analysis Alias')}")

for file in eva_metadata.files:
if file.get("File Type") == 'vcf':
file_full = os.path.join(self.eload_dir, directory_structure['vcf'], file.get("File Name"))
analysis_alias = file.get("Analysis Alias")
analysis_alias = self._unique_analysis_alias(file.get("Analysis Alias"))
analysis_reference[analysis_alias]['vcf_files'].append(file_full)
self.eload_cfg.set('submission', 'analyses', value=analysis_reference)

Expand Down
13 changes: 10 additions & 3 deletions eva_submission/eload_submission.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,11 @@ def _get_dir(self, key):
def now(self):
return datetime.now()

def _unique_analysis_alias(self, analysis_alias):
    """Return ``analysis_alias`` prefixed with this ELOAD's identifier.

    The prefix is ``self.eload`` followed by an underscore. An alias that
    already starts with the ELOAD identifier is returned unchanged, so
    applying this method repeatedly yields the same result.

    :param analysis_alias: alias read from the metadata; may be ``None`` or
        empty when the spreadsheet does not provide one.
    :return: the prefixed alias, or the input unchanged when it is ``None``
        or an empty string.
    """
    # Callers use `self._unique_analysis_alias(...) or <fallback>`, so a
    # missing alias must stay falsy rather than raise AttributeError (None)
    # or turn into the truthy string '<eload>_' (empty string).
    if analysis_alias is None or analysis_alias == '':
        return analysis_alias
    # Normalise both sides to str so startswith() cannot fail on a non-string
    # ELOAD identifier or a non-string spreadsheet cell value.
    prefix = str(self.eload)
    alias = str(analysis_alias)
    if alias.startswith(prefix):
        return alias
    return f'{prefix}_{alias}'

def create_log_file(self):
logfile_name = os.path.join(self.eload_dir, str(self.eload) + "_submission.log")
if logfile_name not in eload_logging_files:
Expand All @@ -100,14 +105,14 @@ def update_metadata_spreadsheet(self, input_spreadsheet, output_spreadsheet=None
reader = EvaXlsxReader(input_spreadsheet)
single_analysis_alias = None
if len(reader.analysis) == 1:
single_analysis_alias = reader.analysis[0].get('Analysis Alias')
single_analysis_alias = self._unique_analysis_alias(reader.analysis[0].get('Analysis Alias'))

sample_rows = []
for sample_row in reader.samples:
if self.eload_cfg.query('brokering', 'Biosamples', 'Samples', sample_row.get('Sample Name')):
sample_rows.append({
'row_num': sample_row.get('row_num'),
'Analysis Alias': sample_row.get('Analysis Alias') or single_analysis_alias,
'Analysis Alias': self._unique_analysis_alias(sample_row.get('Analysis Alias')) or single_analysis_alias,
'Sample ID': sample_row.get('Sample Name'),
'Sample Accession': self.eload_cfg['brokering']['Biosamples']['Samples'][sample_row.get('Sample Name')]
})
Expand Down Expand Up @@ -138,7 +143,9 @@ def update_metadata_spreadsheet(self, input_spreadsheet, output_spreadsheet=None
project_row = reader.project
if existing_project:
project_row['Project Alias'] = existing_project

elif self.eload not in project_row['Project Alias']:
# Add the eload id to ensure that the project alias is unique
project_row['Project Alias'] = self.eload + '_' + project_row['Project Alias']
if output_spreadsheet:
eva_xls_writer = EvaXlsxWriter(input_spreadsheet, output_spreadsheet)
else:
Expand Down
23 changes: 15 additions & 8 deletions eva_submission/eload_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,12 @@ def mark_valid_files_and_metadata(self, merge_per_analysis):
self.eload_cfg.query('validation', validation_task, 'forced', ret_default=False)
for validation_task in self.all_validation_tasks
]):
self.eload_cfg.set('validation', 'valid', 'analyses',
value=copy.copy(self.eload_cfg.query('submission', 'analyses')))

for analysis_alias in self.eload_cfg.query('submission', 'analyses'):
u_analysis_alias = self._unique_analysis_alias(analysis_alias)
self.eload_cfg.set('validation', 'valid', 'analyses', u_analysis_alias,
value=self.eload_cfg.query('submission', 'analyses', analysis_alias))
self.eload_cfg.set(
'validation', 'valid', 'analyses', analysis_alias, 'vcf_files',
'validation', 'valid', 'analyses', u_analysis_alias, 'vcf_files',
value=self.eload_cfg.query('submission', 'analyses', analysis_alias, 'vcf_files')
)
self.eload_cfg.set('validation', 'valid', 'metadata_spreadsheet',
Expand All @@ -76,7 +76,7 @@ def mark_valid_files_and_metadata(self, merge_per_analysis):
def _get_vcf_files(self):
vcf_files = []
for analysis_alias in self.eload_cfg.query('submission', 'analyses'):
files = self.eload_cfg.query('submission', 'analyses', analysis_alias, 'vcf_files')
files = self.eload_cfg.query('submission', 'analyses', self._unique_analysis_alias(analysis_alias), 'vcf_files')
vcf_files.extend(files) if files else None
return vcf_files

Expand All @@ -85,7 +85,8 @@ def _get_valid_vcf_files_by_analysis(self):
valid_analysis_dict = self.eload_cfg.query('validation', 'valid', 'analyses')
if valid_analysis_dict:
for analysis_alias in valid_analysis_dict:
vcf_files[analysis_alias] = valid_analysis_dict[analysis_alias]['vcf_files']
print(analysis_alias)
vcf_files[self._unique_analysis_alias(analysis_alias)] = valid_analysis_dict[analysis_alias]['vcf_files']
return vcf_files

def _validate_metadata_format(self):
Expand All @@ -102,8 +103,8 @@ def _validate_sample_names(self):
)
for analysis_alias in results_per_analysis_alias:
has_difference, diff_submitted_file_submission, diff_submission_submitted_file = results_per_analysis_alias[analysis_alias]

self.eload_cfg.set('validation', 'sample_check', 'analysis', str(analysis_alias), value={
analysis_alias = self._unique_analysis_alias(analysis_alias)
self.eload_cfg.set('validation', 'sample_check', 'analysis', analysis_alias, value={
'difference_exists': has_difference,
'in_VCF_not_in_metadata': diff_submitted_file_submission,
'in_metadata_not_in_VCF': diff_submission_submitted_file
Expand All @@ -117,6 +118,7 @@ def _validate_genotype_aggregation(self):
detect_vcf_aggregation(vcf_file)
for vcf_file in self.eload_cfg.query('submission', 'analyses', analysis_alias, 'vcf_files')
]
analysis_alias = self._unique_analysis_alias(analysis_alias)
if len(set(aggregations)) == 1 and None not in aggregations:
aggregation = set(aggregations).pop()
self.eload_cfg.set('validation', 'aggregation_check', 'analyses', str(analysis_alias), value=aggregation)
Expand All @@ -137,8 +139,10 @@ def detect_and_optionally_merge(self, merge_per_analysis):
vcfs_to_horizontal_merge = {}
vcfs_to_vertical_concat = {}
for analysis_alias, vcf_files in vcfs_by_analysis.items():
print(analysis_alias)
if len(vcf_files) < 2:
continue
analysis_alias = self._unique_analysis_alias(analysis_alias)
merge_type = detect_merge_type(vcf_files)
if merge_type:
self.eload_cfg.set('validation', 'merge_type', analysis_alias, value=merge_type.value)
Expand Down Expand Up @@ -468,6 +472,7 @@ def _sample_check_report(self):
reports = []
for analysis_alias in self.eload_cfg.query('validation', 'sample_check', 'analysis', ret_default=[]):
results = self.eload_cfg.query('validation', 'sample_check', 'analysis', analysis_alias)
analysis_alias = self._unique_analysis_alias(analysis_alias)
report_data = {
'analysis_alias': analysis_alias,
'pass': 'FAIL' if results.get('difference_exists') else 'PASS',
Expand All @@ -486,6 +491,7 @@ def _vcf_merge_report(self):
return ' No mergeable VCFs\n'
reports = [' Merge types:']
for analysis_alias, merge_type in analysis_merge_dict.items():
analysis_alias = self._unique_analysis_alias(analysis_alias)
reports.append(f' * {analysis_alias}: {merge_type}')

errors = self.eload_cfg.query('validation', 'merge_errors')
Expand All @@ -500,6 +506,7 @@ def _aggregation_report(self):
reports = []
if aggregation_dict:
for analysis_alias, aggregation in aggregation_dict.get('analyses', {}).items():
analysis_alias = self._unique_analysis_alias(analysis_alias)
reports.append(f" * {analysis_alias}: {aggregation}")
reports.append(" * Errors:")
for error in aggregation_dict.get('errors', []):
Expand Down
4 changes: 2 additions & 2 deletions tests/test_eload_preparation.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,8 @@ def test_detect_metadata_attributes(self):
assert self.eload.eload_cfg.query('submission', 'project_title') == 'Greatest project ever'
assert self.eload.eload_cfg.query('submission', 'taxonomy_id') == 9606
assert self.eload.eload_cfg.query('submission', 'scientific_name') == 'Homo sapiens'
assert self.eload.eload_cfg.query('submission', 'analyses', 'GAE', 'assembly_accession') == 'GCA_000001405.1'
vcf_files = self.eload.eload_cfg.query('submission', 'analyses', 'GAE', 'vcf_files')
assert self.eload.eload_cfg.query('submission', 'analyses', 'ELOAD_1_GAE', 'assembly_accession') == 'GCA_000001405.1'
vcf_files = self.eload.eload_cfg.query('submission', 'analyses', 'ELOAD_1_GAE', 'vcf_files')
assert len(vcf_files) == 1
assert '10_submitted/vcf_files/T100.vcf.gz' in vcf_files[0]

Expand Down
10 changes: 5 additions & 5 deletions tests/test_eload_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,21 +121,21 @@ def test_report(self):
----------------------------------
Sample names check:
* a1: PASS
* ELOAD_2_a1: PASS
- Samples that appear in the VCF but not in the Metadata sheet:
- Samples that appear in the Metadata sheet but not in the VCF file(s):
----------------------------------
Aggregation:
* a1: none
* ELOAD_2_a1: none
* Errors:
----------------------------------
VCF merge:
Merge types:
* a1: horizontal
* ELOAD_2_a1: horizontal
----------------------------------
Expand All @@ -151,7 +151,7 @@ def test_report(self):

def test_detect_and_optionally_merge(self):
original_content = deepcopy(self.validation.eload_cfg.content)
analysis_alias = 'alias'
analysis_alias = 'ELOAD_2_alias'
valid_files = ['file1', 'file2']
merged_files = {analysis_alias: 'merged.vcf.gz'}
self.validation.eload_cfg.set('validation', 'valid', 'analyses', analysis_alias, 'vcf_files', value=valid_files)
Expand Down Expand Up @@ -232,6 +232,6 @@ def test_mark_valid_files_and_metadata(self):
assert self.validation.eload_cfg.query('validation', 'valid') is None
self.validation.mark_valid_files_and_metadata(merge_per_analysis=False)
# Check that the normalised file was picked up instead of the original file
expected = {'analyses': {'analysis_alias': {'vcf_files': ['test.vcf']}},
expected = {'analyses': {'ELOAD_2_analysis_alias': {'vcf_files': ['test.vcf']}},
'metadata_spreadsheet': '/path/to/the/spreadsheet'}
assert self.validation.eload_cfg.query('validation', 'valid') == expected

0 comments on commit 87277c6

Please sign in to comment.