diff --git a/eva_submission/eload_preparation.py b/eva_submission/eload_preparation.py
index a2ce8b7..9ff4c01 100644
--- a/eva_submission/eload_preparation.py
+++ b/eva_submission/eload_preparation.py
@@ -103,7 +103,7 @@ def check_submitted_filenames(self):
                        f'{", ".join(set(submitted_vcfs).difference(set(spreadsheet_vcfs)))}')
         analysis_alias = ''
         if len(eva_xls_reader.analysis) == 1:
-            analysis_alias = eva_xls_reader.analysis[0].get('Analysis Alias') or ''
+            analysis_alias = self._unique_analysis_alias(eva_xls_reader.analysis[0].get('Analysis Alias')) or ''
         elif len(eva_xls_reader.analysis) > 1:
             self.error("Multiple analyses found, can't add submitted VCF to spreadsheet")
             raise ValueError("Multiple analyses found, can't add submitted VCF to spreadsheet")
@@ -123,26 +123,26 @@ def detect_metadata_attributes(self):
         analysis_reference = {}
         for analysis in eva_metadata.analysis:
             reference_txt = analysis.get('Reference')
+            analysis_alias = self._unique_analysis_alias(analysis.get('Analysis Alias'))
             assembly_accessions = resolve_accession_from_text(reference_txt) if reference_txt else None
             if not assembly_accessions:
                 assembly_accession = None
             elif len(assembly_accessions) == 1:
                 assembly_accession = assembly_accessions[0]
             else:
-                self.warning(f"Multiple assemblies found for {analysis.get('Analysis Alias')}: {', '.join(assembly_accessions)} ")
+                self.warning(f"Multiple assemblies found for {analysis_alias}: {', '.join(assembly_accessions)} ")
                 assembly_accession = sorted(assembly_accessions)[-1]
                 self.warning(f"Will use the most recent assembly: {assembly_accession}")
 
             if assembly_accession:
-                analysis_reference[analysis.get('Analysis Alias')] = {'assembly_accession': assembly_accession,
-                                                                      'vcf_files': []}
+                analysis_reference[analysis_alias] = {'assembly_accession': assembly_accession, 'vcf_files': []}
             else:
                 self.error(f"Reference is missing for Analysis {analysis.get('Analysis Alias')}")
 
         for file in eva_metadata.files:
             if file.get("File Type") == 'vcf':
                 file_full = os.path.join(self.eload_dir, directory_structure['vcf'], file.get("File Name"))
-                analysis_alias = file.get("Analysis Alias")
+                analysis_alias = self._unique_analysis_alias(file.get("Analysis Alias"))
                 analysis_reference[analysis_alias]['vcf_files'].append(file_full)
         self.eload_cfg.set('submission', 'analyses', value=analysis_reference)
diff --git a/eva_submission/eload_submission.py b/eva_submission/eload_submission.py
index 6c0e72c..41303b7 100755
--- a/eva_submission/eload_submission.py
+++ b/eva_submission/eload_submission.py
@@ -74,6 +74,11 @@ def _get_dir(self, key):
     def now(self):
         return datetime.now()
 
+    def _unique_analysis_alias(self, analysis_alias):
+        if not analysis_alias.startswith(self.eload):
+            return f'{self.eload}_{analysis_alias}'
+        return analysis_alias
+
     def create_log_file(self):
         logfile_name = os.path.join(self.eload_dir, str(self.eload) + "_submission.log")
         if logfile_name not in eload_logging_files:
@@ -100,14 +105,14 @@ def update_metadata_spreadsheet(self, input_spreadsheet, output_spreadsheet=None
         reader = EvaXlsxReader(input_spreadsheet)
         single_analysis_alias = None
         if len(reader.analysis) == 1:
-            single_analysis_alias = reader.analysis[0].get('Analysis Alias')
+            single_analysis_alias = self._unique_analysis_alias(reader.analysis[0].get('Analysis Alias'))
 
         sample_rows = []
         for sample_row in reader.samples:
             if self.eload_cfg.query('brokering', 'Biosamples', 'Samples', sample_row.get('Sample Name')):
                 sample_rows.append({
                     'row_num': sample_row.get('row_num'),
-                    'Analysis Alias': sample_row.get('Analysis Alias') or single_analysis_alias,
+                    'Analysis Alias': self._unique_analysis_alias(sample_row.get('Analysis Alias')) or single_analysis_alias,
                     'Sample ID': sample_row.get('Sample Name'),
                     'Sample Accession': self.eload_cfg['brokering']['Biosamples']['Samples'][sample_row.get('Sample Name')]
                 })
@@ -138,7 +143,9 @@ def update_metadata_spreadsheet(self, input_spreadsheet, output_spreadsheet=None
         project_row = reader.project
         if existing_project:
             project_row['Project Alias'] = existing_project
-
+        elif self.eload not in project_row['Project Alias']:
+            # Add the eload id to ensure that the project alias is unique
+            project_row['Project Alias'] = self.eload + '_' + project_row['Project Alias']
         if output_spreadsheet:
             eva_xls_writer = EvaXlsxWriter(input_spreadsheet, output_spreadsheet)
         else:
diff --git a/eva_submission/eload_validation.py b/eva_submission/eload_validation.py
index 0fcc1d2..aed3e3b 100755
--- a/eva_submission/eload_validation.py
+++ b/eva_submission/eload_validation.py
@@ -61,12 +61,12 @@ def mark_valid_files_and_metadata(self, merge_per_analysis):
             self.eload_cfg.query('validation', validation_task, 'forced', ret_default=False)
             for validation_task in self.all_validation_tasks
         ]):
-            self.eload_cfg.set('validation', 'valid', 'analyses',
-                               value=copy.copy(self.eload_cfg.query('submission', 'analyses')))
             for analysis_alias in self.eload_cfg.query('submission', 'analyses'):
+                u_analysis_alias = self._unique_analysis_alias(analysis_alias)
+                self.eload_cfg.set('validation', 'valid', 'analyses', u_analysis_alias,
+                                   value=self.eload_cfg.query('submission', 'analyses', analysis_alias))
                 self.eload_cfg.set(
-                    'validation', 'valid', 'analyses', analysis_alias, 'vcf_files',
+                    'validation', 'valid', 'analyses', u_analysis_alias, 'vcf_files',
                     value=self.eload_cfg.query('submission', 'analyses', analysis_alias, 'vcf_files')
                 )
             self.eload_cfg.set('validation', 'valid', 'metadata_spreadsheet',
@@ -76,7 +76,7 @@ def _get_vcf_files(self):
         vcf_files = []
         for analysis_alias in self.eload_cfg.query('submission', 'analyses'):
-            files = self.eload_cfg.query('submission', 'analyses', analysis_alias, 'vcf_files')
+            files = self.eload_cfg.query('submission', 'analyses', self._unique_analysis_alias(analysis_alias), 'vcf_files')
             vcf_files.extend(files) if files else None
         return vcf_files
 
@@ -85,7 +85,8 @@ def _get_valid_vcf_files_by_analysis(self):
         valid_analysis_dict = self.eload_cfg.query('validation', 'valid', 'analyses')
         if valid_analysis_dict:
             for analysis_alias in valid_analysis_dict:
-                vcf_files[analysis_alias] = valid_analysis_dict[analysis_alias]['vcf_files']
+                print(analysis_alias)
+                vcf_files[self._unique_analysis_alias(analysis_alias)] = valid_analysis_dict[analysis_alias]['vcf_files']
         return vcf_files
 
     def _validate_metadata_format(self):
@@ -102,8 +103,8 @@ def _validate_sample_names(self):
         )
         for analysis_alias in results_per_analysis_alias:
             has_difference, diff_submitted_file_submission, diff_submission_submitted_file = results_per_analysis_alias[analysis_alias]
-
-            self.eload_cfg.set('validation', 'sample_check', 'analysis', str(analysis_alias), value={
+            analysis_alias = self._unique_analysis_alias(analysis_alias)
+            self.eload_cfg.set('validation', 'sample_check', 'analysis', analysis_alias, value={
                 'difference_exists': has_difference,
                 'in_VCF_not_in_metadata': diff_submitted_file_submission,
                 'in_metadata_not_in_VCF': diff_submission_submitted_file
@@ -117,6 +118,7 @@ def _validate_genotype_aggregation(self):
                 detect_vcf_aggregation(vcf_file)
                 for vcf_file in self.eload_cfg.query('submission', 'analyses', analysis_alias, 'vcf_files')
             ]
+            analysis_alias = self._unique_analysis_alias(analysis_alias)
             if len(set(aggregations)) == 1 and None not in aggregations:
                 aggregation = set(aggregations).pop()
                 self.eload_cfg.set('validation', 'aggregation_check', 'analyses', str(analysis_alias), value=aggregation)
@@ -137,8 +139,10 @@ def detect_and_optionally_merge(self, merge_per_analysis):
         vcfs_to_horizontal_merge = {}
         vcfs_to_vertical_concat = {}
         for analysis_alias, vcf_files in vcfs_by_analysis.items():
+            print(analysis_alias)
             if len(vcf_files) < 2:
                 continue
+            analysis_alias = self._unique_analysis_alias(analysis_alias)
             merge_type = detect_merge_type(vcf_files)
             if merge_type:
                 self.eload_cfg.set('validation', 'merge_type', analysis_alias, value=merge_type.value)
@@ -468,6 +472,7 @@ def _sample_check_report(self):
         reports = []
         for analysis_alias in self.eload_cfg.query('validation', 'sample_check', 'analysis', ret_default=[]):
             results = self.eload_cfg.query('validation', 'sample_check', 'analysis', analysis_alias)
+            analysis_alias = self._unique_analysis_alias(analysis_alias)
             report_data = {
                 'analysis_alias': analysis_alias,
                 'pass': 'FAIL' if results.get('difference_exists') else 'PASS',
@@ -486,6 +491,7 @@ def _vcf_merge_report(self):
             return '  No mergeable VCFs\n'
         reports = ['  Merge types:']
         for analysis_alias, merge_type in analysis_merge_dict.items():
+            analysis_alias = self._unique_analysis_alias(analysis_alias)
             reports.append(f'  * {analysis_alias}: {merge_type}')
 
         errors = self.eload_cfg.query('validation', 'merge_errors')
@@ -500,6 +506,7 @@ def _aggregation_report(self):
         reports = []
         if aggregation_dict:
             for analysis_alias, aggregation in aggregation_dict.get('analyses', {}).items():
+                analysis_alias = self._unique_analysis_alias(analysis_alias)
                 reports.append(f"  * {analysis_alias}: {aggregation}")
             reports.append("  * Errors:")
             for error in aggregation_dict.get('errors', []):
diff --git a/tests/test_eload_preparation.py b/tests/test_eload_preparation.py
index 51ebc82..c1d32ea 100644
--- a/tests/test_eload_preparation.py
+++ b/tests/test_eload_preparation.py
@@ -84,8 +84,8 @@ def test_detect_metadata_attributes(self):
         assert self.eload.eload_cfg.query('submission', 'project_title') == 'Greatest project ever'
         assert self.eload.eload_cfg.query('submission', 'taxonomy_id') == 9606
         assert self.eload.eload_cfg.query('submission', 'scientific_name') == 'Homo sapiens'
-        assert self.eload.eload_cfg.query('submission', 'analyses', 'GAE', 'assembly_accession') == 'GCA_000001405.1'
-        vcf_files = self.eload.eload_cfg.query('submission', 'analyses', 'GAE', 'vcf_files')
+        assert self.eload.eload_cfg.query('submission', 'analyses', 'ELOAD_1_GAE', 'assembly_accession') == 'GCA_000001405.1'
+        vcf_files = self.eload.eload_cfg.query('submission', 'analyses', 'ELOAD_1_GAE', 'vcf_files')
         assert len(vcf_files) == 1
         assert '10_submitted/vcf_files/T100.vcf.gz' in vcf_files[0]
diff --git a/tests/test_eload_validation.py b/tests/test_eload_validation.py
index f9ee782..498c179 100644
--- a/tests/test_eload_validation.py
+++ b/tests/test_eload_validation.py
@@ -121,21 +121,21 @@ def test_report(self):
 ----------------------------------
 Sample names check:
-  * a1: PASS
+  * ELOAD_2_a1: PASS
     - Samples that appear in the VCF but not in the Metadata sheet:
     - Samples that appear in the Metadata sheet but not in the VCF file(s):
 
 ----------------------------------
 Aggregation:
-  * a1: none
+  * ELOAD_2_a1: none
   * Errors:
 
 ----------------------------------
 VCF merge:
   Merge types:
-  * a1: horizontal
+  * ELOAD_2_a1: horizontal
 
 ----------------------------------
@@ -151,7 +151,7 @@ def test_report(self):
 
     def test_detect_and_optionally_merge(self):
         original_content = deepcopy(self.validation.eload_cfg.content)
-        analysis_alias = 'alias'
+        analysis_alias = 'ELOAD_2_alias'
         valid_files = ['file1', 'file2']
         merged_files = {analysis_alias: 'merged.vcf.gz'}
         self.validation.eload_cfg.set('validation', 'valid', 'analyses', analysis_alias, 'vcf_files', value=valid_files)
@@ -232,6 +232,6 @@ def test_mark_valid_files_and_metadata(self):
         assert self.validation.eload_cfg.query('validation', 'valid') is None
         self.validation.mark_valid_files_and_metadata(merge_per_analysis=False)
         # Check that the normalised file was picked up instead of the original file
-        expected = {'analyses': {'analysis_alias': {'vcf_files': ['test.vcf']}},
+        expected = {'analyses': {'ELOAD_2_analysis_alias': {'vcf_files': ['test.vcf']}},
                     'metadata_spreadsheet': '/path/to/the/spreadsheet'}
         assert self.validation.eload_cfg.query('validation', 'valid') == expected
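
For reference, below is a small standalone sketch (not part of the patch) of the aliasing rule the changes above rely on. It assumes the eload identifier is a string such as 'ELOAD_2', as the updated tests suggest, and that analysis and project aliases are non-empty strings. The helper names are hypothetical free functions that mirror _unique_analysis_alias and the project-alias handling in update_metadata_spreadsheet so the sketch runs on its own.

# Standalone sketch of the alias-prefixing rule introduced by this patch.
# Assumes eload is a string such as 'ELOAD_2' and aliases are non-empty strings;
# in the patch these live as methods/logic on the Eload classes, not free functions.

def unique_analysis_alias(eload: str, analysis_alias: str) -> str:
    """Prefix an analysis alias with the ELOAD name unless it is already prefixed."""
    if not analysis_alias.startswith(eload):
        return f'{eload}_{analysis_alias}'
    return analysis_alias


def unique_project_alias(eload: str, project_alias: str) -> str:
    """Embed the ELOAD name in the project alias so it is unique across submissions."""
    if eload not in project_alias:
        return f'{eload}_{project_alias}'
    return project_alias


if __name__ == '__main__':
    # The prefix check makes the helper idempotent: re-applying it to keys that are
    # already stored in their unique form (as the validation code does) is a no-op.
    assert unique_analysis_alias('ELOAD_2', 'a1') == 'ELOAD_2_a1'
    assert unique_analysis_alias('ELOAD_2', 'ELOAD_2_a1') == 'ELOAD_2_a1'
    assert unique_project_alias('ELOAD_2', 'my_project') == 'ELOAD_2_my_project'
    assert unique_project_alias('ELOAD_2', 'ELOAD_2_my_project') == 'ELOAD_2_my_project'

Because the check is idempotent, config keys written with the unique form can be read back and passed through the helper again without changing, which is why the validation and reporting code can apply it uniformly.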