From d82bc5cb398d9c76efaf447a4fa8c54e869447ed Mon Sep 17 00:00:00 2001 From: Victoria Carr Date: Wed, 13 Jul 2022 14:17:35 +0100 Subject: [PATCH 1/6] Add status columns to summary --- bin/collate_qc_data.py | 27 ++++++++++++++++---------- tests/collate_qc_data_test.py | 36 +++++++++++++---------------------- 2 files changed, 30 insertions(+), 33 deletions(-) diff --git a/bin/collate_qc_data.py b/bin/collate_qc_data.py index cf9dd3f..c80225b 100755 --- a/bin/collate_qc_data.py +++ b/bin/collate_qc_data.py @@ -6,12 +6,18 @@ from collections import defaultdict -def write_summary_qc_report(summary_qc, output_prefix): +def write_summary_qc_report(summary_qc, complete_report, output_prefix): - with open(f'{output_prefix}_summary.tab', 'w') as out: - out.write('lane_id\tstatus\n') - for lane_id, status in summary_qc.items(): - out.write(f'{lane_id}\t{status}\n') + summary_report = pd.DataFrame() + summary_report['lane_id'] = summary_qc.keys() + summary_report['status'] = summary_qc.values() + + status_columns = ['lane_id'] + status_columns.extend([column for column in complete_report.columns if 'status' in column]) + + summary_report = summary_report.merge(complete_report[status_columns], how = 'inner', on='lane_id') + + summary_report.to_csv(f'{output_prefix}_summary.txt', sep = '\t', index = False) def get_summary_qc(all_reports): @@ -35,7 +41,7 @@ def get_summary_qc(all_reports): return summary_qc -def write_complete_qc_report(all_reports, output_prefix): +def get_complete_qc_report(all_reports): df = pd.DataFrame() @@ -45,8 +51,8 @@ def write_complete_qc_report(all_reports, output_prefix): else: tmp_df = pd.read_csv(report, sep = '\t') df = df.merge(tmp_df, how = 'inner', on='lane_id') - - df.to_csv(f'{output_prefix}_complete.tab', sep = '\t', index = False) + + return df def get_arguments(): @@ -61,13 +67,14 @@ def get_arguments(): def main(args): # Write complete report - write_complete_qc_report(args.qc_reports, args.output_prefix) + complete_report = get_complete_qc_report(args.qc_reports) + complete_report.to_csv(f'{args.output_prefix}_complete.txt', sep = '\t', index = False) # Get summary QC summary_qc = get_summary_qc(args.qc_reports) # Write summary QC - write_summary_qc_report(summary_qc, args.output_prefix) + write_summary_qc_report(summary_qc, complete_report, args.output_prefix) if __name__ == '__main__': diff --git a/tests/collate_qc_data_test.py b/tests/collate_qc_data_test.py index ac99be8..27d4bf7 100644 --- a/tests/collate_qc_data_test.py +++ b/tests/collate_qc_data_test.py @@ -14,15 +14,9 @@ class CollateQCData(TestCase): TEST_OUTPUT_PREFIX2 = f'{TEST_DATA_DIR}/qc_report2' - def test_write_complete_qc_report(self): + def test_get_complete_qc_report(self): - write_complete_qc_report([self.TEST_REL_ABND, self.TEST_CONTIG_NO], self.TEST_OUTPUT_PREFIX) - - file = open(f'{self.TEST_OUTPUT_PREFIX}_complete.tab', "r") - actual = "".join(file.readlines()) - os.remove(f'{self.TEST_OUTPUT_PREFIX}_complete.tab') - - self.assertEqual(actual, """lane_id\trel_abundance\trel_abundance_status\tcontig_no\tcontig_no_status\ntest_lane1\t92.38\tPASS\t1\tPASS\ntest_lane2\t2.38\tFAIL\t500\tFAIL\ntest_lane3\t70.0\tFAIL\t3\tPASS\ntest_lane4\t\t\t501\tFAIL\n""") + get_complete_qc_report([self.TEST_REL_ABND, self.TEST_CONTIG_NO]) def test_get_summary_qc(self): @@ -37,23 +31,19 @@ def test_write_summary_qc_report(self): summary_qc['test_lane3'] = 'FAIL' summary_qc['test_lane4'] = '' - write_summary_qc_report(summary_qc, self.TEST_OUTPUT_PREFIX) - - file = open(f'{self.TEST_OUTPUT_PREFIX}_summary.tab', "r") - actual = "".join(file.readlines()) - os.remove(f'{self.TEST_OUTPUT_PREFIX}_summary.tab') + complete_report = get_complete_qc_report([self.TEST_REL_ABND, self.TEST_CONTIG_NO]) - self.assertEqual(actual, """lane_id\tstatus\ntest_lane1\tPASS\ntest_lane2\tFAIL\ntest_lane3\tFAIL\ntest_lane4\t\n""") + write_summary_qc_report(summary_qc, complete_report, self.TEST_OUTPUT_PREFIX) def test_arguments(self): actual = get_arguments().parse_args( - ['--qc_reports', 'report1', 'report2', '--output_prefix', 'output_tab_file']) - self.assertEqual(actual, argparse.Namespace(qc_reports=['report1', 'report2'], output_prefix='output_tab_file')) + ['--qc_reports', 'report1', 'report2', '--output_prefix', 'output_txt_file']) + self.assertEqual(actual, argparse.Namespace(qc_reports=['report1', 'report2'], output_prefix='output_txt_file')) def test_arguments_short_options(self): actual = get_arguments().parse_args( - ['-i', 'report1', 'report2', '-o', 'output_tab_file']) - self.assertEqual(actual, argparse.Namespace(qc_reports=['report1', 'report2'], output_prefix='output_tab_file')) + ['-i', 'report1', 'report2', '-o', 'output_txt_file']) + self.assertEqual(actual, argparse.Namespace(qc_reports=['report1', 'report2'], output_prefix='output_txt_file')) def test_main(self): args = get_arguments().parse_args( @@ -61,14 +51,14 @@ def test_main(self): main(args) - file = open(f'{self.TEST_OUTPUT_PREFIX2}_summary.tab', "r") + file = open(f'{self.TEST_OUTPUT_PREFIX2}_summary.txt', "r") actual = "".join(file.readlines()) - os.remove(f'{self.TEST_OUTPUT_PREFIX2}_summary.tab') + os.remove(f'{self.TEST_OUTPUT_PREFIX2}_summary.txt') - self.assertEqual(actual, """lane_id\tstatus\ntest_lane1\tPASS\ntest_lane2\tFAIL\ntest_lane3\tFAIL\ntest_lane4\t\n""") + self.assertEqual(actual, """lane_id\tstatus\trel_abundance_status\tcontig_no_status\ntest_lane1\tPASS\tPASS\tPASS\ntest_lane2\tFAIL\tFAIL\tFAIL\ntest_lane3\tFAIL\tFAIL\tPASS\ntest_lane4\t\t\tFAIL\n""") - file = open(f'{self.TEST_OUTPUT_PREFIX2}_complete.tab', "r") + file = open(f'{self.TEST_OUTPUT_PREFIX2}_complete.txt', "r") actual = "".join(file.readlines()) - os.remove(f'{self.TEST_OUTPUT_PREFIX2}_complete.tab') + os.remove(f'{self.TEST_OUTPUT_PREFIX2}_complete.txt') self.assertEqual(actual, """lane_id\trel_abundance\trel_abundance_status\tcontig_no\tcontig_no_status\ntest_lane1\t92.38\tPASS\t1\tPASS\ntest_lane2\t2.38\tFAIL\t500\tFAIL\ntest_lane3\t70.0\tFAIL\t3\tPASS\ntest_lane4\t\t\t501\tFAIL\n""") From 650506c0a6c8949ebe4fd563cebe6959ab494024 Mon Sep 17 00:00:00 2001 From: Victoria Carr Date: Wed, 13 Jul 2022 14:29:33 +0100 Subject: [PATCH 2/6] Add version column --- bin/collate_qc_data.py | 16 +++++++++++++--- main.nf | 7 ++++++- modules/collate_qc_data.nf | 2 ++ modules/version.nf | 20 ++++++++++++++++++++ tests/collate_qc_data_test.py | 26 +++++++++++++++++--------- tests/test_data/qc_report_summary.tab | 5 +++++ tests/test_data/qc_report_summary.txt | 5 +++++ tests/test_data/version.txt | 2 ++ 8 files changed, 70 insertions(+), 13 deletions(-) create mode 100644 modules/version.nf create mode 100644 tests/test_data/qc_report_summary.tab create mode 100644 tests/test_data/qc_report_summary.txt create mode 100644 tests/test_data/version.txt diff --git a/bin/collate_qc_data.py b/bin/collate_qc_data.py index c80225b..8cbc9af 100755 --- a/bin/collate_qc_data.py +++ b/bin/collate_qc_data.py @@ -13,7 +13,7 @@ def write_summary_qc_report(summary_qc, complete_report, output_prefix): summary_report['status'] = summary_qc.values() status_columns = ['lane_id'] - status_columns.extend([column for column in complete_report.columns if 'status' in column]) + status_columns.extend([column for column in complete_report.columns if 'status' in column or 'version' in column]) summary_report = summary_report.merge(complete_report[status_columns], how = 'inner', on='lane_id') @@ -41,7 +41,7 @@ def get_summary_qc(all_reports): return summary_qc -def get_complete_qc_report(all_reports): +def get_complete_qc_report(all_reports, version_file): df = pd.DataFrame() @@ -52,6 +52,14 @@ def get_complete_qc_report(all_reports): tmp_df = pd.read_csv(report, sep = '\t') df = df.merge(tmp_df, how = 'inner', on='lane_id') + version = '' + with open(version_file, 'r') as file: + next(file) + for line in file: + version = line.replace("\n", "") + + df['version'] = version + return df @@ -59,6 +67,8 @@ def get_arguments(): parser = argparse.ArgumentParser(description='Collate QC data to output complete and summary reports.') parser.add_argument('-i', '--qc_reports', required=True, nargs='*', help='All QC reports.') + parser.add_argument('-v', '--version', dest='version', required=True, + help='Input file with version of pipeline.') parser.add_argument('-o', '--output_prefix', required=True, type=str, help='Output prefix of QC reports.') @@ -67,7 +77,7 @@ def get_arguments(): def main(args): # Write complete report - complete_report = get_complete_qc_report(args.qc_reports) + complete_report = get_complete_qc_report(args.qc_reports, args.version) complete_report.to_csv(f'{args.output_prefix}_complete.txt', sep = '\t', index = False) # Get summary QC diff --git a/main.nf b/main.nf index 8ea27f4..0abe45e 100755 --- a/main.nf +++ b/main.nf @@ -18,6 +18,7 @@ include {depth_of_coverage} from './modules/depth_of_coverage.nf' include {breadth_of_coverage} from './modules/breadth_of_coverage.nf' include {get_proportion_HET_SNPs} from './modules/get_proportion_HET_SNPs.nf' include {HET_SNPs} from './modules/HET_SNPs.nf' +include {get_version} from './modules/version.nf' // Workflow for reads QC workflow reads_qc { @@ -98,8 +99,12 @@ workflow { // Run assembly QC assemblies_qc(get_file_destinations.out, get_qc_stats_from_pf.out, get_proportion_HET_SNPs.out, headers_ch, lanes_ch) + // Get version of pipeline + get_version() + version_ch = get_version.out + // Collate QC reports - collate_qc_data(reads_qc.out.qc_report, assemblies_qc.out.qc_report) + collate_qc_data(reads_qc.out.qc_report, assemblies_qc.out.qc_report, version_ch) collate_qc_data.out.complete .subscribe { it -> it.copyTo(results_dir) } diff --git a/modules/collate_qc_data.nf b/modules/collate_qc_data.nf index c7bc37f..5713233 100644 --- a/modules/collate_qc_data.nf +++ b/modules/collate_qc_data.nf @@ -3,6 +3,7 @@ process collate_qc_data { input: path read_qc_report tuple file(number_of_contigs), file(contig_gc_content), file(genome_length), file(depth_of_coverage), file(breadth_of_coverage), file(het_snps) + path version output: path("qc_report_complete.tab"), emit: complete @@ -16,6 +17,7 @@ process collate_qc_data { collate_qc_data.py \ --qc_reports ${read_qc_report} ${number_of_contigs} ${contig_gc_content} ${genome_length} ${depth_of_coverage} ${breadth_of_coverage} ${het_snps} \ + --version ${version} \ --output_prefix "qc_report" """ } diff --git a/modules/version.nf b/modules/version.nf new file mode 100644 index 0000000..d1036aa --- /dev/null +++ b/modules/version.nf @@ -0,0 +1,20 @@ +process get_version { + input: + + output: + path("${output}") + + script: + output="version.txt" + version=params.version + """ + echo "version" > ${output} + + if [ -z ${version} ]; + then + echo \$(git -C $baseDir describe --tags) >> ${output} + else + echo ${version} >> ${output} + fi + """ +} diff --git a/tests/collate_qc_data_test.py b/tests/collate_qc_data_test.py index 27d4bf7..0650b6f 100644 --- a/tests/collate_qc_data_test.py +++ b/tests/collate_qc_data_test.py @@ -4,6 +4,7 @@ from collections import defaultdict from bin.collate_qc_data import * +import numpy as np class CollateQCData(TestCase): @@ -12,11 +13,18 @@ class CollateQCData(TestCase): TEST_CONTIG_NO = f'{TEST_DATA_DIR}/test_contig_number.tab' TEST_OUTPUT_PREFIX = f'{TEST_DATA_DIR}/qc_report' TEST_OUTPUT_PREFIX2 = f'{TEST_DATA_DIR}/qc_report2' + TEST_VERSION = f'{TEST_DATA_DIR}/version.txt' def test_get_complete_qc_report(self): - get_complete_qc_report([self.TEST_REL_ABND, self.TEST_CONTIG_NO]) + actual = get_complete_qc_report([self.TEST_REL_ABND, self.TEST_CONTIG_NO], self.TEST_VERSION) + + self.assertEqual(actual.to_dict()['lane_id'], { + 0: 'test_lane1', + 1: 'test_lane2', + 2: 'test_lane3', + 3: 'test_lane4'}) def test_get_summary_qc(self): @@ -31,23 +39,23 @@ def test_write_summary_qc_report(self): summary_qc['test_lane3'] = 'FAIL' summary_qc['test_lane4'] = '' - complete_report = get_complete_qc_report([self.TEST_REL_ABND, self.TEST_CONTIG_NO]) + complete_report = get_complete_qc_report([self.TEST_REL_ABND, self.TEST_CONTIG_NO], self.TEST_VERSION) write_summary_qc_report(summary_qc, complete_report, self.TEST_OUTPUT_PREFIX) def test_arguments(self): actual = get_arguments().parse_args( - ['--qc_reports', 'report1', 'report2', '--output_prefix', 'output_txt_file']) - self.assertEqual(actual, argparse.Namespace(qc_reports=['report1', 'report2'], output_prefix='output_txt_file')) + ['--qc_reports', 'report1', 'report2', '--version', 'version', '--output_prefix', 'output_txt_file']) + self.assertEqual(actual, argparse.Namespace(qc_reports=['report1', 'report2'], version='version', output_prefix='output_txt_file')) def test_arguments_short_options(self): actual = get_arguments().parse_args( - ['-i', 'report1', 'report2', '-o', 'output_txt_file']) - self.assertEqual(actual, argparse.Namespace(qc_reports=['report1', 'report2'], output_prefix='output_txt_file')) + ['-i', 'report1', 'report2', '-v', 'version', '-o', 'output_txt_file']) + self.assertEqual(actual, argparse.Namespace(qc_reports=['report1', 'report2'], version='version', output_prefix='output_txt_file')) def test_main(self): args = get_arguments().parse_args( - ['--qc_reports', self.TEST_REL_ABND, self.TEST_CONTIG_NO, '--output_prefix', self.TEST_OUTPUT_PREFIX2]) + ['--qc_reports', self.TEST_REL_ABND, self.TEST_CONTIG_NO, '--version', self.TEST_VERSION, '--output_prefix', self.TEST_OUTPUT_PREFIX2]) main(args) @@ -55,10 +63,10 @@ def test_main(self): actual = "".join(file.readlines()) os.remove(f'{self.TEST_OUTPUT_PREFIX2}_summary.txt') - self.assertEqual(actual, """lane_id\tstatus\trel_abundance_status\tcontig_no_status\ntest_lane1\tPASS\tPASS\tPASS\ntest_lane2\tFAIL\tFAIL\tFAIL\ntest_lane3\tFAIL\tFAIL\tPASS\ntest_lane4\t\t\tFAIL\n""") + self.assertEqual(actual, """lane_id\tstatus\trel_abundance_status\tcontig_no_status\tversion\ntest_lane1\tPASS\tPASS\tPASS\tv0.0.0\ntest_lane2\tFAIL\tFAIL\tFAIL\tv0.0.0\ntest_lane3\tFAIL\tFAIL\tPASS\tv0.0.0\ntest_lane4\t\t\tFAIL\tv0.0.0\n""") file = open(f'{self.TEST_OUTPUT_PREFIX2}_complete.txt', "r") actual = "".join(file.readlines()) os.remove(f'{self.TEST_OUTPUT_PREFIX2}_complete.txt') - self.assertEqual(actual, """lane_id\trel_abundance\trel_abundance_status\tcontig_no\tcontig_no_status\ntest_lane1\t92.38\tPASS\t1\tPASS\ntest_lane2\t2.38\tFAIL\t500\tFAIL\ntest_lane3\t70.0\tFAIL\t3\tPASS\ntest_lane4\t\t\t501\tFAIL\n""") + self.assertEqual(actual, """lane_id\trel_abundance\trel_abundance_status\tcontig_no\tcontig_no_status\tversion\ntest_lane1\t92.38\tPASS\t1\tPASS\tv0.0.0\ntest_lane2\t2.38\tFAIL\t500\tFAIL\tv0.0.0\ntest_lane3\t70.0\tFAIL\t3\tPASS\tv0.0.0\ntest_lane4\t\t\t501\tFAIL\tv0.0.0\n""") diff --git a/tests/test_data/qc_report_summary.tab b/tests/test_data/qc_report_summary.tab new file mode 100644 index 0000000..f7e7511 --- /dev/null +++ b/tests/test_data/qc_report_summary.tab @@ -0,0 +1,5 @@ +lane_id status +test_lane1 PASS +test_lane2 FAIL +test_lane3 FAIL +test_lane4 diff --git a/tests/test_data/qc_report_summary.txt b/tests/test_data/qc_report_summary.txt new file mode 100644 index 0000000..f65926b --- /dev/null +++ b/tests/test_data/qc_report_summary.txt @@ -0,0 +1,5 @@ +lane_id status rel_abundance_status contig_no_status version +test_lane1 PASS PASS PASS v0.0.0 +test_lane2 FAIL FAIL FAIL v0.0.0 +test_lane3 FAIL FAIL PASS v0.0.0 +test_lane4 FAIL v0.0.0 diff --git a/tests/test_data/version.txt b/tests/test_data/version.txt new file mode 100644 index 0000000..d1d2905 --- /dev/null +++ b/tests/test_data/version.txt @@ -0,0 +1,2 @@ +version +v0.0.0 From a2f9dbcb93a6501a69aee4968c07cf2fb9e0fcc8 Mon Sep 17 00:00:00 2001 From: Victoria Carr Date: Wed, 13 Jul 2022 14:56:22 +0100 Subject: [PATCH 3/6] Add params.version --- nextflow.config | 1 + 1 file changed, 1 insertion(+) diff --git a/nextflow.config b/nextflow.config index 467ed09..bed5b17 100644 --- a/nextflow.config +++ b/nextflow.config @@ -26,6 +26,7 @@ params { cov_depth_threshold = 20 cov_breadth_threshold = 70 het_snps_threshold = 20 + version = "" } From 16faf60de74bd412e5c38461714314d06b3f363b Mon Sep 17 00:00:00 2001 From: Victoria Carr Date: Wed, 13 Jul 2022 15:42:31 +0100 Subject: [PATCH 4/6] Change .tab to .txt, edit README --- README.md | 6 +++--- modules/collate_qc_data.nf | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index d97e209..4c4792a 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ ## About -This pipeline runs QC for lanes of Group B Strep (GBS) sequences that are imported on farm5 and available on `pf`. The QC includes: +This pipeline provides QC information for lanes of Group B Strep (GBS) sequences that are imported on farm5 and QC-ed, assembled and mapped on `pf`. This pipeline gives: - Relative abundance of GBS reads from Kraken - Number of contigs - GC content @@ -58,10 +58,10 @@ Change: - `/path/to/gbs_qc_reports` to the directory location of the generated reports. (Default is the current directory) ## Output -You should get two tab-delimited output reports `qc_report_summary.tab` and `qc_report_complete.tab` in the `--qc_reports_directory` you specified. `qc_report_summary.tab` gives the `lane_id` and PASS/FAIL `status`. `qc_report_complete.tab` gives all the PASS/FAIL status for each QC. +You should get two tab-delimited output reports `qc_report_summary.txt` and `qc_report_complete.txt` in the `--qc_reports_directory` you specified. `qc_report_summary.txt` gives the `lane_id` and PASS/FAIL `status`. `qc_report_complete.txt` gives all the PASS/FAIL status for each QC. ### Missing Data -If there are empty values in `qc_report_summary.tab` then at least one QC workflow may have failed. You can look in the `qc_report_complete.tab` to find which one. +If there are empty values in `qc_report_summary.txt` then at least one QC workflow may have failed. You can look in the `qc_report_complete.txt` to find which one. If there are empty values for: - `rel_abundance` then these lanes may not have been imported/imported properly with a kraken report. diff --git a/modules/collate_qc_data.nf b/modules/collate_qc_data.nf index 5713233..aa3f5eb 100644 --- a/modules/collate_qc_data.nf +++ b/modules/collate_qc_data.nf @@ -6,8 +6,8 @@ process collate_qc_data { path version output: - path("qc_report_complete.tab"), emit: complete - path("qc_report_summary.tab"), emit: summary + path("qc_report_complete.txg"), emit: complete + path("qc_report_summary.txt"), emit: summary script: python_version = params.python_version From 07416c0fdf7a3a6f91d6dbfd594d23cb5f654f56 Mon Sep 17 00:00:00 2001 From: Victoria Carr Date: Wed, 13 Jul 2022 15:46:51 +0100 Subject: [PATCH 5/6] Typo --- modules/collate_qc_data.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/collate_qc_data.nf b/modules/collate_qc_data.nf index aa3f5eb..fe739de 100644 --- a/modules/collate_qc_data.nf +++ b/modules/collate_qc_data.nf @@ -6,7 +6,7 @@ process collate_qc_data { path version output: - path("qc_report_complete.txg"), emit: complete + path("qc_report_complete.txt"), emit: complete path("qc_report_summary.txt"), emit: summary script: From 89952d3434bde54b8b557319ab8accb937dd6656 Mon Sep 17 00:00:00 2001 From: Victoria Carr Date: Wed, 13 Jul 2022 15:57:25 +0100 Subject: [PATCH 6/6] Add version to README --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4c4792a..88a5659 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ +[![GitHub release (latest by date)](https://img.shields.io/github/v/release/sanger-bentley-group/GBS_QC_nf)](https://github.com/sanger-bentley-group/GBS_QC_nf/releases) + [![pytest check](https://github.com/sanger-bentley-group/GBS_QC_nf/workflows/pytests_check/badge.svg)](https://github.com/sanger-bentley-group/GBS_QC_nf/actions) - # GBS QC Nextflow Pipeline for farm5