Skip to content

Commit

Permalink
Merge pull request #13 from sanger-bentley-group/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
blue-moon22 authored Jul 13, 2022
2 parents 484040e + 89952d3 commit 3fe70b0
Show file tree
Hide file tree
Showing 10 changed files with 97 additions and 41 deletions.
9 changes: 5 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
<!-- badges: start -->
[![GitHub release (latest by date)](https://img.shields.io/github/v/release/sanger-bentley-group/GBS_QC_nf)](https://github.com/sanger-bentley-group/GBS_QC_nf/releases)

[![pytest check](https://github.com/sanger-bentley-group/GBS_QC_nf/workflows/pytests_check/badge.svg)](https://github.com/sanger-bentley-group/GBS_QC_nf/actions)
<!-- [![DOI](https://zenodo.org/badge/137083307.svg)](https://zenodo.org/badge/latestdoi/137083307) -->
<!-- badges: end -->

# GBS QC Nextflow Pipeline for farm5

## About

This pipeline runs QC for lanes of Group B Strep (GBS) sequences that are imported on farm5 and available on `pf`. The QC includes:
This pipeline provides QC information for lanes of Group B Strep (GBS) sequences that are imported on farm5 and QC-ed, assembled and mapped on `pf`. This pipeline gives:
- Relative abundance of GBS reads from Kraken
- Number of contigs
- GC content
Expand Down Expand Up @@ -58,10 +59,10 @@ Change:
- `/path/to/gbs_qc_reports` to the directory location of the generated reports. (Default is the current directory)

## Output
You should get two tab-delimited output reports `qc_report_summary.tab` and `qc_report_complete.tab` in the `--qc_reports_directory` you specified. `qc_report_summary.tab` gives the `lane_id` and PASS/FAIL `status`. `qc_report_complete.tab` gives all the PASS/FAIL status for each QC.
You should get two tab-delimited output reports `qc_report_summary.txt` and `qc_report_complete.txt` in the `--qc_reports_directory` you specified. `qc_report_summary.txt` gives the `lane_id` and PASS/FAIL `status`. `qc_report_complete.txt` gives all the PASS/FAIL status for each QC.

### Missing Data
If there are empty values in `qc_report_summary.tab` then at least one QC workflow may have failed. You can look in the `qc_report_complete.tab` to find which one.
If there are empty values in `qc_report_summary.txt` then at least one QC workflow may have failed. You can look in the `qc_report_complete.txt` to find which one.

If there are empty values for:
- `rel_abundance` then these lanes may not have been imported/imported properly with a kraken report.
Expand Down
37 changes: 27 additions & 10 deletions bin/collate_qc_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,18 @@
from collections import defaultdict


def write_summary_qc_report(summary_qc, output_prefix):
def write_summary_qc_report(summary_qc, complete_report, output_prefix):

with open(f'{output_prefix}_summary.tab', 'w') as out:
out.write('lane_id\tstatus\n')
for lane_id, status in summary_qc.items():
out.write(f'{lane_id}\t{status}\n')
summary_report = pd.DataFrame()
summary_report['lane_id'] = summary_qc.keys()
summary_report['status'] = summary_qc.values()

status_columns = ['lane_id']
status_columns.extend([column for column in complete_report.columns if 'status' in column or 'version' in column])

summary_report = summary_report.merge(complete_report[status_columns], how = 'inner', on='lane_id')

summary_report.to_csv(f'{output_prefix}_summary.txt', sep = '\t', index = False)


def get_summary_qc(all_reports):
Expand All @@ -35,7 +41,7 @@ def get_summary_qc(all_reports):
return summary_qc


def write_complete_qc_report(all_reports, output_prefix):
def get_complete_qc_report(all_reports, version_file):

df = pd.DataFrame()

Expand All @@ -45,14 +51,24 @@ def write_complete_qc_report(all_reports, output_prefix):
else:
tmp_df = pd.read_csv(report, sep = '\t')
df = df.merge(tmp_df, how = 'inner', on='lane_id')

df.to_csv(f'{output_prefix}_complete.tab', sep = '\t', index = False)

version = ''
with open(version_file, 'r') as file:
next(file)
for line in file:
version = line.replace("\n", "")

df['version'] = version

return df


def get_arguments():
parser = argparse.ArgumentParser(description='Collate QC data to output complete and summary reports.')
parser.add_argument('-i', '--qc_reports', required=True, nargs='*',
help='All QC reports.')
parser.add_argument('-v', '--version', dest='version', required=True,
help='Input file with version of pipeline.')
parser.add_argument('-o', '--output_prefix', required=True, type=str,
help='Output prefix of QC reports.')

Expand All @@ -61,13 +77,14 @@ def get_arguments():

def main(args):
# Write complete report
write_complete_qc_report(args.qc_reports, args.output_prefix)
complete_report = get_complete_qc_report(args.qc_reports, args.version)
complete_report.to_csv(f'{args.output_prefix}_complete.txt', sep = '\t', index = False)

# Get summary QC
summary_qc = get_summary_qc(args.qc_reports)

# Write summary QC
write_summary_qc_report(summary_qc, args.output_prefix)
write_summary_qc_report(summary_qc, complete_report, args.output_prefix)


if __name__ == '__main__':
Expand Down
7 changes: 6 additions & 1 deletion main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ include {depth_of_coverage} from './modules/depth_of_coverage.nf'
include {breadth_of_coverage} from './modules/breadth_of_coverage.nf'
include {get_proportion_HET_SNPs} from './modules/get_proportion_HET_SNPs.nf'
include {HET_SNPs} from './modules/HET_SNPs.nf'
include {get_version} from './modules/version.nf'

// Workflow for reads QC
workflow reads_qc {
Expand Down Expand Up @@ -98,8 +99,12 @@ workflow {
// Run assembly QC
assemblies_qc(get_file_destinations.out, get_qc_stats_from_pf.out, get_proportion_HET_SNPs.out, headers_ch, lanes_ch)

// Get version of pipeline
get_version()
version_ch = get_version.out

// Collate QC reports
collate_qc_data(reads_qc.out.qc_report, assemblies_qc.out.qc_report)
collate_qc_data(reads_qc.out.qc_report, assemblies_qc.out.qc_report, version_ch)

collate_qc_data.out.complete
.subscribe { it -> it.copyTo(results_dir) }
Expand Down
6 changes: 4 additions & 2 deletions modules/collate_qc_data.nf
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@ process collate_qc_data {
input:
path read_qc_report
tuple file(number_of_contigs), file(contig_gc_content), file(genome_length), file(depth_of_coverage), file(breadth_of_coverage), file(het_snps)
path version

output:
path("qc_report_complete.tab"), emit: complete
path("qc_report_summary.tab"), emit: summary
path("qc_report_complete.txt"), emit: complete
path("qc_report_summary.txt"), emit: summary

script:
python_version = params.python_version
Expand All @@ -16,6 +17,7 @@ process collate_qc_data {
collate_qc_data.py \
--qc_reports ${read_qc_report} ${number_of_contigs} ${contig_gc_content} ${genome_length} ${depth_of_coverage} ${breadth_of_coverage} ${het_snps} \
--version ${version} \
--output_prefix "qc_report"
"""
}
20 changes: 20 additions & 0 deletions modules/version.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
process get_version {
input:

output:
path("${output}")

script:
output="version.txt"
version=params.version
"""
echo "version" > ${output}
if [ -z ${version} ];
then
echo \$(git -C $baseDir describe --tags) >> ${output}
else
echo ${version} >> ${output}
fi
"""
}
1 change: 1 addition & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ params {
cov_depth_threshold = 20
cov_breadth_threshold = 70
het_snps_threshold = 20
version = ""

}

Expand Down
46 changes: 22 additions & 24 deletions tests/collate_qc_data_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from collections import defaultdict
from bin.collate_qc_data import *
import numpy as np


class CollateQCData(TestCase):
Expand All @@ -12,17 +13,18 @@ class CollateQCData(TestCase):
TEST_CONTIG_NO = f'{TEST_DATA_DIR}/test_contig_number.tab'
TEST_OUTPUT_PREFIX = f'{TEST_DATA_DIR}/qc_report'
TEST_OUTPUT_PREFIX2 = f'{TEST_DATA_DIR}/qc_report2'
TEST_VERSION = f'{TEST_DATA_DIR}/version.txt'


def test_write_complete_qc_report(self):
def test_get_complete_qc_report(self):

write_complete_qc_report([self.TEST_REL_ABND, self.TEST_CONTIG_NO], self.TEST_OUTPUT_PREFIX)
actual = get_complete_qc_report([self.TEST_REL_ABND, self.TEST_CONTIG_NO], self.TEST_VERSION)

file = open(f'{self.TEST_OUTPUT_PREFIX}_complete.tab', "r")
actual = "".join(file.readlines())
os.remove(f'{self.TEST_OUTPUT_PREFIX}_complete.tab')

self.assertEqual(actual, """lane_id\trel_abundance\trel_abundance_status\tcontig_no\tcontig_no_status\ntest_lane1\t92.38\tPASS\t1\tPASS\ntest_lane2\t2.38\tFAIL\t500\tFAIL\ntest_lane3\t70.0\tFAIL\t3\tPASS\ntest_lane4\t\t\t501\tFAIL\n""")
self.assertEqual(actual.to_dict()['lane_id'], {
0: 'test_lane1',
1: 'test_lane2',
2: 'test_lane3',
3: 'test_lane4'})

def test_get_summary_qc(self):

Expand All @@ -37,38 +39,34 @@ def test_write_summary_qc_report(self):
summary_qc['test_lane3'] = 'FAIL'
summary_qc['test_lane4'] = ''

write_summary_qc_report(summary_qc, self.TEST_OUTPUT_PREFIX)

file = open(f'{self.TEST_OUTPUT_PREFIX}_summary.tab', "r")
actual = "".join(file.readlines())
os.remove(f'{self.TEST_OUTPUT_PREFIX}_summary.tab')
complete_report = get_complete_qc_report([self.TEST_REL_ABND, self.TEST_CONTIG_NO], self.TEST_VERSION)

self.assertEqual(actual, """lane_id\tstatus\ntest_lane1\tPASS\ntest_lane2\tFAIL\ntest_lane3\tFAIL\ntest_lane4\t\n""")
write_summary_qc_report(summary_qc, complete_report, self.TEST_OUTPUT_PREFIX)

def test_arguments(self):
actual = get_arguments().parse_args(
['--qc_reports', 'report1', 'report2', '--output_prefix', 'output_tab_file'])
self.assertEqual(actual, argparse.Namespace(qc_reports=['report1', 'report2'], output_prefix='output_tab_file'))
['--qc_reports', 'report1', 'report2', '--version', 'version', '--output_prefix', 'output_txt_file'])
self.assertEqual(actual, argparse.Namespace(qc_reports=['report1', 'report2'], version='version', output_prefix='output_txt_file'))

def test_arguments_short_options(self):
actual = get_arguments().parse_args(
['-i', 'report1', 'report2', '-o', 'output_tab_file'])
self.assertEqual(actual, argparse.Namespace(qc_reports=['report1', 'report2'], output_prefix='output_tab_file'))
['-i', 'report1', 'report2', '-v', 'version', '-o', 'output_txt_file'])
self.assertEqual(actual, argparse.Namespace(qc_reports=['report1', 'report2'], version='version', output_prefix='output_txt_file'))

def test_main(self):
args = get_arguments().parse_args(
['--qc_reports', self.TEST_REL_ABND, self.TEST_CONTIG_NO, '--output_prefix', self.TEST_OUTPUT_PREFIX2])
['--qc_reports', self.TEST_REL_ABND, self.TEST_CONTIG_NO, '--version', self.TEST_VERSION, '--output_prefix', self.TEST_OUTPUT_PREFIX2])

main(args)

file = open(f'{self.TEST_OUTPUT_PREFIX2}_summary.tab', "r")
file = open(f'{self.TEST_OUTPUT_PREFIX2}_summary.txt', "r")
actual = "".join(file.readlines())
os.remove(f'{self.TEST_OUTPUT_PREFIX2}_summary.tab')
os.remove(f'{self.TEST_OUTPUT_PREFIX2}_summary.txt')

self.assertEqual(actual, """lane_id\tstatus\ntest_lane1\tPASS\ntest_lane2\tFAIL\ntest_lane3\tFAIL\ntest_lane4\t\n""")
self.assertEqual(actual, """lane_id\tstatus\trel_abundance_status\tcontig_no_status\tversion\ntest_lane1\tPASS\tPASS\tPASS\tv0.0.0\ntest_lane2\tFAIL\tFAIL\tFAIL\tv0.0.0\ntest_lane3\tFAIL\tFAIL\tPASS\tv0.0.0\ntest_lane4\t\t\tFAIL\tv0.0.0\n""")

file = open(f'{self.TEST_OUTPUT_PREFIX2}_complete.tab', "r")
file = open(f'{self.TEST_OUTPUT_PREFIX2}_complete.txt', "r")
actual = "".join(file.readlines())
os.remove(f'{self.TEST_OUTPUT_PREFIX2}_complete.tab')
os.remove(f'{self.TEST_OUTPUT_PREFIX2}_complete.txt')

self.assertEqual(actual, """lane_id\trel_abundance\trel_abundance_status\tcontig_no\tcontig_no_status\ntest_lane1\t92.38\tPASS\t1\tPASS\ntest_lane2\t2.38\tFAIL\t500\tFAIL\ntest_lane3\t70.0\tFAIL\t3\tPASS\ntest_lane4\t\t\t501\tFAIL\n""")
self.assertEqual(actual, """lane_id\trel_abundance\trel_abundance_status\tcontig_no\tcontig_no_status\tversion\ntest_lane1\t92.38\tPASS\t1\tPASS\tv0.0.0\ntest_lane2\t2.38\tFAIL\t500\tFAIL\tv0.0.0\ntest_lane3\t70.0\tFAIL\t3\tPASS\tv0.0.0\ntest_lane4\t\t\t501\tFAIL\tv0.0.0\n""")
5 changes: 5 additions & 0 deletions tests/test_data/qc_report_summary.tab
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
lane_id status
test_lane1 PASS
test_lane2 FAIL
test_lane3 FAIL
test_lane4
5 changes: 5 additions & 0 deletions tests/test_data/qc_report_summary.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
lane_id status rel_abundance_status contig_no_status version
test_lane1 PASS PASS PASS v0.0.0
test_lane2 FAIL FAIL FAIL v0.0.0
test_lane3 FAIL FAIL PASS v0.0.0
test_lane4 FAIL v0.0.0
2 changes: 2 additions & 0 deletions tests/test_data/version.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
version
v0.0.0

0 comments on commit 3fe70b0

Please sign in to comment.