Skip to content

Commit

Permalink
Removing bgzip/tabix from libs; more checks and messages
Browse files Browse the repository at this point in the history
  • Loading branch information
alexeigurevich committed Nov 18, 2015
1 parent 14613ce commit cbdbaf7
Show file tree
Hide file tree
Showing 7 changed files with 23 additions and 27 deletions.
4 changes: 0 additions & 4 deletions config.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,6 @@
picard_fpath = os.path.join(picard_dirpath, 'picard.jar')
picard_tmp_dirpath = os.path.join(SCRATCH_DIR, 'tmp')

samtools_dirpath = os.path.join(LIBS_LOCATION, 'samtools')
bgzip_fpath = os.path.join(LIBS_LOCATION, 'misc', 'bgzip')
tabix_fpath = os.path.join(LIBS_LOCATION, 'misc', 'tabix')

dbsnp_fpath = '/genomes/Homo_sapiens/UCSC/hg19/Annotation/dbsnp_132.hg19.vcf'
gold_indels_fpath = os.path.join(DIR_HOME, db_dirname, 'gold_indels.vcf')
tg_indels_fpath = os.path.join(DIR_HOME, db_dirname, '1000G_phase1.indels.hg19.vcf')
Expand Down
Binary file removed libs/misc/bgzip
Binary file not shown.
Binary file removed libs/misc/tabix
Binary file not shown.
4 changes: 3 additions & 1 deletion pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,13 +128,15 @@ def main(args):
config.max_threads = max(1, config.max_memory/2)
config.max_gatk_threads = config.max_threads
utils.check_gatk()
utils.check_samtools()
utils.check_external_programs(['java', 'samtools', 'bgzip', 'tabix'])

# one sample per launch in multi-node mode
print '\nStarting GATK3 Best Practices workflow'
import preprocessing
bam_fpaths = preprocessing.do(ref_fpath, samples_fpaths, sample_ids, config.temp_output_dir, config.output_dir)
import variant_calling
variant_calling.do(ref_fpath, sample_ids, bam_fpaths, config.temp_output_dir, config.output_dir, project_id, samples_fpaths, sample_names)
print 'GATK3 Best Practices workflow finished!'
return

if __name__ == '__main__':
Expand Down
7 changes: 3 additions & 4 deletions preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,13 +59,12 @@ def process_single_sample(ref_fpath, sample_id, bam_fpath, scratch_dirpath, outp
bam_fpath = replace_rg_fpath
if not os.path.exists(ref_fpath + '.fai'):
print 'Preparing reference file...'
samtools_fpath = os.path.join(config.samtools_dirpath, 'samtools')
utils.call_subprocess([samtools_fpath, 'faidx', ref_fpath], stderr=open(log_fpath, 'a'))
utils.call_subprocess(['samtools', 'faidx', ref_fpath], stderr=open(log_fpath, 'a'))
ref_fname, _ = os.path.splitext(ref_fpath)
utils.call_subprocess(['java', '-jar', config.picard_fpath, 'CreateSequenceDictionary', 'R=%s' % ref_fpath,
'O=%s' % ref_fname + '.dict'], stderr=open(log_fpath, 'a'))
get_chr_lengths(ref_fpath)
print 'Realign indels...'
print 'Realigning indels...'
cmd = ['java', '-Xmx%sg' % mem_gb, '-jar', config.gatk_fpath, '-T', 'RealignerTargetCreator', '-R', ref_fpath, '-nt', num_threads,
'-I', bam_fpath, '-o', targetintervals_fpath]
if not config.reduced_workflow:
Expand All @@ -78,7 +77,7 @@ def process_single_sample(ref_fpath, sample_id, bam_fpath, scratch_dirpath, outp
utils.call_subprocess(cmd, stderr=open(log_fpath, 'a'))

if not config.reduced_workflow:
print 'Recalibrate bases...'
print 'Recalibrating bases...'
recaltable_fpath = os.path.join(scratch_dirpath, sample_id + '.table')
utils.call_subprocess(['java', '-Xmx%sg' % mem_gb, '-jar', config.gatk_fpath, '-T', 'BaseRecalibrator', '-R', ref_fpath, '-nct', num_threads,
'-I', final_bam_fpath, '-knownSites', config.dbsnp_fpath, '-dt', 'ALL_READS', '-dfrac', '0.10 ',
Expand Down
27 changes: 13 additions & 14 deletions utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
def name_from_fpath(fpath):
return os.path.splitext(os.path.basename(fpath))[0]

def call_subprocess(args, stdin=None, stdout=None, stderr=None, env=None):

def call_subprocess(args, stdin=None, stdout=None, stderr=None, env=None, debug=False):
printed_args = args[:]
if stdin:
printed_args += ['<', stdin.name]
Expand All @@ -24,19 +25,23 @@ def call_subprocess(args, stdin=None, stdout=None, stderr=None, env=None):
if stderr:
printed_args += ['2>>' if stderr.mode in ['a', 'a+'] else '2>', stderr.name]

if debug:
print ' Starting:', ' '.join(printed_args)
return_code = subprocess.call(args, stdin=stdin, stdout=stdout, stderr=stderr, env=env)
if return_code != 0 and 'ValidateSamFile' not in args:
print 'ERROR! See log files'
sys.exit(0)

return return_code


def assert_file_exists(fpath, message=''):
if not os.path.isfile(fpath):
print ("File not found (%s): %s" % (message, fpath))
sys.exit(2)
return fpath


def get_path_to_program(program):
"""
returns the path to an executable or None if it can't be found
Expand All @@ -50,21 +55,15 @@ def is_exe(fpath):
return exe_file
return None


def check_gatk():
if not os.path.exists(config.gatk_fpath):
print 'ERROR! GATK was not found. Please move GenomeAnalysisTK.jar to ' + config.gatk_dirpath + ' and restart the application.'
sys.exit(1)

def check_samtools():
if not os.path.exists(os.path.join(config.samtools_dirpath, 'samtools')):
samtools_path = get_path_to_program('samtools')
if not samtools_path:
print 'Compiling SAMtools (details are in ' + os.path.join(config.samtools_dirpath, 'make.log') + ' and make.err)'
return_code = call_subprocess(['make', '-C', config.samtools_dirpath], stdout=open(os.path.join(config.samtools_dirpath, 'make.log'), 'w'),
stderr=open(os.path.join(config.samtools_dirpath, 'make.err'), 'w'), )

if return_code != 0 or not os.path.exists(os.path.join(config.samtools_dirpath, 'samtools')):
print 'Failed to compile SAMtools (' + config.samtools_dirpath + ')! Try to compile it manually. '
sys.exit(1)
else:
config.samtools_dirpath = ''

def check_external_programs(names):
for name in names:
if not get_path_to_program(name):
print 'ERROR! %s was not found in PATH, please install it manually and/or add to PATH variable' % name
sys.exit(1)
8 changes: 4 additions & 4 deletions variant_calling.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,11 +151,11 @@ def process_files(ref_fpath, sample_ids, bam_fpaths, scratch_dirpath, output_dir
printReport(report_vars_fpath, report_tstv_fpath, sample_names, sample_ids, sample_files, output_dirpath)

for g_vcf_fpath in g_vcf_fpaths:
utils.call_subprocess([bgzip_fpath, g_vcf_fpath], stderr=open(log_fpath, 'a'))
utils.call_subprocess([tabix_fpath, '-p', 'vcf', g_vcf_fpath + '.gz'], stderr=open(log_fpath, 'a'))
utils.call_subprocess(['bgzip', '-f', g_vcf_fpath], stderr=open(log_fpath, 'a'))
utils.call_subprocess(['tabix', '-p', 'vcf', g_vcf_fpath + '.gz'], stderr=open(log_fpath, 'a'))

utils.call_subprocess([bgzip_fpath, vcf_fpath], stderr=open(log_fpath, 'a'))
utils.call_subprocess([tabix_fpath, '-p', 'vcf', vcf_fpath + '.gz'], stderr=open(log_fpath, 'a'))
utils.call_subprocess(['bgzip', '-f', vcf_fpath], stderr=open(log_fpath, 'a'))
utils.call_subprocess(['tabix', '-p', 'vcf', vcf_fpath + '.gz'], stderr=open(log_fpath, 'a'))
return vcf_fpath

def printReport(report_vars_fpath, report_tstv_fpath, sample_names, sample_ids, sample_files, output_dirpath):
Expand Down

0 comments on commit cbdbaf7

Please sign in to comment.