diff --git a/isoquant.py b/isoquant.py index 5064d954..edc3b515 100755 --- a/isoquant.py +++ b/isoquant.py @@ -683,18 +683,6 @@ def run_pipeline(args): # convert GTF/GFF if needed if args.genedb and not args.genedb.lower().endswith('db'): - logger.info("Checking input gene annotation") - gtf_is_correct, corrected_gtf, out_fname = check_gtf_duplicates(args.genedb) - if not gtf_is_correct: - new_gtf_path = os.path.join(args.output, out_fname) - with open(new_gtf_path, "w") as out_gtf: - out_gtf.write(corrected_gtf) - logger.error("Input GTF seems to be corrupted (see warnings above). " - "An attempt to correct this GTF was made, the result is written to %s" % new_gtf_path) - logger.error("Provide a correct GTF by fixing the original input GTF or checking the corrected one.") - exit(-3) - else: - logger.info("Gene annotation seems to be correct") args.genedb = convert_gtf_to_db(args) # map reads if fastqs are provided diff --git a/src/gtf2db.py b/src/gtf2db.py index c7caa73b..444e8bcc 100755 --- a/src/gtf2db.py +++ b/src/gtf2db.py @@ -92,6 +92,21 @@ def get_color(transcript_kind): def gtf2db(gtf, db, complete_db=False): + logger.info("Checking input gene annotation") + gtf_is_correct, corrected_gtf, out_fname = check_gtf_duplicates(gtf) + if not gtf_is_correct: + outdir = os.path.dirname(db) + new_gtf_path = os.path.join(outdir, out_fname) + with open(new_gtf_path, "w") as out_gtf: + out_gtf.write(corrected_gtf) + logger.error("Input GTF seems to be corrupted (see warnings above).") + logger.error("An attempt to correct this GTF was made, the result is written to %s" % new_gtf_path) + logger.error("NB! some transcript / gene ids in the corrected annotation are modified.") + logger.error("Provide a correct GTF by fixing the original input GTF or checking the corrected one.") + exit(-3) + else: + logger.info("Gene annotation seems to be correct") + logger.info("Converting gene annotation file to .db format (takes a while)...") gffutils.create_db(gtf, db, force=True, keep_order=True, merge_strategy='error', sort_attribute_values=True, disable_infer_transcripts=complete_db, @@ -128,8 +143,12 @@ def check_gtf_duplicates(gtf): gtf_name = os.path.basename(gtf) gtf_name, outer_ext = os.path.splitext(gtf_name) - gtf_name, inner_ext = os.path.splitext(gtf_name) - handle = gzip.open(gtf, "rt") if outer_ext.lower() in ['.gz', '.gzip', '.bgz'] else open(gtf, "rt") + if outer_ext.lower() in ['.gz', '.gzip', '.bgz']: + handle = gzip.open(gtf, "rt") + gtf_name, inner_ext = os.path.splitext(gtf_name) + else: + handle = open(gtf, "rt") + inner_ext = outer_ext for l in handle.readlines(): line_count += 1