Skip to content

Commit

Permalink
Check if previous output can be reused by md5 instead of file size
Browse files Browse the repository at this point in the history
  • Loading branch information
almiheenko authored and alexeigurevich committed Jan 24, 2018
1 parent 33f7e72 commit 8d93d5e
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 14 deletions.
12 changes: 6 additions & 6 deletions quast_libs/ca_utils/align_contigs.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from __future__ import with_statement

import os
from os.path import isfile, join, getsize, basename, dirname
from os.path import isfile, basename
import datetime
import shutil
import sys
Expand All @@ -19,7 +19,7 @@
create_nucmer_output_dir, clean_tmp_files, get_installed_emem, reset_aligner_selection, draw_mummer_plot

from quast_libs.log import get_logger
from quast_libs.qutils import is_python2, safe_create
from quast_libs.qutils import is_python2, md5

logger = get_logger(qconfig.LOGGER_DEFAULT_NAME)

Expand All @@ -33,8 +33,8 @@ class NucmerStatus:

def create_nucmer_successful_check(fpath, contigs_fpath, ref_fpath):
nucmer_successful_check_file = open(fpath, 'w')
nucmer_successful_check_file.write("Assembly file size in bytes: %d\n" % getsize(contigs_fpath))
nucmer_successful_check_file.write("Reference file size in bytes: %d\n" % getsize(ref_fpath))
nucmer_successful_check_file.write("Assembly md5 checksum: %s\n" % md5(contigs_fpath))
nucmer_successful_check_file.write("Reference md5 checksum: %s\n" % md5(ref_fpath))
nucmer_successful_check_file.write("Successfully finished on " +
datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S') + '\n')
nucmer_successful_check_file.close()
Expand All @@ -44,9 +44,9 @@ def check_nucmer_successful_check(fpath, contigs_fpath, ref_fpath):
successful_check_content = open(fpath).read().split('\n')
if len(successful_check_content) < 2:
return False
if not successful_check_content[0].strip().endswith(str(getsize(contigs_fpath))):
if successful_check_content[0].strip().split()[-1] != str(md5(contigs_fpath)):
return False
if not successful_check_content[1].strip().endswith(str(getsize(ref_fpath))):
if successful_check_content[1].strip().split()[-1] != str(md5(ref_fpath)):
return False
return True

Expand Down
9 changes: 9 additions & 0 deletions quast_libs/qutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from __future__ import with_statement
import glob
import hashlib
import shutil
import subprocess
import os
Expand Down Expand Up @@ -908,3 +909,11 @@ def is_ascii_string(line):
return False
else:
return True


def md5(fname, chunk_size=65536):
    """Return the hexadecimal MD5 digest of the file at *fname*.

    The file is opened in binary mode and fed to the hash in pieces of
    *chunk_size* bytes, so the whole file is never held in memory at once.

    :param fname: path of the file to hash.
    :param chunk_size: number of bytes read per iteration; must be >= 1.
        Defaults to 65536, the value that used to be hard-coded.
    :return: the digest as a string of 32 lowercase hexadecimal characters.
    :raises ValueError: if *chunk_size* is smaller than 1.
    :raises IOError: (OSError on Python 3) if the file cannot be opened
        or read.
    """
    # A size of 0 would make the very first read() return b"" and end the
    # loop immediately, silently yielding the digest of empty input.
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer, got %r' % (chunk_size,))

    hash_md5 = hashlib.md5()
    with open(fname, 'rb') as f:
        # Two-argument iter(): call read() repeatedly until it returns the
        # b"" sentinel, which marks end of file.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
16 changes: 8 additions & 8 deletions quast_libs/search_references_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from quast_libs import qconfig, qutils
from quast_libs.fastaparser import _get_fasta_file_handler
from quast_libs.log import get_logger
from quast_libs.qutils import is_non_empty_file, is_python2, slugify, correct_name, get_dir_for_download
from quast_libs.qutils import is_non_empty_file, is_python2, slugify, correct_name, get_dir_for_download, md5

logger = get_logger(qconfig.LOGGER_META_NAME)
try:
Expand Down Expand Up @@ -314,14 +314,14 @@ def parallel_blast(contigs_fpath, label, corrected_dirpath, err_fpath, blast_res
qutils.call_subprocess(shlex.split(cmd), stdout=open(res_fpath, 'w'), stderr=open(err_fpath, 'a'), logger=logger)
logger.info(' ' + 'BLAST results for %s are saved to %s...' % (label, res_fpath))
with open(check_fpath, 'w') as check_file:
check_file.writelines('Assembly: %s size: %d\n' % (contigs_fpath, os.path.getsize(contigs_fpath)))
check_file.writelines('Assembly: %s md5 checksum: %s\n' % (contigs_fpath, md5(contigs_fpath)))


def get_blast_output_fpath(blast_output_fpath, label):
return blast_output_fpath + '_' + slugify(label)


def check_blast(blast_check_fpath, blast_res_fpath, files_sizes, assemblies_fpaths, assemblies, labels):
def check_blast(blast_check_fpath, blast_res_fpath, files_md5, assemblies_fpaths, assemblies, labels):
downloaded_organisms = []
not_founded_organisms = []
blast_assemblies = [assembly for assembly in assemblies]
Expand All @@ -336,8 +336,8 @@ def check_blast(blast_check_fpath, blast_res_fpath, files_sizes, assemblies_fpat
if '---' in line:
assembly_info = False
if line and assembly_info:
assembly, size = line.split()[1], line.split()[3]
if assembly in files_sizes.keys() and int(size) == files_sizes[assembly]:
assembly, md5 = line.split()[1], line.split()[-1]
if assembly in files_md5.keys() and md5 == files_md5[assembly]:
existing_assembly = assemblies_fpaths[assembly]
logger.main_info(' Using existing BLAST alignments for %s... ' % labels[i])
blast_assemblies.remove(existing_assembly)
Expand All @@ -356,10 +356,10 @@ def do(assemblies, labels, downloaded_dirpath, corrected_dirpath, ref_txt_fpath=
err_fpath = os.path.join(downloaded_dirpath, 'blast.err')
blast_check_fpath = os.path.join(downloaded_dirpath, 'blast.check')
blast_res_fpath = os.path.join(downloaded_dirpath, 'blast.res')
files_sizes = dict((assembly.fpath, os.path.getsize(assembly.fpath)) for assembly in assemblies)
files_md5 = dict((assembly.fpath, md5(assembly.fpath)) for assembly in assemblies)
assemblies_fpaths = dict((assembly.fpath, assembly) for assembly in assemblies)
blast_assemblies, downloaded_organisms, not_founded_organisms = \
check_blast(blast_check_fpath, blast_res_fpath, files_sizes, assemblies_fpaths, assemblies, labels)
check_blast(blast_check_fpath, blast_res_fpath, files_md5, assemblies_fpaths, assemblies, labels)
organisms = []

if ref_txt_fpath:
Expand Down Expand Up @@ -569,7 +569,7 @@ def process_refs(organisms, assemblies, labels, downloaded_dirpath, not_founded_
text = check_file.read()
text = text[:text.find('\n')]
else:
text = 'Assembly: %s size: %d\n' % (assembly.fpath, os.path.getsize(assembly.fpath))
text = 'Assembly: %s md5 checksum: %s\n' % (assembly.fpath, md5(assembly.fpath))
with open(check_fpath, 'w') as check_file:
check_file.writelines(text)
check_file.writelines('\n---\n')
Expand Down

0 comments on commit 8d93d5e

Please sign in to comment.