From 2905d1355d816e64de9be7a389fa3eec2b435d35 Mon Sep 17 00:00:00 2001 From: dr-joe-wirth Date: Thu, 29 Aug 2024 15:14:07 -0400 Subject: [PATCH 01/17] parallelized isPcr calls (#30) --- bin/validatePrimers.py | 113 ++++++++++++++++++++++++++++++----------- 1 file changed, 84 insertions(+), 29 deletions(-) diff --git a/bin/validatePrimers.py b/bin/validatePrimers.py index e5172c6..a89e5fa 100644 --- a/bin/validatePrimers.py +++ b/bin/validatePrimers.py @@ -1,9 +1,10 @@ -import os, subprocess from Bio import SeqIO from bin.Clock import Clock +from typing import Generator from bin.Primer import Primer from Bio.SeqRecord import SeqRecord from bin.Parameters import Parameters +import multiprocessing, os, subprocess # global constants @@ -38,38 +39,45 @@ def __makeFastaFile(params:Parameters) -> None: SeqIO.write(SeqRecord(contig.seq, contig.id, '', ''), fh, OUT_FORMAT) -def __makeQueryFile(params:Parameters, pairs:dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]) -> None: - """makes the query file for +def __makeQueryString(pairs:list[tuple[Primer,Primer]]) -> str: + """makes the query string from a list of primer pairs Args: - params (Parameters): a Parameters object - pairs (dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]): key=Primer pair; dict:key=genome name; val=tuple: contig, pcr product size, bin pair (contig, num1, num2) + pairs (list[tuple[Primer,Primer]]): a list of primer pairs + + Returns: + str: the query string """ - # open the file - with open(params.queryFn, 'w') as fh: - for pair in pairs.keys(): - # recast the pair as a strings - fwd,rev = map(str, pair) - - # the "key" column will be the pair separated by a pipe - key = __KEY_SEP.join((fwd,rev)) - - # write the data for each pair to file; one per line - fh.write(__ROW_SEP.join((key,fwd,rev)) + "\n") + # initialize the output + out = "" + + # for each pair + for pair in pairs: + # recast the pair as a strings + fwd,rev = map(str, pair) + + # the "key" column will be the pair separated by a pipe + key = __KEY_SEP.join((fwd,rev)) + + # save the data for each pair; one per line + out += __ROW_SEP.join((key,fwd,rev)) + "\n" + + return out -def __runPcr(params:Parameters) -> dict[tuple[str,str],set[tuple[str,int]]]: - """runs isPcr on the primer pairs +def __runOnePcr(allContigsFna:str, query:str) -> dict[tuple[str,str],set[tuple[str,int]]]: + """runs isPcr on a properly formatted query string Args: - params (Parameters): a Parameters object + allContigsFna (str): the filename containing all the contigs + query (str): the query to search Returns: - dict[tuple[str,str],set[tuple[str,int]]]: key=primer pair (as strings); val=pcr product size + dict[tuple[str,str],set[tuple[str,int]]]: key=primer pair (as strings); val=(contig, pcr product size) """ # constants CMD = 'isPcr' - ARGS = ['stdout', '-out=bed', '-minGood=6', '-minPerfect=8'] + ARGS = ['stdin', 'stdout', '-out=bed', '-minGood=6', '-minPerfect=8'] TILE = '-tileSize=' EMPTY = [''] @@ -77,15 +85,17 @@ def __runPcr(params:Parameters) -> dict[tuple[str,str],set[tuple[str,int]]]: out = dict() # build command - cmd = [CMD, params.allContigsFna, params.queryFn] + cmd = [CMD, allContigsFna] cmd.extend(ARGS) cmd.append(TILE + str(Parameters._MIN_LEN)) - # run the command - results = subprocess.run(cmd, capture_output=True, check=True) + # run the command; inject the query string using a pipe + results = subprocess.run(cmd, input=query.encode(), capture_output=True, check=True) + + # convert the results to a collection of rows (lists of 
strings) + results = map(lambda x: x.split(__ROW_SEP), results.stdout.decode().split('\n')) - # convert the results to a list of values; do not keep empty lines - results = list(map(lambda x: x.split(__ROW_SEP), results.stdout.decode().split('\n'))) + # do not keep empty rows results = [x for x in results if x != EMPTY] # for each result @@ -102,6 +112,50 @@ def __runPcr(params:Parameters) -> dict[tuple[str,str],set[tuple[str,int]]]: return out +def __runAllPcrs(params:Parameters, pairs:list[tuple[Primer,Primer]]) -> dict[tuple[str,str],set[tuple[str,int]]]: + """runs all pcrs in parallel + + Args: + params (Parameters): a Parameters object + pairs (list[tuple[Primer,Primer]]): a list of primer pairs + + Returns: + dict[tuple[str,str],set[tuple[str,int]]]: key=primer pair; val=(contig, pcr product size) + """ + # helper function + def genArgs() -> Generator[tuple[str,str],None,None]: + """generates arguments for __runOnePcr + """ + # constant + MAX_CHUNK = 2000 + + # initialize values for iteration + numPairs = len(pairs) + chunk = min((MAX_CHUNK, numPairs // params.numThreads)) + start = 0 + end = chunk + + # keep iterating until all the pairs have been evaluated + while start <= numPairs: + # make the query for this chunk + query = __makeQueryString(pairs[start:end]) + + # update the start and end for the next chunk + start = end + end += chunk + + yield (params.allContigsFna, query) + + # run pcrs in parallel + pool = multiprocessing.Pool(params.numThreads) + results = pool.starmap(__runOnePcr, genArgs()) + pool.close() + pool.join() + + # combine the results and return them + return {k:v for r in results for k,v in r.items()} + + def __filterPairs(pairs:dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]], pcrs:dict[tuple[str,str],list[tuple[str,int]]]) -> None: """filters primer pairs based on isPcr results; does not return @@ -143,10 +197,11 @@ def _validatePrimerPairs(params:Parameters, pairs:dict[tuple[Primer,Primer],dict clock.printStart(MSG_1) params.log.info(MSG_1) - # create the fasta and query files and run isPcr + # create the fasta file __makeFastaFile(params) - __makeQueryFile(params, pairs) - pcrs = __runPcr(params) + + # run isPcr in parallel + pcrs = __runAllPcrs(params, list(pairs.keys())) # filter out bad pairs __filterPairs(pairs, pcrs) From 38e129fca9d9d12621092aa9702a92f50fb501d0 Mon Sep 17 00:00:00 2001 From: dr-joe-wirth Date: Thu, 5 Sep 2024 13:32:00 -0400 Subject: [PATCH 02/17] removed unused "query file" param --- bin/Parameters.py | 5 ----- bin/main.py | 1 - 2 files changed, 6 deletions(-) diff --git a/bin/Parameters.py b/bin/Parameters.py index 5049974..b2366af 100644 --- a/bin/Parameters.py +++ b/bin/Parameters.py @@ -12,7 +12,6 @@ class Parameters(): _MIN_LEN = 10 __ALLOWED_FORMATS = ('genbank', 'fasta') __ALL_CONTIGS_FNA = ".all_contigs.fna" - __QUERY_FN = ".ispcr_query.tsv" __PICKLE_DIR = "_pickles" _PARAMS = 0 _SHARED = 1 @@ -69,7 +68,6 @@ def __init__(self, author:str, version:str, initializeLog:bool=True) -> Paramete self.pickles:dict[int,str] self.keepIntermediateFiles:bool self.allContigsFna:str - self.queryFn:str # save author and version as private attributes self.__author:str = author @@ -93,9 +91,6 @@ def __init__(self, author:str, version:str, initializeLog:bool=True) -> Paramete # get the all contigs fasta filename self.allContigsFna = os.path.join(os.path.dirname(self.resultsFn), Parameters.__ALL_CONTIGS_FNA) - - # get the ispcr query filename - self.queryFn = os.path.join(os.path.dirname(self.resultsFn), 
Parameters.__QUERY_FN) def __eq__(self, other:Parameters) -> bool: """equality overload diff --git a/bin/main.py b/bin/main.py index ce7f66a..af275b4 100755 --- a/bin/main.py +++ b/bin/main.py @@ -288,7 +288,6 @@ def __removeIntermediateFiles(params:Parameters) -> None: # remove the isPcr files os.remove(params.allContigsFna) - os.remove(params.queryFn) def _runner(params:Parameters) -> None: From 73479a5e694072d8cbeaee87553ae7c5d1890360 Mon Sep 17 00:00:00 2001 From: dr-joe-wirth Date: Thu, 5 Sep 2024 13:38:19 -0400 Subject: [PATCH 03/17] sleeker cleanup --- bin/main.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/bin/main.py b/bin/main.py index af275b4..b9e2936 100755 --- a/bin/main.py +++ b/bin/main.py @@ -1,4 +1,4 @@ -import os +import os, shutil from Bio import SeqIO from bin.Clock import Clock from typing import Generator @@ -279,14 +279,10 @@ def __removeIntermediateFiles(params:Parameters) -> None: Args: params (Parameters): a Parameters object """ - # remove all the pickle files - for fn in params.pickles.values(): - os.remove(fn) + # remove the pickle directory along with any pickles + shutil.rmtree(os.path.dirname(next(iter(params.pickles.values())))) - # remove the pickle directory - os.rmdir(os.path.dirname(fn)) - - # remove the isPcr files + # remove the isPcr query file os.remove(params.allContigsFna) From 5a1da0e0c310b3ac35c140a90a086da1adc382a4 Mon Sep 17 00:00:00 2001 From: dr-joe-wirth Date: Thu, 5 Sep 2024 15:43:28 -0400 Subject: [PATCH 04/17] increment version --- bin/main.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/main.py b/bin/main.py index b9e2936..02e7a46 100755 --- a/bin/main.py +++ b/bin/main.py @@ -11,7 +11,7 @@ from bin.removeOutgroupPrimers import _removeOutgroupPrimers # global constants -__version__ = "1.3.6" +__version__ = "1.3.7" __author__ = "Joseph S. Wirth" # functions diff --git a/setup.py b/setup.py index 8eeb356..e9af4b3 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ setup( name='primerforge', - version='1.3.6', + version='1.3.7', author='Joseph S. 
Wirth', packages=find_packages(), description='software to identify primers that can be used to distinguish genomes', From 4dd9d9cf89f2c30e2168712d6fc6e4c5ef40e4df Mon Sep 17 00:00:00 2001 From: dr-joe-wirth Date: Tue, 10 Sep 2024 11:08:36 -0400 Subject: [PATCH 05/17] implemented a sort method before writing results to file (#31) --- bin/main.py | 6 +- bin/sortPrimerPairs.py | 139 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 144 insertions(+), 1 deletion(-) create mode 100644 bin/sortPrimerPairs.py diff --git a/bin/main.py b/bin/main.py index 02e7a46..ebabf15 100755 --- a/bin/main.py +++ b/bin/main.py @@ -5,6 +5,7 @@ from bin.Primer import Primer from Bio.SeqRecord import SeqRecord from bin.Parameters import Parameters +from bin.sortPrimerPairs import _sortPairs from bin.getPrimerPairs import _getPrimerPairs from bin.validatePrimers import _validatePrimerPairs from bin.getCandidateKmers import _getAllCandidateKmers @@ -241,6 +242,9 @@ def getHeaders(names) -> list[str]: names = list(next(iter(pairs.values())).keys()) headers = getHeaders(names) + # sort the primer pairs + sortedPairs = _sortPairs(params, pairs) + # open the file with open(params.resultsFn, 'w') as fh: # write the headers @@ -248,7 +252,7 @@ def getHeaders(names) -> list[str]: fh.flush() # for each primer pair - for fwd,rev in pairs.keys(): + for fwd,rev in sortedPairs: # save the primer pair data row = [fwd.seq, round(fwd.Tm, NUM_DEC), diff --git a/bin/sortPrimerPairs.py b/bin/sortPrimerPairs.py new file mode 100644 index 0000000..2f73ee3 --- /dev/null +++ b/bin/sortPrimerPairs.py @@ -0,0 +1,139 @@ +import os, numpy, primer3 +from bin.Primer import Primer +from bin.Parameters import Parameters + + +def __getHomopolymerTemps(pair:tuple[Primer,Primer]) -> float: + """returns the sum of the homodimer melting temps for a primer pair + + Args: + pair (tuple[Primer,Primer]): a pair of primers + + Returns: + float: the sum of both primer's homodimer melting temperatures + """ + # parse the pair as strings + fwd,rev = map(str, pair) + + # calculate dimer tms + fDimer = primer3.calc_homodimer_tm(fwd) + rDimer = primer3.calc_homodimer_tm(rev) + + # return the sum of these temps + return fDimer + rDimer + + +def __getHairpinTemps(pair:tuple[Primer,Primer]) -> float: + """gets the sume of hairpin melting tempertaures for a primer pair + + Args: + pair (tuple[Primer,Primer]): a pair of primer + + Returns: + float: the sum of the hairpin melting tempertures + """ + # parse the pair as strings + fwd,rev = map(str, pair) + + # calculate hairpin tms + fHairpin = primer3.calc_hairpin_tm(fwd) + rHairpin = primer3.calc_hairpin_tm(rev) + + # return the sum of these temps + return fHairpin + rHairpin + + +def __getHeteroDimerTemp(pair:tuple[Primer,Primer]) -> float: + """gets the heterodimer melting temperature for a pair of primers + + Args: + pair (tuple[Primer,Primer]): a pair of primers + + Returns: + float: the heterodimer melting temperature + """ + + # parse the pair as strings + fwd,rev = map(str, pair) + + return primer3.calc_heterodimer_tm(fwd, rev) + + +def __getGcDiff(pair:tuple[Primer,Primer]) -> float: + """finds the G+C content (mol %) difference between a pair of primers + + Args: + pair (tuple[Primer,Primer]): a pair of primers + + Returns: + float: the difference in G+C content (mol %) + """ + # parse the pair + fwd,rev = pair + + return abs(fwd.gcPer - rev.gcPer) + + +def __getTmDiff(pair:tuple[Primer,Primer]) -> float: + """finds the melting temperature difference between a pair of primers + + Args: + pair 
(tuple[Primer,Primer]): a pair of primers + + Returns: + float: the difference between melting temperatures + """ + # parse the pair + fwd,rev = pair + + return abs(fwd.Tm - rev.Tm) + + +def __getMedianIngroupProductSize(metadata:dict[str,tuple[str,int,tuple[str,int,int]]], ingroup:list[str]) -> float: + """calculates the median product size for the ingroup genomes + + Args: + metadata (dict[str,tuple[str,int,tuple[str,int,int]]]): the metadata for a primer pair + + Returns: + float: the median PCR product size for the ingroup genomes + """ + # determine the genome names from the ingroup file names + ingroup = set(map(os.path.basename, ingroup)) + + # extract the sizes from the metadata for the pair + sizes = [size for name,(contig,size,binpair) in metadata.items() if name in ingroup] + + # negate the median so that pairs will be sorted largest to smallest + return -numpy.median(sizes) + + +def _sortPairs(params:Parameters, pairs:dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]) -> list[tuple[Primer,Primer]]: + """sorts pairs of primers based on several characteristics: + * first by difference in GC + * next by difference in Tm + * next by primer-dimer Tm + * next by homopolymer Tm + * next by hairpin Tm + * finally by product size (largest to smallest) + + Args: + params (Parameters): a Parameters object + pairs (dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]): key=primer pair; val=dict; key=genome name; val=(contig, pcr size, binpair) + + Returns: + list[tuple[Primer,Primer]]: a list of sorted primer pairs + """ + # sort the results: + # * first by difference in GC + # * next by difference in Tm + # * next by primer-dimer Tm + # * next by homopolymer Tm + # * next by hairpin Tm + # * finally by product size (largest to smallest) + return sorted(pairs.keys(), key=lambda p: (__getGcDiff(p), + __getTmDiff(p), + __getHeteroDimerTemp(p), + __getHomopolymerTemps(p), + __getHairpinTemps(p), + __getMedianIngroupProductSize(pairs[p], params.ingroupFns))) From de650ca9e7f75fb84f51ceba14f02f56d10799d7 Mon Sep 17 00:00:00 2001 From: dr-joe-wirth Date: Tue, 10 Sep 2024 11:09:23 -0400 Subject: [PATCH 06/17] added tests to ensure sort method is functioning properly (#31) --- bin/unit_tests/results_test.py | 43 ++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/bin/unit_tests/results_test.py b/bin/unit_tests/results_test.py index 744c443..88db4cc 100644 --- a/bin/unit_tests/results_test.py +++ b/bin/unit_tests/results_test.py @@ -12,13 +12,15 @@ from bin.Primer import Primer from Bio.SeqRecord import SeqRecord from bin.Parameters import Parameters +from bin.sortPrimerPairs import _sortPairs from bin.getPrimerPairs import _formsDimers import gzip, os, pickle, primer3, signal, subprocess, time, unittest class Result(): """class to save results for easy lookup """ - def __init__(self, fwdTm:float, fwdGc:float, revTm:float, revGc:float, additional:dict[str,dict[str,list]]) -> Result: + def __init__(self, rowNum:int, fwdTm:float, fwdGc:float, revTm:float, revGc:float, additional:dict[str,dict[str,list]]) -> Result: + self.rowNum:int = rowNum self.fwdTm:float = fwdTm self.fwdGc:float = fwdGc self.revTm:float = revTm @@ -361,10 +363,14 @@ def _parseResultsFile(fn:str) -> dict[tuple[Seq,Seq],Result]: out = dict() key = dict() header = True + rowNum = -1 # go through each line in the file with open(fn, 'r') as fh: for line in fh: + # increment the row number + rowNum += 1 + # convert the line to a row of columns line = 
line.rstrip() row = line.split(SEP_1) @@ -411,7 +417,7 @@ def _parseResultsFile(fn:str) -> dict[tuple[Seq,Seq],Result]: additional[name][ResultsTest.CONTIG] = row[idx].split(SEP_2) # store the result for this primer pair - out[(fwd,rev)] = Result(ftm,fgc,rtm,rgc,additional) + out[(fwd,rev)] = Result(rowNum,ftm,fgc,rtm,rgc,additional) return out @@ -990,6 +996,39 @@ def testS_isProductLengthCorrect(self) -> None: # and the pcr lengths should not be disallowed for plen in tru[contig]: self.assertNotIn(plen, self.params.disallowedLens, f"disallowed sizes in {name} with {fwd},{rev}") + + def testT_allPairsWritten(self) -> None: + """verifies that all primer pairs were written to file + """ + # unpickle the pairs + with open(self.params.pickles[Parameters._PAIR_3], 'rb') as fh: + pairs:dict[tuple[Primer,Primer]] = pickle.load(fh) + + # convert the pairs to a set of pairs as Seq + expected = {(f.seq,r.seq) for f,r in pairs.keys()} + + # get the pairs saved in the results as a set + observed = set(self.results.keys()) + + # make sure that the pairs match + self.assertEqual(len(expected), len(observed), f'expected {len(expected)} primer pairs but observed {len(observed)}') + self.assertSetEqual(expected, observed, 'expected pairs do not match those written to file') + + def testU_arePairsSortedCorrectly(self) -> None: + """verifies that the pairs were written in the correct order + """ + # load the pairs from the pickle + with open(self.params.pickles[Parameters._PAIR_3], 'rb') as fh: + pairs:dict[tuple[Primer,Primer]] = pickle.load(fh) + + # get the expected sort order as a tuples of Seq + expected = [(f.seq,r.seq) for f,r in _sortPairs(self.params, pairs)] + + # get the observed order that was written + observed = sorted(self.results.keys(), key=lambda x: self.results[x].rowNum) + + # check that the order is conserved + self.assertListEqual(expected, observed, 'expected sort order does not match the order written to file') if __name__ == "__main__": From c8b4584e5d1f0387d49c0ef4692d638f17262470 Mon Sep 17 00:00:00 2001 From: dr-joe-wirth Date: Wed, 11 Sep 2024 11:16:54 -0400 Subject: [PATCH 07/17] added sorting for outgroup products; improved ingroup product sorting --- bin/sortPrimerPairs.py | 115 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 101 insertions(+), 14 deletions(-) diff --git a/bin/sortPrimerPairs.py b/bin/sortPrimerPairs.py index 2f73ee3..f94d8ac 100644 --- a/bin/sortPrimerPairs.py +++ b/bin/sortPrimerPairs.py @@ -1,8 +1,87 @@ +from typing import Union import os, numpy, primer3 from bin.Primer import Primer from bin.Parameters import Parameters +def __parseOutgroupProductSizes(product:Union[str,int]) -> list[int]: + """parses the PCR product sizes for an outgroup genome + + Args: + product (Union[str,int]): a comma-separated list of ints (str) or a single int (int) + + Returns: + list[int]: a list of PCR product sizes + """ + # constant + SEP = "," + + # convert comma-separated list (str) to a list of ints + if type(product) is str: + out = [int(x) for x in product.split(SEP)] + + # if it just an int, then make it a list + elif type(product) is int: + out = [product] + + return out + + +def __minDiffFromRange(intgr:int, rng:range) -> int: + """calculates the minimum distance of an integer from a range + + Args: + intgr (int): an integer + rng (range): a range + + Returns: + int: the minimum distance of the integer from the range + """ + # calculate the differnces from the upper and lower limits + lower = abs(intgr - min(rng)) + upper = abs(intgr - max(rng)) + + # return 
the minimum distance from the disallowed range + return min((lower, upper)) + + +def __sortOutgroupProducts(metadata:dict[str,tuple[str,int,tuple[str,int,int]]], outgroup:list[str], disallowed:range) -> tuple[int,int]: + """evaluates the outgroup products for a primer pair in order to sort the pair + + Args: + metadata (dict[str,tuple[str,int,tuple[str,int,int]]]): the metadata for a primer pair + outgroup (list[str]): a list of outgroup filenames + disallowed (range): the range that outgroup products are not allowed to be in + + Returns: + tuple[int,int]: minimum distance from the range (negated); the number of outgroup products + """ + # initialize variables + products = list() + + # determine the genome names from the ingroup file names + outgroup = frozenset(map(os.path.basename, outgroup)) + + # get the outgroup products that are >0bp + products = [v[1] for k,v in metadata.items() if k in outgroup and v != 0] + + # handle situations where there are no products (best case) + if products == []: + minDiff = float('inf') # largest number possible + + # otherwise, need to calculate the minimum distance from the disallowed range + else: + # parse the outgroup products so that we have a list ints + products = list(map(__parseOutgroupProductSizes, products)) + products = [y for x in products for y in x] + + # get the smallest value distance from the disallowed range + minDiff = min(map(lambda x:__minDiffFromRange(x, disallowed), products)) + + # return the minimum distance (negated so larger differences are sorted first) and the number of products + return -minDiff, len(products) + + def __getHomopolymerTemps(pair:tuple[Primer,Primer]) -> float: """returns the sum of the homodimer melting temps for a primer pair @@ -89,7 +168,7 @@ def __getTmDiff(pair:tuple[Primer,Primer]) -> float: return abs(fwd.Tm - rev.Tm) -def __getMedianIngroupProductSize(metadata:dict[str,tuple[str,int,tuple[str,int,int]]], ingroup:list[str]) -> float: +def __sortIngroupProducts(metadata:dict[str,tuple[str,int,tuple[str,int,int]]], ingroup:list[str]) -> tuple[float,float]: """calculates the median product size for the ingroup genomes Args: @@ -99,13 +178,17 @@ def __getMedianIngroupProductSize(metadata:dict[str,tuple[str,int,tuple[str,int, float: the median PCR product size for the ingroup genomes """ # determine the genome names from the ingroup file names - ingroup = set(map(os.path.basename, ingroup)) + ingroup = frozenset(map(os.path.basename, ingroup)) # extract the sizes from the metadata for the pair - sizes = [size for name,(contig,size,binpair) in metadata.items() if name in ingroup] + products = [size for name,(contig,size,binpair) in metadata.items() if name in ingroup] + + # calculate the variance and the median of the + variance = numpy.var(products) + median = numpy.var(products) - # negate the median so that pairs will be sorted largest to smallest - return -numpy.median(sizes) + # return the variance and the median (negated to prioritze larger product sizes) + return variance, -median def _sortPairs(params:Parameters, pairs:dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]) -> list[tuple[Primer,Primer]]: @@ -124,16 +207,20 @@ def _sortPairs(params:Parameters, pairs:dict[tuple[Primer,Primer],dict[str,tuple Returns: list[tuple[Primer,Primer]]: a list of sorted primer pairs """ - # sort the results: - # * first by difference in GC - # * next by difference in Tm - # * next by primer-dimer Tm - # * next by homopolymer Tm - # * next by hairpin Tm - # * finally by product size (largest to 
smallest) - return sorted(pairs.keys(), key=lambda p: (__getGcDiff(p), + # sort the results using the following criteria (in order): + # * largest difference from the disallowed range for outgroup PCR products + # * fewest number of outgroup PCR products + # * lowest G+C difference between the pair + # * lowest Tm difference between the pair + # * lowest Tm for heterodimers + # * lowest Tm (sum) for homodimers + # * lowest Tm (sum) for hairpins + # * lowest variance in ingroup PCR product sizes + # * largest median ingroup PCR product size + return sorted(pairs.keys(), key=lambda p: (__sortOutgroupProducts(pairs[p], params.outgroupFns, params.disallowedLens), + __getGcDiff(p), __getTmDiff(p), __getHeteroDimerTemp(p), __getHomopolymerTemps(p), __getHairpinTemps(p), - __getMedianIngroupProductSize(pairs[p], params.ingroupFns))) + __sortIngroupProducts(pairs[p], params.ingroupFns))) From 7cf398cc2d4b3c741fe1c32c76310a772324cf0b Mon Sep 17 00:00:00 2001 From: dr-joe-wirth Date: Wed, 11 Sep 2024 11:33:20 -0400 Subject: [PATCH 08/17] several changes: * added Results section describing sort method * added sort step to workflow * updated the example to match v1.3.7 --- README.md | 69 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 42 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index e9d8d79..b845650 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,21 @@ optional arguments: --debug run in debug mode (default: False) ``` +## Results +The results produced by `primerForge` list one primer pair per line, and these pairs are sorted based on the following criteria (in order of importance): + + * largest difference from the ingroup PCR product range for outgroup PCR products + * lowest number of outgroup PCR products + * lowest G+C difference between the pair + * lowest Tm difference between the pair + * lowest Tm for heterodimers + * lowest Tm (sum) for homodimers + * lowest Tm (sum) for hairpins + * lowest variance in ingroup PCR product sizes + * largest median ingroup PCR product size + +See [the example](#examining-the-results) for details about the file format. + ## Workflow ```mermaid @@ -208,8 +223,8 @@ flowchart TB ispcr["filter pairs using isPcr"] final(["final primer pairs"]) dump5[/"dump to file"/] - write[/"write pairs to file"/] - + write[/"sort pairs and + write to file"/] noout --> ispcr ispcr --> final @@ -223,7 +238,7 @@ This example assumes you have already installed `primerForge` [as described abov ### Motivation In this example, we will use `primerForge` to find pairs of primers between 18bp and 24bp that can be used to differentiate three strains of _Mycoplasma mycoides_ subspecies mycoides (the "ingroup") from two strains of _Mycoplasma mycoides_ subspecies capri (the "outgroup"). The primer pairs identified by `primerForge` are predicted to produce a single PCR product between 500bp and 2000bp in the ingroup. These same primer pairs are predicted to produce PCR products <300bp, >3000bp, or no PCR products at all in the outgroup. -### preparing the workspace +### Preparing the workspace In order to get started, create a directory called `mycoplasma_test` and move into it: ```bash @@ -252,7 +267,7 @@ We will use the following flags to specify specific parameters for this example: You can get a list of all available flags using the command `primerForge --help`. 
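The multi-criteria ordering listed in the Results section above amounts to sorting on a composite key tuple. A minimal, self-contained sketch of that idea is shown below; the `PrimerPair` class and the example values are hypothetical stand-ins for illustration, not primerForge's own API:

```python
# Illustrative only: a composite key tuple sorts on the first criterion and
# breaks ties with the later ones; values that should rank "largest first"
# are negated so that Python's ascending sort still applies.
from dataclasses import dataclass

@dataclass
class PrimerPair:          # hypothetical stand-in for a (fwd, rev) primer pair
    gc_diff: float         # |fwd GC - rev GC|
    tm_diff: float         # |fwd Tm - rev Tm|
    median_product: float  # median ingroup PCR product size

pairs = [PrimerPair(0.0, 0.6, 521),
         PrimerPair(8.3, 0.2, 1645),
         PrimerPair(0.0, 0.4, 788)]

# equal GC differences fall back to Tm difference, then to product size
ranked = sorted(pairs, key=lambda p: (p.gc_diff, p.tm_diff, -p.median_product))
print(ranked)
```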
-Run `primerForge` using the following command (requires at least 3 Gb of RAM): +Run `primerForge` using the following command (requires at least 2 Gb of RAM): ```bash primerForge --ingroup "./i*gbff" --outgroup "./o*gbff" --pcr_prod 500,2000 --bad_sizes 300,3000 --primer_len 18,24 @@ -261,43 +276,43 @@ primerForge --ingroup "./i*gbff" --outgroup "./o*gbff" --pcr_prod 500,2000 --bad After running the command, you should see something like this printed to the screen: ```text +dumping Parameters to '_pickles/parameters.p' ... done 00:00:00.49 identifying kmers suitable for use as primers in all 3 ingroup genome sequences - getting shared ingroup kmers that appear once in each genome ... done 00:01:19.32 - dumping shared kmers to '_pickles/sharedKmers.p' ... done 00:00:07.34 - evaluating kmers ... done 00:03:20.11 + getting shared ingroup kmers that appear once in each genome ... done 00:01:36.15 + dumping shared kmers to '_pickles/sharedKmers.p' ... done 00:00:11.22 + evaluating 2430140 kmers ... done 00:01:51.73 identified 30413 candidate kmers -done 00:04:49.18 - dumping candidate kmers to '_pickles/candidates.p' ... done 00:00:01.03 -identifying pairs of primers found in all ingroup sequences ... done 00:00:11.60 + dumping candidate kmers to '_pickles/candidates.p' ... done 00:00:01.55 +done 00:03:42.94 +identifying pairs of primers found in all ingroup sequences ... done 00:00:11.62 identified 16050 primer pairs shared in all ingroup sequences - dumping unfiltered pairs to '_pickles/pairs.p' ... done 00:00:00.54 + dumping unfiltered pairs to '_pickles/pairs.p' ... done 00:00:00.56 removing primer pairs present in the outgroup sequences - getting outgroup PCR products ... done 00:00:01.01 - filtering primer pairs ... done 00:00:00.53 - processing outgroup results ... done 00:00:00.52 + getting outgroup PCR products ... done 00:00:01.03 + filtering primer pairs ... done 00:00:00.54 + processing outgroup results ... done 00:00:00.54 removed 5905 pairs present in the outgroup (10145 pairs remaining) - dumping filtered pairs to '_pickles/pairs_noOutgroup.p' ... done 00:00:00.53 -validating primer pairs with isPcr ... done 00:00:01.97 - removed 1665 pairs not validated by isPcr (8480 pairs remaining) - dumping validated pairs to '_pickles/pairs_noOutgroup_validated.p' ... done 00:00:00.53 -writing primer pairs to 'results.tsv' ... done 00:00:00.52 + dumping filtered pairs to '_pickles/pairs_noOutgroup.p' ... done 00:00:00.54 +validating primer pairs with isPcr ... done 00:00:03.44 + removed 3659 pairs not validated by isPcr (6486 pairs remaining) + dumping validated pairs to '_pickles/pairs_noOutgroup_validated.p' ... done 00:00:00.54 +writing 6486 primer pairs to 'results.tsv' ... done 00:00:11.62 -total runtime: 00:05:08.49 +total runtime: 00:04:14.50 ``` -As we can see, `primerForge` found 30,413 kmers that were suitable for use as a primer in all three ingroup genomes. It then went on to identify 16,050 primer pairs that would produce PCR products between 500bp and 2000bp in the ingroup genomes. Next, it found that of those 16,050 pairs, 5,905 of them formed PCR products between 300bp and 3000bp in one or more of the outgroup genomes. Finally, it used `isPcr` to validate the remaining 10,145 primer pairs resulting in 8,480 primer pairs being written to file. +As we can see, `primerForge` found 30,413 kmers that were suitable for use as a primer in all three ingroup genomes. 
It then went on to identify 16,050 primer pairs that would produce PCR products between 500bp and 2000bp in the ingroup genomes. Next, it found that of those 16,050 pairs, 5,905 of them formed PCR products between 300bp and 3000bp in one or more of the outgroup genomes. Finally, it used `isPcr` to validate the remaining 10,145 primer pairs resulting in 6,486 primer pairs being written to file. +### Examining the results `primerForge` generated `results.tsv`, the file that contains the sequences and details for each primer pair, and `primerForge.log`. Here are a few lines from `results.tsv`: |fwd_seq|fwd_Tm|fwd_GC|rev_seq|rev_Tm|rev_GC|i1.gbff_contig|i1.gbff_length|i2.gbff_contig|i2.gbff_length|i3.gbff_contig|i3.gbff_length|o1.gbff_contig|o1.gbff_length|o2.gbff_contig|o2.gbff_length| |:------|:-----|:-----|:------|:-----|:-----|:------------:|:------------:|:------------:|:------------:|:------------:|:------------:|:------------:|:------------:|:------------:|:------------:| -|TATGCAACTAATCCCGAGTATCAC|56.1|41.7|TGTAAGTGGCGTTGTATCCC|55.5|50|NZ_LAUX01000130.1|521|NZ_LAUV01000074.1|521|NZ_LAUY01000118.1|521|NA|0|NA|0| -|TGTTCCTTCACACTCAATAACAGC|57.3|41.7|AGAAGGAACAGTCGCTGAAG|55.4|50|NZ_LAUX01000081.1|1645|NZ_LAUV01000036.1|1645|NZ_LAUY01000073.1|1645|NZ_LS483503.1|3091|NZ_CP065583.1|3103| -|AAGGAGAGTATCGCTTAGTTGATG|56|41.7|CAACAGCAGATGGTTTAGAAAGTG|56.4|41.7|NZ_LAUX01000079.1|788|NZ_LAUV01000035.1|788|NZ_LAUY01000072.1|788|NA|0|NA|0| -|AAGGAGAGTATCGCTTAGTTGATG|56|41.7|ACTCCAATTGCTCTTCCTGAAG|56.2|45.5|NZ_LAUX01000079.1|1053|NZ_LAUV01000035.1|1053|NZ_LAUY01000072.1|1053|NA|0|NA|0| -|TGAAATCACCAGCTATTGCATCAG|57.5|41.7|ACATTGCAACTCCTGAGATTTG|55.1|40.9|NZ_LAUX01000081.1|1998|NZ_LAUV01000036.1|1998|NZ_LAUY01000073.1|1998|NA|0|NA|0| +|AGAAGCAAAGGATATGGGAACAAC|57.1|41.7|AAATCAACACCCTCAATAAGCTCC|57.1|41.7|NZ_LAUX01000078.1|804|NZ_LAUV01000035.1|804|NZ_LAUY01000092.1|804|NA|0|NA|0| +|TCCATCTAATGAAGATCAACCAGG|55.9|41.7|CCCTAATTGTGATGAGTTGACAAC|55.9|41.7|NZ_LAUX01000010.1|710|NZ_LAUV01000042.1|713|NZ_LAUY01000078.1|710|NA|0|NA|0| +|CATCAGCTGTTGTAAATAACCCAC|56.2|41.7|GTGGAGCTATGAAACCAATATCAG|55.3|41.7|NZ_LAUX01000117.1|1694|NZ_LAUV01000064.1|1694|NZ_LAUY01000018.1|1694|NZ_LS483503.1|3212|NA|0| -The first six columns show the forward and reverse sequences (5' --> 3') as well as their melting temperatures and their G+C %. Next, for each genome it lists the contig and the PCR product size that is predicted be produced by this pair. For example, the first pair of primers are predicted to produce PCR products of 521bp the ingroup genomes, and the binding sites for this primer pair in the files `i1.gbff`, `i2.gbff`, and `i3.gbff` can be found in contigs `NZ_LAUX01000130.1`, `NZ_LAUV01000074.1`, and `NZ_LAUY01000118.1`, respectively. This same pair is not predicted to produce any PCR products in either outgroup genome. Similarly, the next pair is predicted to produce a PCR product size of 1,645bp in all three ingroup genomes and PCR products sizes of >3,000bp in both the outgroup genomes. +The first six columns show the forward and reverse sequences (5' --> 3') as well as their melting temperatures and their G+C content (mol %). Next, for each genome it lists the contig and the PCR product size that is predicted be produced by this pair. 
For example, the first pair of primers are predicted to produce PCR products of 804bp the ingroup genomes, and the binding sites for this primer pair in the files `i1.gbff`, `i2.gbff`, and `i3.gbff` can be found in contigs `NZ_LAUX01000078.1`, `NZ_LAUV01000035.1`, and `NZ_LAUY01000092.1`, respectively. This same pair is not predicted to produce any PCR products in either outgroup genome. Similarly, the third pair is predicted to produce a PCR product size of 1,694bp in all three ingroup genomes, no products `o2.gbff`, and a PCR product >3,000bp in `o1.gbff`. If a primer pair is predicted to produce multiple products in an outgroup genome, then the contig column and the size column will list contigs and sizes in a comma-separated list linked by position. For example, if a primer pair was expected to produce a product of 1,990bp in `contig_1` and 2,024bp in `contig_2` in the genome file `example.gbff`, then the columns for this genome would look like this: From 3e2131f10bd01473b8c1dc535399ba389b59b34c Mon Sep 17 00:00:00 2001 From: dr-joe-wirth Date: Wed, 11 Sep 2024 11:33:53 -0400 Subject: [PATCH 09/17] revised sort method for outgroup products --- bin/sortPrimerPairs.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/bin/sortPrimerPairs.py b/bin/sortPrimerPairs.py index f94d8ac..324d76a 100644 --- a/bin/sortPrimerPairs.py +++ b/bin/sortPrimerPairs.py @@ -45,13 +45,13 @@ def __minDiffFromRange(intgr:int, rng:range) -> int: return min((lower, upper)) -def __sortOutgroupProducts(metadata:dict[str,tuple[str,int,tuple[str,int,int]]], outgroup:list[str], disallowed:range) -> tuple[int,int]: +def __sortOutgroupProducts(metadata:dict[str,tuple[str,int,tuple[str,int,int]]], outgroup:list[str], ingroupPcrLens:range) -> tuple[int,int]: """evaluates the outgroup products for a primer pair in order to sort the pair Args: metadata (dict[str,tuple[str,int,tuple[str,int,int]]]): the metadata for a primer pair outgroup (list[str]): a list of outgroup filenames - disallowed (range): the range that outgroup products are not allowed to be in + ingroupPcrLens (range): the range that ingroup PCR products are within Returns: tuple[int,int]: minimum distance from the range (negated); the number of outgroup products @@ -76,7 +76,7 @@ def __sortOutgroupProducts(metadata:dict[str,tuple[str,int,tuple[str,int,int]]], products = [y for x in products for y in x] # get the smallest value distance from the disallowed range - minDiff = min(map(lambda x:__minDiffFromRange(x, disallowed), products)) + minDiff = min(map(lambda x:__minDiffFromRange(x, ingroupPcrLens), products)) # return the minimum distance (negated so larger differences are sorted first) and the number of products return -minDiff, len(products) @@ -208,7 +208,7 @@ def _sortPairs(params:Parameters, pairs:dict[tuple[Primer,Primer],dict[str,tuple list[tuple[Primer,Primer]]: a list of sorted primer pairs """ # sort the results using the following criteria (in order): - # * largest difference from the disallowed range for outgroup PCR products + # * largest difference from the ingroup product range for outgroup PCR products # * fewest number of outgroup PCR products # * lowest G+C difference between the pair # * lowest Tm difference between the pair @@ -217,7 +217,9 @@ def _sortPairs(params:Parameters, pairs:dict[tuple[Primer,Primer],dict[str,tuple # * lowest Tm (sum) for hairpins # * lowest variance in ingroup PCR product sizes # * largest median ingroup PCR product size - return sorted(pairs.keys(), key=lambda p: 
(__sortOutgroupProducts(pairs[p], params.outgroupFns, params.disallowedLens), + return sorted(pairs.keys(), key=lambda p: (__sortOutgroupProducts(pairs[p], + params.outgroupFns, + range(params.minPcr, params.maxPcr+1)), __getGcDiff(p), __getTmDiff(p), __getHeteroDimerTemp(p), From 79f726fc1c897511ab18beedd1444b94c1e85c70 Mon Sep 17 00:00:00 2001 From: dr-joe-wirth Date: Wed, 11 Sep 2024 12:47:44 -0400 Subject: [PATCH 10/17] only ask for dataset when downloading genomes --- bin/unit_tests/results_test.py | 78 +++++++++++++++------------------- 1 file changed, 34 insertions(+), 44 deletions(-) diff --git a/bin/unit_tests/results_test.py b/bin/unit_tests/results_test.py index 88db4cc..7f98d19 100644 --- a/bin/unit_tests/results_test.py +++ b/bin/unit_tests/results_test.py @@ -39,20 +39,22 @@ class ResultsTest(unittest.TestCase): FAKE_FN = 'fakefile' PCRLEN = "length" CONTIG = "contig" + INGROUP_FILES = ('i1.gbff', 'i2.gbff', 'i3.gbff') + OUTGROUP_FILES = ('o1.gbff', 'o2.gbff') # default ingroup and outgroup genomes (Mycoplasma mycoides) - DEFAULT_INGROUP = {'i1.gbff': 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/034/305/GCF_003034305.1_ASM303430v1/GCF_003034305.1_ASM303430v1_genomic.gbff.gz', - 'i2.gbff': 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/034/275/GCF_003034275.1_ASM303427v1/GCF_003034275.1_ASM303427v1_genomic.gbff.gz', - 'i3.gbff': 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/034/345/GCF_003034345.1_ASM303434v1/GCF_003034345.1_ASM303434v1_genomic.gbff.gz'} - DEFAULT_OUTGROUP = {'o1.gbff': 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/900/489/555/GCF_900489555.1_MMC68/GCF_900489555.1_MMC68_genomic.gbff.gz', - 'o2.gbff': 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/018/389/745/GCF_018389745.1_ASM1838974v1/GCF_018389745.1_ASM1838974v1_genomic.gbff.gz'} + DEFAULT_INGROUP = {INGROUP_FILES[0]: 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/034/305/GCF_003034305.1_ASM303430v1/GCF_003034305.1_ASM303430v1_genomic.gbff.gz', + INGROUP_FILES[1]: 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/034/275/GCF_003034275.1_ASM303427v1/GCF_003034275.1_ASM303427v1_genomic.gbff.gz', + INGROUP_FILES[2]: 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/034/345/GCF_003034345.1_ASM303434v1/GCF_003034345.1_ASM303434v1_genomic.gbff.gz'} + DEFAULT_OUTGROUP = {OUTGROUP_FILES[0]: 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/900/489/555/GCF_900489555.1_MMC68/GCF_900489555.1_MMC68_genomic.gbff.gz', + OUTGROUP_FILES[1]: 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/018/389/745/GCF_018389745.1_ASM1838974v1/GCF_018389745.1_ASM1838974v1_genomic.gbff.gz'} # alternate ingroup and outgroup genomes (Escherichia coli) - ALTRNTE_INGROUP = {'i1.gbff': 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/650/295/GCF_001650295.1_ASM165029v1/GCF_001650295.1_ASM165029v1_genomic.gbff.gz', - 'i2.gbff': 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/008/727/175/GCF_008727175.1_ASM872717v1/GCF_008727175.1_ASM872717v1_genomic.gbff.gz', - 'i3.gbff': 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/208/865/GCF_002208865.2_ASM220886v2/GCF_002208865.2_ASM220886v2_genomic.gbff.gz'} - ALTRNTE_OUTGROUP = {'o1.gbff': 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_genomic.gbff.gz', - 'o2.gbff': 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/006/945/GCF_000006945.2_ASM694v2/GCF_000006945.2_ASM694v2_genomic.gbff.gz'} + ALTRNTE_INGROUP = {INGROUP_FILES[0]: 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/650/295/GCF_001650295.1_ASM165029v1/GCF_001650295.1_ASM165029v1_genomic.gbff.gz', + 
INGROUP_FILES[1]: 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/008/727/175/GCF_008727175.1_ASM872717v1/GCF_008727175.1_ASM872717v1_genomic.gbff.gz', + INGROUP_FILES[2]: 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/208/865/GCF_002208865.2_ASM220886v2/GCF_002208865.2_ASM220886v2_genomic.gbff.gz'} + ALTRNTE_OUTGROUP = {OUTGROUP_FILES[0]: 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_genomic.gbff.gz', + OUTGROUP_FILES[1]: 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/006/945/GCF_000006945.2_ASM694v2/GCF_000006945.2_ASM694v2_genomic.gbff.gz'} @classmethod def setUpClass(cls) -> None: @@ -64,24 +66,23 @@ def setUpClass(cls) -> None: # get the number of threads to use numThreads = ResultsTest._getNumThreads() - # determine which dataset to use - cls.ingroup:dict[str,str] - cls.outgroup:dict[str,str] - cls.ingroup, cls.outgroup = ResultsTest._selectDataset() - # make the test directory if it doesn't exist if not os.path.isdir(ResultsTest.TEST_DIR): os.mkdir(ResultsTest.TEST_DIR) - # get a list of all the genome files - allFiles = list(ResultsTest.ingroup.keys()) + list(ResultsTest.outgroup.keys()) - allFiles = map(os.path.join, [ResultsTest.TEST_DIR]*len(allFiles), allFiles) + # get a list of the genome filenames + cls.seqFiles:list[str] = list(map(lambda x: os.path.join(ResultsTest.TEST_DIR, x), + ResultsTest.INGROUP_FILES + ResultsTest.OUTGROUP_FILES)) # download genome files if necessary - if not all(map(os.path.exists, allFiles)): + if not all(map(os.path.exists, ResultsTest.seqFiles)): + # determine which dataset to use + ingroupFtp, outgroupFtp = ResultsTest._selectDataset() + + # download the files clock.printStart('downloading genomes') - ResultsTest._downloadGenomesForGroup(ResultsTest.ingroup) - ResultsTest._downloadGenomesForGroup(ResultsTest.outgroup) + ResultsTest._downloadGenomesForGroup(ingroupFtp) + ResultsTest._downloadGenomesForGroup(outgroupFtp) clock.printDone() # get the parameters @@ -91,13 +92,13 @@ def setUpClass(cls) -> None: # run primerForge if results file does not exist if not os.path.exists(ResultsTest.params.resultsFn): # run primerForge - clock.printStart('running primerForge', end=' ...\n', spin=False) + clock.printStart('running primerForge', end=' ...\n\n', spin=False) _runner(ResultsTest.params) clock.printDone() # otherwise using existing file else: - print('running tests on existing files') + print('running tests on existing files\n') # load the results file into memory clock.printStart('reading results into memory') @@ -447,18 +448,7 @@ def _loadGenomeSequences() -> dict[str,dict[str,dict[str,Seq]]]: Returns: dict[str,dict[str,dict[str,Seq]]]: key=genome name; val=dict: key=contig name; val=dict: key=strand; val=sequence """ - # initialize output - out = dict() - - # load ingroup sequences - for fn in ResultsTest.ingroup.keys(): - out[fn] = ResultsTest._loadOneGenomeSequence(fn) - - # load outgroup sequences - for fn in ResultsTest.outgroup.keys(): - out[fn] = ResultsTest._loadOneGenomeSequence(fn) - - return out + return {os.path.basename(fn):ResultsTest._loadOneGenomeSequence(fn) for fn in ResultsTest.seqFiles} def _kmerListToDict(klist:list[tuple[str,int]]) -> dict[str,list[int]]: """converts a list of kmers to a dictionary @@ -777,7 +767,7 @@ def testL_ingroupHasOneProduct(self) -> None: # for each pair in the ingroup for pair in self.results.keys(): - for name in ResultsTest.ingroup.keys(): + for name in ResultsTest.INGROUP_FILES: # there should be exactly one pcr product size for each 
ingroup genome self.assertEqual(len(self.results[pair].additional[name][ResultsTest.PCRLEN]), 1, f"{FAIL_MSG_A}{pair}{FAIL_MSG_B}{name}") self.assertEqual(len(self.results[pair].additional[name][ResultsTest.CONTIG]), 1, f"{FAIL_MSG_A}{pair}{FAIL_MSG_B}{name}") @@ -791,7 +781,7 @@ def testM_ingroupProductsWithinRange(self) -> None: # for each pari in the ingroup for pair in self.results.keys(): - for name in ResultsTest.ingroup.keys(): + for name in ResultsTest.INGROUP_FILES: # make sure the pcr products are within the allowed range self.assertGreaterEqual(self.results[pair].additional[name][ResultsTest.PCRLEN][0], self.params.minPcr, f"{FAIL_MSG_A}{pair}{FAIL_MSG_B}{name}") self.assertLessEqual(self.results[pair].additional[name][ResultsTest.PCRLEN][0], self.params.maxPcr, f"{FAIL_MSG_A}{pair}{FAIL_MSG_B}{name}") @@ -805,7 +795,7 @@ def testN_outgroupProductsSavedCorrectly(self) -> None: # for each pair in the outgroup for pair in self.results.keys(): - for name in ResultsTest.outgroup.keys(): + for name in ResultsTest.OUTGROUP_FILES: # the length of the contig and pcr size lists should be equal numContigs = len(self.results[pair].additional[name][ResultsTest.CONTIG]) numPcrLens = len(self.results[pair].additional[name][ResultsTest.PCRLEN]) @@ -821,7 +811,7 @@ def testO_outgroupProductsNaAreZero(self) -> None: # for each pair in the outgroup for pair in self.results.keys(): - for name in ResultsTest.outgroup.keys(): + for name in ResultsTest.OUTGROUP_FILES: # for each pcr product saved for idx in range(len(self.results[pair].additional[name][ResultsTest.CONTIG])): # if the contig is NA, then the pcr size should be zero @@ -841,7 +831,7 @@ def testP_outgroupProductsNotWithinDisallowedRange(self) -> None: # for each pair in the outgroup for pair in self.results.keys(): - for name in ResultsTest.outgroup.keys(): + for name in ResultsTest.OUTGROUP_FILES: # each pcr product size should not be in the disallowed range for pcrLen in self.results[pair].additional[name][ResultsTest.PCRLEN]: self.assertNotIn(pcrLen, self.params.disallowedLens, f"{FAIL_MSG_A}{pair}{FAIL_MSG_B}{name}") @@ -858,7 +848,7 @@ def testQ_isRepeatedInIngroup(self) -> None: rbind = self.bindingSites[rev] # for each ingroup genome - for name in ResultsTest.ingroup.keys(): + for name in ResultsTest.INGROUP_FILES: # count the number of binding sites in the genome for each primer fCount = 0 rCount = 0 @@ -880,7 +870,7 @@ def testR_onSeparateStrandsInIngroup(self) -> None: rbind = self.bindingSites[rev] # only evaluate for the ingroup - for name in ResultsTest.ingroup.keys(): + for name in ResultsTest.INGROUP_FILES: for contig in fbind[name].keys(): # if the forward primer is on the (+) strand if fbind[name][contig][Primer.PLUS] != []: @@ -914,7 +904,7 @@ def testS_isProductLengthCorrect(self) -> None: rbind = self.bindingSites[rev] # for each ingroup genome - for name in ResultsTest.ingroup.keys(): + for name in ResultsTest.INGROUP_FILES: # extract the stored pcr length and the contig pcrLen = self.results[(fwd,rev)].additional[name][ResultsTest.PCRLEN][0] contig = self.results[(fwd,rev)].additional[name][ResultsTest.CONTIG][0] @@ -939,7 +929,7 @@ def testS_isProductLengthCorrect(self) -> None: self.assertEqual(pcrLen, truLen, f"bad pcr product sizes in {name} for {fwd}, {rev}") # for each outgroup genome - for name in ResultsTest.outgroup.keys(): + for name in ResultsTest.OUTGROUP_FILES: # get the list of pcr products and their contigs pcrLens = self.results[(fwd,rev)].additional[name][ResultsTest.PCRLEN] contigs = 
self.results[(fwd,rev)].additional[name][ResultsTest.CONTIG] From 1c1f24767f9c5c0469885b6616c31917f8f7db0d Mon Sep 17 00:00:00 2001 From: dr-joe-wirth Date: Thu, 12 Sep 2024 15:03:39 -0400 Subject: [PATCH 11/17] created new class to store PCR product data --- bin/Product.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 bin/Product.py diff --git a/bin/Product.py b/bin/Product.py new file mode 100644 index 0000000..96a25a4 --- /dev/null +++ b/bin/Product.py @@ -0,0 +1,32 @@ +from __future__ import annotations +from typing import Union + +class Product: + """class for storing PCR product data for a pair of primers + """ + def __init__(self, contig:str, size:Union[int,str], fbin:int, rbin:int, dimerTm:float) -> Product: + """constructor + + Args: + contig (str): the contig where the product resides + size (Union[int,str]): the contig size (will be str if multiple sizes) + fbin (int): the bin the forward primer belongs to + rbin (int): the bin the reverse primer belongs to + dimerTm (float): the heterodimer Tm for the primer pair + + Returns: + Product: a Product object + """ + # set attributes + self.contig:str = contig + self.size:Union[int,str] = size + self.dimerTm:float = dimerTm + self.fbin:int = fbin + self.rbin:int = rbin + + # overloads + def __str__(self) -> str: + return f'{self.contig}: {self.size} bp' + + def __repr__(self) -> str: + return str(self) From b599e4d46597fc39c2aaa715711c8edaa473d872 Mon Sep 17 00:00:00 2001 From: dr-joe-wirth Date: Thu, 12 Sep 2024 15:07:43 -0400 Subject: [PATCH 12/17] saving primer homodimer/hairpin Tms for O(1) lookup during sorting (#31) --- bin/Primer.py | 10 ++++++++++ bin/getCandidateKmers.py | 38 +++++++++++++++++++++++++++++++------- 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/bin/Primer.py b/bin/Primer.py index b985f0b..60e2886 100644 --- a/bin/Primer.py +++ b/bin/Primer.py @@ -33,6 +33,10 @@ def __init__(self, seq:Seq, contig:str, start:int, length:int, strand:str) -> Pr self.strand:str = strand self.Tm:float = None self.gcPer:float = None + self.hairpinTm:float = None + self.rcHairpin:float = None + self.homodimerTm:float = None + self.rcHomodimer:float = None self.__length:int = length # flip start and end if on the minus strand @@ -111,6 +115,12 @@ def reverseComplement(self) -> Primer: new = Primer(self.seq.reverse_complement(), self.contig, self.start, len(self), Primer.MINUS) else: new = Primer(self.seq.reverse_complement(), self.contig, self.end, len(self), Primer.PLUS) + + # flip the hairpin/dimer Tms + new.hairpinTm = self.rcHairpin + new.rcHairpin = self.hairpinTm + new.homodimerTm = self.rcHomodimer + new.rcHomodimer = self.homodimerTm # make the new object return new diff --git a/bin/getCandidateKmers.py b/bin/getCandidateKmers.py index 782501e..f729d9a 100644 --- a/bin/getCandidateKmers.py +++ b/bin/getCandidateKmers.py @@ -316,18 +316,26 @@ def noLongRepeats(primer:Primer) -> bool: def noHairpins(primer:Primer) -> bool: """verifies that the primer does not form hairpins """ + # save the hairpin Tms + primer.hairpinTm = primer3.calc_hairpin_tm(str(primer)) + primer.rcHairpin = primer3.calc_hairpin_tm(str(primer.reverseComplement())) + # hairpin tm should be less than (minTm - 5°); need to check both strands - fwdOk = primer3.calc_hairpin_tm(str(primer)) < (minTm - FIVE_DEGREES) - revOk = primer3.calc_hairpin_tm(str(primer.reverseComplement())) < (minTm - FIVE_DEGREES) + fwdOk = primer.hairpinTm < (minTm - FIVE_DEGREES) + revOk = primer.rcHairpin < (minTm - 
FIVE_DEGREES) return fwdOk and revOk def noHomodimers(primer:Primer) -> bool: """verifies that the primer does not form homodimers """ + # save the homodimer Tms + primer.homodimerTm = primer3.calc_homodimer_tm(str(primer)) + primer.rcHomodimer = primer3.calc_homodimer_tm(str(primer.reverseComplement())) + # homodimer tm should be less than (minTm - 5°); need to check both strands - fwdOk = primer3.calc_homodimer_tm(str(primer)) < (minTm - FIVE_DEGREES) - revOk = primer3.calc_homodimer_tm(str(primer.reverseComplement())) < (minTm - FIVE_DEGREES) + fwdOk = primer.homodimerTm < (minTm - FIVE_DEGREES) + revOk = primer.rcHomodimer < (minTm - FIVE_DEGREES) return fwdOk and revOk @@ -403,18 +411,34 @@ def __buildOutput(kmers:dict[str,dict[str,tuple[str,int,str]]], candidates:list[ # identify which sequence is present in the genome try: entry = kmers[cand.seq] + hairpinTm = cand.hairpinTm + rcHairpin = cand.rcHairpin + homodimerTm = cand.homodimerTm + rcHomodimer = cand.rcHomodimer + except: entry = kmers[cand.seq.reverse_complement()] + hairpinTm = cand.rcHairpin + rcHairpin = cand.hairpinTm + homodimerTm = cand.rcHomodimer + rcHomodimer = cand.homodimerTm # for each genome for name in entry.keys(): # extract the data from the entry contig, start, strand = entry[name] - - # save a Primer object in this contig's list + + # create the new primer for this genome + primer = Primer(cand.seq, contig, start, len(cand.seq), strand) + primer.hairpinTm = hairpinTm + primer.rcHairpin = rcHairpin + primer.homodimerTm = homodimerTm + primer.rcHomodimer = rcHomodimer + + # save the Primer in this contig's list out[name] = out.get(name, dict()) out[name][contig] = out[name].get(contig, list()) - out[name][contig].append(Primer(cand.seq, contig, start, len(cand.seq), strand)) + out[name][contig].append(primer) return out From 2f2d628441a1f948dac79ec064bd29b2bf11ce22 Mon Sep 17 00:00:00 2001 From: dr-joe-wirth Date: Thu, 12 Sep 2024 15:13:02 -0400 Subject: [PATCH 13/17] major overhauls to speed up sorting (#31) * implemented Product class to replace tuples of values * saving heterodimer Tm in Product class for O(1) lookup during sorting * utilizing new Tm fields in the Primer class * significantly sped up sorting thanks to Product and Primer changes --- bin/getPrimerPairs.py | 85 ++++++++++++++++++++++++------------ bin/main.py | 39 +++++++++-------- bin/removeOutgroupPrimers.py | 37 ++++++++++------ bin/sortPrimerPairs.py | 52 +++++++++------------- bin/validatePrimers.py | 17 ++++---- 5 files changed, 133 insertions(+), 97 deletions(-) diff --git a/bin/getPrimerPairs.py b/bin/getPrimerPairs.py index 504d77b..050adf8 100644 --- a/bin/getPrimerPairs.py +++ b/bin/getPrimerPairs.py @@ -1,8 +1,9 @@ from Bio.Seq import Seq -from typing import Generator from bin.Primer import Primer +from bin.Product import Product from itertools import combinations import multiprocessing, os, primer3 +from typing import Generator, Union from bin.Parameters import Parameters @@ -144,7 +145,7 @@ def __getBinPairs(binned:dict[str,dict[int,list[Primer]]], minPrimerLen:int, min return out -def _formsDimers(fwd:str, rev:str, fwdTm:float, revTm:float) -> bool: +def _formsDimers(fwd:str, rev:str, fwdTm:float, revTm:float) -> tuple[bool, float]: """evaluates if two primers may form primer dimers Args: @@ -154,13 +155,17 @@ def _formsDimers(fwd:str, rev:str, fwdTm:float, revTm:float) -> bool: revTm (float): the reverse primer melting temperature Returns: - bool: indicates if primer dimer formation is possible + tuple[bool, float]: indicates 
if primer dimer formation is possible; the dimer Tm """ # constant FIVE_DEGREES = 5 - + + dimerTm = primer3.calc_heterodimer_tm(fwd, rev) + # dimer formation is a concern if the Tm is sufficiently high - return primer3.calc_heterodimer_tm(fwd, rev) >= (min(fwdTm, revTm) - FIVE_DEGREES) + highEnough = dimerTm >= (min(fwdTm, revTm) - FIVE_DEGREES) + + return highEnough, dimerTm def __isPairSuitable(fwd:Primer, rev:Primer, minPcr:int, maxPcr:int, maxTmDiff:float) -> tuple[bool,int]: @@ -188,7 +193,7 @@ def __isPairSuitable(fwd:Primer, rev:Primer, minPcr:int, maxPcr:int, maxTmDiff:f return True, pcrLen -def __evaluateOnePair(fwd:str, rev:str, fTm:float, rTm:float, binPair:str) -> str: +def __evaluateOnePair(fwd:str, rev:str, fTm:float, rTm:float, binPair:str) -> Union[tuple[str, float],None]: """evaluates a primer pair; designed to work in parallel; only returns if the pair passes evaluation Args: @@ -199,13 +204,16 @@ def __evaluateOnePair(fwd:str, rev:str, fTm:float, rTm:float, binPair:str) -> st binPair (str): the bin pair for this pair Returns: - str: the bin pair for this pair + tuple[str, float] | None: the bin pair for this pair, the heterodimer Tm; None if the pair does not pass evaluation """ + # get the dimer Tm and if it forms dimers + forms, dimerTm = _formsDimers(fwd, rev, fTm, rTm) + # return acceptable pairs and their pcr product sizes - if not _formsDimers(fwd, rev, fTm, rTm): return binPair + if not forms: return binPair, dimerTm -def __getCandidatePrimerPairs(binPairs:list[tuple[str,int,int]], bins:dict[str,dict[int,list[Primer]]], params:Parameters) -> list[tuple[Primer,Primer,int,tuple[str,int,int]]]: +def __getCandidatePrimerPairs(binPairs:list[tuple[str,int,int]], bins:dict[str,dict[int,list[Primer]]], params:Parameters) -> list[tuple[Primer,Primer,Product]]: """gets candidate primer pairs from pairs of bins Args: @@ -214,7 +222,7 @@ def __getCandidatePrimerPairs(binPairs:list[tuple[str,int,int]], bins:dict[str,d params (Parameters): a Parameters object Returns: - list[tuple[Primer,Primer,int,tuple[str,int,int]]]: a list of primer pairs, the corresponding pcr product size, and the bin pair (contig, bin1, bin2) + list[tuple[Primer,Primer,Product]]: a list of (forward primer, reverse primer, pcr product) """ # constants FWD = 'forward' @@ -279,12 +287,12 @@ def generateArgs() -> Generator[tuple[str, str, float, float, str],None,None]: # evaluate pairs of primers for heterodimer potential in parallel pool = multiprocessing.Pool(params.numThreads) - pairs = pool.starmap(__evaluateOnePair, generateArgs()) + results = pool.starmap(__evaluateOnePair, generateArgs()) pool.close() pool.join() # for each valid pair - for pair in [x for x in pairs if x is not None]: + for pair,dimerTm in [r for r in results if r is not None]: # recast the bin pair from a string to its component parts contig, fbin, rbin = eval(pair) @@ -296,7 +304,7 @@ def generateArgs() -> Generator[tuple[str, str, float, float, str],None,None]: rev:Primer = bins[contig][rbin][jdx] # save this pair in the output - out.append((fwd,rev.reverseComplement(),pcrLen,(contig,fbin,rbin))) + out.append((fwd,rev.reverseComplement(), Product(contig, pcrLen, fbin, rbin, dimerTm))) return out @@ -322,18 +330,21 @@ def __restructureCandidateKmerData(candidates:dict[str,list[Primer]]) -> dict[Se return out -def __getAllSharedPrimerPairs(firstName:str, candidateKmers:dict[str,dict[str,list[Primer]]], candidatePairs:list[tuple[Primer,Primer,int,tuple[str,int,int]]], params:Parameters) -> 
dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]: +def __getAllSharedPrimerPairs(firstName:str, candidateKmers:dict[str,dict[str,list[Primer]]], candidatePairs:list[tuple[Primer,Primer,Product]], params:Parameters) -> dict[tuple[Primer,Primer],dict[str,Product]]: """gets all the primer pairs that are shared in all the genomes Args: firstName (str): the name of the genome that has already been evaluated candidateKmers (dict[str,dict[str,list[Primer]]]): key=genome name; val=dict: key=contig name; val=list of candidate kmers - candidatePairs (list[tuple[Primer,Primer,int,tuple[str,int,int]]]): the list produced by __getCandidatePrimerPairs + candidatePairs (list[tuple[Primer,Primer,Product]): the list produced by __getCandidatePrimerPairs params (Parameters): a Parameters object Returns: - dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]: key=pair of Primers; val=dict: key=genome name; val=tuple: contig name, PCR product length, bin pair (contig, bin1, bin2) + dict[tuple[Primer,Primer],dict[str,Product]]: key=pair of Primers; val=dict: key=genome name; val=Product object """ + # constant + TEMP_BIN = -1 + # initialize variables k1:Primer k2:Primer @@ -351,30 +362,46 @@ def __getAllSharedPrimerPairs(firstName:str, candidateKmers:dict[str,dict[str,li kmers[name] = __restructureCandidateKmerData(candidateKmers[name]) # for each pair of primers in the candidate pairs - for p1,p2,length,binPair in candidatePairs: + for p1,p2,product in candidatePairs: # store the data for the genome (firstName) that has already been evaluated out[(p1,p2)] = dict() - out[(p1,p2)][firstName] = (p1.contig,length,binPair) + out[(p1,p2)][firstName] = product # for each unprocessed genome for name in remaining: - # get the first kmer (may be rev comp) + # get the first kmer (may be rev comp) and its Tms try: - k1 = kmers[name][p1.seq] + k1:Primer = kmers[name][p1.seq] k1_rev = False + k1.hairpinTm = p1.hairpinTm + k1.rcHairpin = p1.rcHairpin + k1.homodimerTm = p1.homodimerTm + k1.rcHomodimer = p1.rcHomodimer except KeyError: k1 = kmers[name][p1.seq.reverse_complement()] k1_rev = True + k1.hairpinTm = p1.rcHairpin + k1.rcHairpin = p1.hairpinTm + k1.homodimerTm = p1.rcHomodimer + k1.rcHomodimer = p1.homodimerTm - # get the second kmer (may be rev comp) + # get the second kmer (may be rev comp) and its Tms try: k2 = kmers[name][p2.seq] k2_rev = False + k2.hairpinTm = p2.hairpinTm + k2.rcHairpin = p2.rcHairpin + k2.homodimerTm = p2.homodimerTm + k2.rcHomodimer = p2.rcHomodimer except KeyError: k2 = kmers[name][p2.seq.reverse_complement()] k2_rev = True + k2.hairpinTm = p2.rcHairpin + k2.rcHairpin = p2.hairpinTm + k2.homodimerTm = p2.rcHomodimer + k2.rcHomodimer = p2.homodimerTm # both primers need to be on the same contig if k1.contig == k2.contig: @@ -400,7 +427,8 @@ def __getAllSharedPrimerPairs(firstName:str, candidateKmers:dict[str,dict[str,li # only save the pair if the length falls within the acceptable range length = rev.end - fwd.start + 1 if length in allowedLengths: - out[(p1,p2)][name] = (fwd.contig, length, binPair) + # the bin numbers are incorrect and need to be fixed later + out[(p1,p2)][name] = Product(fwd.contig, length, TEMP_BIN, TEMP_BIN, product.dimerTm) # remove any pairs that are not universally suitable in all genomes for pair in set(out.keys()): @@ -410,13 +438,13 @@ def __getAllSharedPrimerPairs(firstName:str, candidateKmers:dict[str,dict[str,li return out -def __updateBinsForUnprocessedGenomes(name:str, kmers:dict[str,list[Primer]], 
pairs:dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]) -> None: +def __updateBinsForUnprocessedGenomes(name:str, kmers:dict[str,list[Primer]], pairs:dict[tuple[Primer,Primer],dict[str,Product]]) -> None: """updates the bin pairs for the unprocessed genomes to ensure that all redundant primers are removed Args: name (str): the name of an unprocessed genome kmers (dict[str,list[Primer]]): candidate kmers for this genome (key=contig; val=list of candidate kmers) - pairs (dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]): the dictionary produced by __getAllSharedPrimerPairs + pairs (dict[tuple[Primer,Primer],dict[str,Product]]): the dictionary produced by __getAllSharedPrimerPairs """ # bin the candidate kmers for this genome binned = __binCandidateKmers(kmers) @@ -440,11 +468,12 @@ def __updateBinsForUnprocessedGenomes(name:str, kmers:dict[str,list[Primer]], pa contig, fbin = primerToBin[fwd.reverseComplement()] contig, rbin = primerToBin[rev] - # replace the bin pair with the data for this genome - pairs[(fwd,rev)][name] = pairs[(fwd,rev)][name][:-1] + ((contig, fbin, rbin),) + # update the bin numbers for this pair + pairs[(fwd,rev)][name].fbin = fbin + pairs[(fwd,rev)][name].rbin = rbin -def _getPrimerPairs(candidateKmers:dict[str,dict[str,list[Primer]]], params:Parameters) -> dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]: +def _getPrimerPairs(candidateKmers:dict[str,dict[str,list[Primer]]], params:Parameters) -> dict[tuple[Primer,Primer],dict[str,Product]]: """gets primer pairs found in all the ingroup genomes Args: @@ -456,7 +485,7 @@ def _getPrimerPairs(candidateKmers:dict[str,dict[str,list[Primer]]], params:Para RuntimeError: unable to identify primer pairs in every ingroup genome Returns: - dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]: key=Primer pairs; val=dict: key=genome name; val=tuple: contig, pcr product size, bin pair (contig, bin1, bin2) + dict[tuple[Primer,Primer],dict[str,Product]]: key=Primer pairs; val=dict: key=genome name; val=Product """ # messages ERR_MSG_1 = "could not identify suitable primer pairs from the candidate kmers" diff --git a/bin/main.py b/bin/main.py index ebabf15..c8d02a5 100755 --- a/bin/main.py +++ b/bin/main.py @@ -3,6 +3,7 @@ from bin.Clock import Clock from typing import Generator from bin.Primer import Primer +from bin.Product import Product from Bio.SeqRecord import SeqRecord from bin.Parameters import Parameters from bin.sortPrimerPairs import _sortPairs @@ -90,7 +91,7 @@ def __getCandidates(params:Parameters, sharedExists:bool, clock:Clock) -> dict[s return candidateKmers -def __getUnfilteredPairs(params:Parameters, candidateKmers:dict[str,dict[str,list[Primer]]], clock:Clock) -> dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]: +def __getUnfilteredPairs(params:Parameters, candidateKmers:dict[str,dict[str,list[Primer]]], clock:Clock) -> dict[tuple[Primer,Primer],dict[str,Product]]: """gets the unfiltered pairs Args: @@ -99,7 +100,7 @@ def __getUnfilteredPairs(params:Parameters, candidateKmers:dict[str,dict[str,lis clock (Clock): a Clock object Returns: - dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]: key=Primer pairs; val=dict: key=genome name; val=tuple: contig, pcr product size, bin pair (contig, bin1, bin2) + dict[tuple[Primer,Primer],dict[str,Product]]: key=Primer pairs; val=dict: key=genome name; val=Product """ # messages MSG_1 = "identifying pairs of primers found in all ingroup sequences" @@ 
-126,12 +127,12 @@ def __getUnfilteredPairs(params:Parameters, candidateKmers:dict[str,dict[str,lis return pairs -def __removeOutgroup(params:Parameters, pairs:dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]) -> None: +def __removeOutgroup(params:Parameters, pairs:dict[tuple[Primer,Primer],dict[str,Product]]) -> None: """removes outgroup primers from the pairs dictionary. does not return Args: params (Parameters): a Parameters object - pairs (dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]): the dictionary produced by __getUnfilteredPairs + pairs (dict[tuple[Primer,Primer],dict[str,Product]]): the dictionary produced by __getUnfilteredPairs """ # messages GAP = " "*4 @@ -165,12 +166,12 @@ def __removeOutgroup(params:Parameters, pairs:dict[tuple[Primer,Primer],dict[str params.dumpObj(pairs, params.pickles[Parameters._PAIR_2], "filtered pairs", prefix=GAP) -def __validatePairs(params:Parameters, pairs:dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]) -> None: +def __validatePairs(params:Parameters, pairs:dict[tuple[Primer,Primer],dict[str,Product]]) -> None: """validates primer pairs with in silico PCR and removes invalid pairs Args: params (Parameters): a Parameters object - pairs (dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]): the dictionary produced by __getUnfilteredPairs + pairs (dict[tuple[Primer,Primer],dict[str,Product]]): the dictionary produced by __getUnfilteredPairs """ # messages GAP = ' '*4 @@ -196,17 +197,18 @@ def __validatePairs(params:Parameters, pairs:dict[tuple[Primer,Primer],dict[str, params.dumpObj(pairs, params.pickles[Parameters._PAIR_3], "validated pairs", prefix=GAP) -def __writePrimerPairs(params:Parameters, pairs:dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]], clock:Clock) -> None: +def __writePrimerPairs(params:Parameters, pairs:dict[tuple[Primer,Primer],dict[str,Product]], clock:Clock) -> None: """writes pairs of primers to file Args: params (Parameters): the Parameters object for the run - pairs (dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]])): key=primer pair; val=dict: key=genome name; val=tuple: contig; pcr len; bin pair + pairs (dict[tuple[Primer,Primer],dict[str,Product]])): key=primer pair; val=dict: key=genome name; val=Product clock (Clock): a Clock object """ # messages - MSG_A = "writing " - MSG_B = " primer pairs to " + MSG_1 = "sorting primer pairs" + MSG_2A = "writing " + MSG_2B = " primer pairs to " # contants EOL = "\n" @@ -235,8 +237,8 @@ def getHeaders(names) -> list[str]: # print status params.log.rename(__writePrimerPairs.__name__) - params.log.info(f"{MSG_A}{len(pairs)}{MSG_B}'{os.path.relpath(params.resultsFn)}'") - clock.printStart(f"{MSG_A}{len(pairs)}{MSG_B}'{os.path.relpath(params.resultsFn)}'") + params.log.info(MSG_1) + clock.printStart(MSG_1) # get the names of genomes, and create headers with the same order names = list(next(iter(pairs.values())).keys()) @@ -245,6 +247,11 @@ def getHeaders(names) -> list[str]: # sort the primer pairs sortedPairs = _sortPairs(params, pairs) + # print status + clock.printDone() + params.log.info(f"{MSG_2A}{len(pairs)}{MSG_2B}'{os.path.relpath(params.resultsFn)}'") + clock.printStart(f"{MSG_2A}{len(pairs)}{MSG_2B}'{os.path.relpath(params.resultsFn)}'") + # open the file with open(params.resultsFn, 'w') as fh: # write the headers @@ -263,10 +270,8 @@ def getHeaders(names) -> list[str]: # then save the contig name and PCR product length for each genome for name in 
names: - contig = pairs[(fwd,rev)][name][0] - pcrLen = pairs[(fwd,rev)][name][1] - row.append(contig) - row.append(pcrLen) + row.append(pairs[(fwd,rev)][name].contig) + row.append(pairs[(fwd,rev)][name].size) # write the data to the file fh.write(f"{SEP.join(map(str, row))}{EOL}") @@ -274,7 +279,7 @@ def getHeaders(names) -> list[str]: # print status clock.printDone() - params.log.info(f"done {clock.getTimeString()}") + params.log.info(f"done {clock.getTimeString()}") def __removeIntermediateFiles(params:Parameters) -> None: diff --git a/bin/removeOutgroupPrimers.py b/bin/removeOutgroupPrimers.py index f6819aa..6ae4fd8 100644 --- a/bin/removeOutgroupPrimers.py +++ b/bin/removeOutgroupPrimers.py @@ -1,12 +1,13 @@ from bin.Clock import Clock from typing import Generator from bin.Primer import Primer +from bin.Product import Product from ahocorasick import Automaton from Bio.SeqRecord import SeqRecord from bin.Parameters import Parameters # global constant -__NULL_PRODUCT = ("NA", 0, ()) +__NULL_PRODUCT = ("NA", 0) # functions @@ -23,7 +24,7 @@ def __getAutomaton(pairs:list[tuple[Primer,Primer]]) -> Automaton: out = Automaton() # add each kmer in the pairs to the automaton once and make the automaton - for kmer in {str(x) for y in pairs for x in y}: + for kmer in {str(primer) for pair in pairs for primer in pair}: out.add_word(kmer, kmer) out.make_automaton() @@ -121,16 +122,22 @@ def __getOutgroupProductSizes(kmers:dict[str,dict[str,list[int]]], fwd:str, rev: return out -def __processOutgroupResults(outgroupProducts:dict[str,dict[tuple[Primer,Primer],set[tuple[str,int,tuple]]]], pairs:dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]) -> None: +def __processOutgroupResults(outgroupProducts:dict[str,dict[tuple[Primer,Primer],set[tuple[str,int]]]], pairs:dict[tuple[Primer,Primer],dict[str,Product]]) -> None: """adds the outgroup results to the pairs dictionary Args: - outgroupProducts (dict[str,dict[tuple[Primer,Primer],set[tuple[str,int,tuple]]]]): key=genome name; val=dict: key=primer pair; val=set of tuples: contig, pcrLen, binpair - pairs (dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]): key=Primer pair; val=dict: key=genome name; val=tuple: contig, pcrLen, binpair + outgroupProducts (dict[str,dict[tuple[Primer,Primer],set[tuple[str,int]]]]): key=genome name; val=dict: key=primer pair; val=set of tuples: contig, pcrLen + pairs (dict[tuple[Primer,Primer],dict[str,Product]]): key=Primer pair; val=dict: key=genome name; val=Product Returns: does not return. 
modifies the pairs dictionary """ + # constant + FAKE_BIN = -1 + + # get the name of an ingroup genome + ingroupName = next(iter([n for v in pairs.values() for n in v.keys() if n not in outgroupProducts.keys()])) + # for each pair remaining to process for pair in pairs.keys(): # for each outgroup genome @@ -140,7 +147,7 @@ def __processOutgroupResults(outgroupProducts:dict[str,dict[tuple[Primer,Primer] # if there is only one primer size, then save it if len(result) == 1: - pairs[pair][name] = result.pop() + contig, size = result.pop() # otherwise else: @@ -150,27 +157,31 @@ def __processOutgroupResults(outgroupProducts:dict[str,dict[tuple[Primer,Primer] # if there is only one primer size, then save it if len(result) == 1: - pairs[pair][name] = result.pop() + contig, size = result.pop() # otherwise else: # combine all contigs and pcrLens into separate lists contigs = list() pcrLens = list() - for contig,pcrLen,binPair in result: + for contig,pcrLen in result: contigs.append(contig) pcrLens.append(pcrLen) - # convert the contigs and lengths to comma-separated strings; add empty bin pair - pairs[pair][name] = (",".join(contigs), ",".join(map(str,pcrLens)), ()) + # convert the contigs and lengths to comma-separated strings + contig = ",".join(contigs) + size = ",".join(map(str, pcrLens)) + + # create and save the Product for this pair + pairs[pair][name] = Product(contig, size, FAKE_BIN, FAKE_BIN, pairs[pair][ingroupName].dimerTm) -def _removeOutgroupPrimers(outgroup:dict[str,Generator[SeqRecord,None,None]], pairs:dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]], params:Parameters) -> None: +def _removeOutgroupPrimers(outgroup:dict[str,Generator[SeqRecord,None,None]], pairs:dict[tuple[Primer,Primer],dict[str,Product]], params:Parameters) -> None: """removes primers found in the outgroup that produce disallowed product sizes Args: outgroup (dict[str,Generator[SeqRecord,None,None]]): key=genome name; val=contig generator - pairs (dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]): key=Primer pair; dict:key=genome name; val=tuple: contig, pcr product size, bin pair (contig, num1, num2) + pairs (dict[tuple[Primer,Primer],dict[str,Product]]): key=Primer pair; dict:key=genome name; val=Product params (Parameters): a Parameters object Raises: @@ -274,7 +285,7 @@ def _removeOutgroupPrimers(outgroup:dict[str,Generator[SeqRecord,None,None]], pa # if the pcr product lengths are not disallowed, then save them in the dictionary if not done: - outgroupProducts[name][(fwd,rev)].update({(contig, x, ()) for x in products}) + outgroupProducts[name][(fwd,rev)].update({(contig, x) for x in products}) # log the number of pairs removed and remaining from the last genome if debugging params.log.debug(f"{MSG_5A}{startNumPairs - len(pairs)}{MSG_5B}{name}{MSG_5C}{len(pairs)}{MSG_5D}") diff --git a/bin/sortPrimerPairs.py b/bin/sortPrimerPairs.py index 324d76a..a1e5141 100644 --- a/bin/sortPrimerPairs.py +++ b/bin/sortPrimerPairs.py @@ -1,6 +1,7 @@ +import os, numpy from typing import Union -import os, numpy, primer3 from bin.Primer import Primer +from bin.Product import Product from bin.Parameters import Parameters @@ -45,11 +46,11 @@ def __minDiffFromRange(intgr:int, rng:range) -> int: return min((lower, upper)) -def __sortOutgroupProducts(metadata:dict[str,tuple[str,int,tuple[str,int,int]]], outgroup:list[str], ingroupPcrLens:range) -> tuple[int,int]: +def __sortOutgroupProducts(metadata:dict[str,Product], outgroup:list[str], ingroupPcrLens:range) -> tuple[int,int]: 
"""evaluates the outgroup products for a primer pair in order to sort the pair Args: - metadata (dict[str,tuple[str,int,tuple[str,int,int]]]): the metadata for a primer pair + metadata (dict[str,Product]): the metadata for a primer pair outgroup (list[str]): a list of outgroup filenames ingroupPcrLens (range): the range that ingroup PCR products are within @@ -63,7 +64,7 @@ def __sortOutgroupProducts(metadata:dict[str,tuple[str,int,tuple[str,int,int]]], outgroup = frozenset(map(os.path.basename, outgroup)) # get the outgroup products that are >0bp - products = [v[1] for k,v in metadata.items() if k in outgroup and v != 0] + products = [v.size for k,v in metadata.items() if k in outgroup and v.size != 0] # handle situations where there are no products (best case) if products == []: @@ -92,14 +93,10 @@ def __getHomopolymerTemps(pair:tuple[Primer,Primer]) -> float: float: the sum of both primer's homodimer melting temperatures """ # parse the pair as strings - fwd,rev = map(str, pair) - - # calculate dimer tms - fDimer = primer3.calc_homodimer_tm(fwd) - rDimer = primer3.calc_homodimer_tm(rev) + fwd,rev = pair - # return the sum of these temps - return fDimer + rDimer + # return the sum of the dimer temps + return fwd.homodimerTm + rev.homodimerTm def __getHairpinTemps(pair:tuple[Primer,Primer]) -> float: @@ -112,17 +109,13 @@ def __getHairpinTemps(pair:tuple[Primer,Primer]) -> float: float: the sum of the hairpin melting tempertures """ # parse the pair as strings - fwd,rev = map(str, pair) - - # calculate hairpin tms - fHairpin = primer3.calc_hairpin_tm(fwd) - rHairpin = primer3.calc_hairpin_tm(rev) + fwd,rev = pair - # return the sum of these temps - return fHairpin + rHairpin + # return the sum of the hairpin temps + return fwd.hairpinTm + rev.hairpinTm -def __getHeteroDimerTemp(pair:tuple[Primer,Primer]) -> float: +def __getHeteroDimerTemp(products:dict[str,Product]) -> float: """gets the heterodimer melting temperature for a pair of primers Args: @@ -131,11 +124,8 @@ def __getHeteroDimerTemp(pair:tuple[Primer,Primer]) -> float: Returns: float: the heterodimer melting temperature """ - - # parse the pair as strings - fwd,rev = map(str, pair) - - return primer3.calc_heterodimer_tm(fwd, rev) + # all products have the same dimerTm; extract it from the first one + return next(iter(products.values())).dimerTm def __getGcDiff(pair:tuple[Primer,Primer]) -> float: @@ -168,30 +158,30 @@ def __getTmDiff(pair:tuple[Primer,Primer]) -> float: return abs(fwd.Tm - rev.Tm) -def __sortIngroupProducts(metadata:dict[str,tuple[str,int,tuple[str,int,int]]], ingroup:list[str]) -> tuple[float,float]: +def __sortIngroupProducts(metadata:dict[str,Product], ingroup:list[str]) -> tuple[float,float]: """calculates the median product size for the ingroup genomes Args: - metadata (dict[str,tuple[str,int,tuple[str,int,int]]]): the metadata for a primer pair + metadata (dict[str,Product]): the metadata for a primer pair Returns: - float: the median PCR product size for the ingroup genomes + tuple[float,float]: variance of product sizes; median product size (negated) """ # determine the genome names from the ingroup file names ingroup = frozenset(map(os.path.basename, ingroup)) # extract the sizes from the metadata for the pair - products = [size for name,(contig,size,binpair) in metadata.items() if name in ingroup] + products = [prod.size for name,prod in metadata.items() if name in ingroup] # calculate the variance and the median of the variance = numpy.var(products) median = numpy.var(products) - # return the variance 
diff --git a/bin/validatePrimers.py b/bin/validatePrimers.py
index a89e5fa..a408675 100644
--- a/bin/validatePrimers.py
+++ b/bin/validatePrimers.py
@@ -2,6 +2,7 @@
 from bin.Clock import Clock
 from typing import Generator
 from bin.Primer import Primer
+from bin.Product import Product
 from Bio.SeqRecord import SeqRecord
 from bin.Parameters import Parameters
 import multiprocessing, os, subprocess
@@ -117,10 +118,10 @@ def __runAllPcrs(params:Parameters, pairs:list[tuple[Primer,Primer]]) -> dict[tu

     Args:
         params (Parameters): a Parameters object
-        pairs (list[tuple[Primer,Primer]]): a list of primer pairs
+        pairs (list[tuple[Primer,Primer]]): a list of Primer pairs

     Returns:
-        dict[tuple[str,str],set[tuple[str,int]]]: key=primer pair; val=(contig, pcr product size)
+        dict[tuple[str,str],set[tuple[str,int]]]: key=Primer pair; val=(contig, pcr product size)
     """
     # helper function
     def genArgs() -> Generator[tuple[str,str],None,None]:
@@ -156,29 +157,29 @@ def genArgs() -> Generator[tuple[str,str],None,None]:
     return {k:v for r in results for k,v in r.items()}


-def __filterPairs(pairs:dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]], pcrs:dict[tuple[str,str],list[tuple[str,int]]]) -> None:
+def __filterPairs(pairs:dict[tuple[Primer,Primer],dict[str,Product]], pcrs:dict[tuple[str,str],list[tuple[str,int]]]) -> None:
     """filters primer pairs based on isPcr results; does not return

     Args:
-        pairs (dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]): key=Primer pair; dict:key=genome name; val=tuple: contig, pcr product size, bin pair (contig, num1, num2)
-        pcrs (dict[tuple[str,str],list[tuple[str,int]]]): the dictionary produced by __runPcr
+        pairs (dict[tuple[Primer,Primer],dict[str,Product]]): key=Primer pair; dict:key=genome name; val=Product
+        pcrs (dict[tuple[str,str],list[tuple[str,int]]]): the dictionary produced by __runAllPcrs
     """
     # for each pair
     for pair,observed in pcrs.items():
         # get a set of the expected product sizes (contig, size)
-        expected = {x[:2] for x in pairs[pair].values() if x[0] != 'NA'}
+        expected = {(x.contig, x.size) for x in pairs[pair].values() if x.contig != 'NA'}

         # remove any pairs that do not match the observed product sizes
         if expected != observed:
             del pairs[pair]


-def _validatePrimerPairs(params:Parameters, pairs:dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]) -> None:
+def _validatePrimerPairs(params:Parameters, pairs:dict[tuple[Primer,Primer],dict[str,Product]]) -> None:
     """uses in silico PCR to validate primer pairs

     Args:
         params (Parameters): a Parameters object
-        pairs (dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]):
key=Primer pair; dict:key=genome name; val=tuple: contig, pcr product size, bin pair (contig, num1, num2) + pairs (dict[tuple[Primer,Primer],dict[str,Product]]): key=Primer pair; dict:key=genome name; val=Product Raises: RuntimeError: no valid primer pairs From 18071dae68148680a5ba6342482f9739cd66e4f9 Mon Sep 17 00:00:00 2001 From: dr-joe-wirth Date: Thu, 12 Sep 2024 15:14:18 -0400 Subject: [PATCH 14/17] added additional tests to ensure Tms are stored correctly --- bin/unit_tests/primer_test.py | 32 ++++++++++++++++++++++++ bin/unit_tests/results_test.py | 45 ++++++++++++++++++++++++++++++++-- 2 files changed, 75 insertions(+), 2 deletions(-) diff --git a/bin/unit_tests/primer_test.py b/bin/unit_tests/primer_test.py index 4a47968..efc88f1 100644 --- a/bin/unit_tests/primer_test.py +++ b/bin/unit_tests/primer_test.py @@ -28,6 +28,26 @@ def setUp(self) -> None: self.fwd_2 = Primer(PrimerTest.FWD_SEQ, PrimerTest.CNTG_2, PrimerTest.FSTR_2, len(PrimerTest.FWD_SEQ), Primer.MINUS) self.rev_1 = Primer(PrimerTest.REV_SEQ, PrimerTest.CNTG_1, PrimerTest.RSTR_1, len(PrimerTest.REV_SEQ), Primer.MINUS) self.rev_2 = Primer(PrimerTest.REV_SEQ, PrimerTest.CNTG_2, PrimerTest.RSTR_2, len(PrimerTest.REV_SEQ), Primer.PLUS) + + # save the hairpin Tms + self.fwd_1.hairpinTm = primer3.calc_hairpin_tm(PrimerTest.FWD_SEQ) + self.fwd_1.rcHairpin = primer3.calc_hairpin_tm(str(Seq(PrimerTest.FWD_SEQ).reverse_complement())) + self.fwd_2.hairpinTm = primer3.calc_hairpin_tm(PrimerTest.FWD_SEQ) + self.fwd_2.rcHairpin = primer3.calc_hairpin_tm(str(Seq(PrimerTest.FWD_SEQ).reverse_complement())) + self.rev_1.hairpinTm = primer3.calc_hairpin_tm(PrimerTest.REV_SEQ) + self.rev_1.rcHairpin = primer3.calc_hairpin_tm(str(Seq(PrimerTest.REV_SEQ).reverse_complement())) + self.rev_2.hairpinTm = primer3.calc_hairpin_tm(PrimerTest.REV_SEQ) + self.rev_2.rcHairpin = primer3.calc_hairpin_tm(str(Seq(PrimerTest.REV_SEQ).reverse_complement())) + + # save the homodimer Tms + self.fwd_1.homodimerTm = primer3.calc_homodimer_tm(PrimerTest.FWD_SEQ) + self.fwd_1.rcHomodimer = primer3.calc_homodimer_tm(str(Seq(PrimerTest.FWD_SEQ).reverse_complement())) + self.fwd_2.homodimerTm = primer3.calc_homodimer_tm(PrimerTest.FWD_SEQ) + self.fwd_2.rcHomodimer = primer3.calc_homodimer_tm(str(Seq(PrimerTest.FWD_SEQ).reverse_complement())) + self.rev_1.homodimerTm = primer3.calc_homodimer_tm(PrimerTest.REV_SEQ) + self.rev_1.rcHomodimer = primer3.calc_homodimer_tm(str(Seq(PrimerTest.REV_SEQ).reverse_complement())) + self.rev_2.homodimerTm = primer3.calc_homodimer_tm(PrimerTest.REV_SEQ) + self.rev_2.rcHomodimer = primer3.calc_homodimer_tm(str(Seq(PrimerTest.REV_SEQ).reverse_complement())) def tearDown(self) -> None: return super().tearDown() @@ -235,6 +255,18 @@ def testN_revComp(self) -> None: self.assertEqual(rc_f2.contig, self.fwd_2.contig) self.assertEqual(rc_r1.contig, self.rev_1.contig) self.assertEqual(rc_r2.contig, self.rev_2.contig) + + # check hairpin Tm + self.assertEqual(rc_f1.hairpinTm, self.fwd_1.rcHairpin) + self.assertEqual(rc_f2.hairpinTm, self.fwd_2.rcHairpin) + self.assertEqual(rc_f1.rcHairpin, self.fwd_1.hairpinTm) + self.assertEqual(rc_f2.rcHairpin, self.fwd_2.hairpinTm) + + # check homodimer Tm + self.assertEqual(rc_f1.homodimerTm, self.fwd_1.rcHomodimer) + self.assertEqual(rc_f2.homodimerTm, self.fwd_2.rcHomodimer) + self.assertEqual(rc_f1.rcHomodimer, self.fwd_1.homodimerTm) + self.assertEqual(rc_f2.rcHomodimer, self.fwd_2.homodimerTm) if __name__ == "__main__": diff --git a/bin/unit_tests/results_test.py b/bin/unit_tests/results_test.py 
index 7f98d19..2e9ed4f 100644 --- a/bin/unit_tests/results_test.py +++ b/bin/unit_tests/results_test.py @@ -10,6 +10,7 @@ from bin.main import _runner from khmer import Countgraph from bin.Primer import Primer +from bin.Product import Product from Bio.SeqRecord import SeqRecord from bin.Parameters import Parameters from bin.sortPrimerPairs import _sortPairs @@ -755,8 +756,11 @@ def testK_noPrimerDimers(self) -> None: fwd = Primer(fwd, '', 100, len(fwd), Primer.PLUS) rev = Primer(rev, '', 100, len(rev), Primer.MINUS) + # check dimer Tm + forms,tm = _formsDimers(str(fwd),str(rev.reverseComplement()),fwd.Tm, rev.Tm) + # check for primer dimers; both primers need to be on the same strand - self.assertFalse(_formsDimers(str(fwd),str(rev.reverseComplement()),fwd.Tm, rev.Tm), f'primer dimers present in {fwd}, {rev}') + self.assertFalse(forms, f'primer dimers present in {fwd}, {rev}') def testL_ingroupHasOneProduct(self) -> None: """do all pairs produce one product size in the ingroup @@ -1009,7 +1013,7 @@ def testU_arePairsSortedCorrectly(self) -> None: """ # load the pairs from the pickle with open(self.params.pickles[Parameters._PAIR_3], 'rb') as fh: - pairs:dict[tuple[Primer,Primer]] = pickle.load(fh) + pairs:dict[tuple[Primer,Primer],dict[str,Product]] = pickle.load(fh) # get the expected sort order as a tuples of Seq expected = [(f.seq,r.seq) for f,r in _sortPairs(self.params, pairs)] @@ -1019,6 +1023,43 @@ def testU_arePairsSortedCorrectly(self) -> None: # check that the order is conserved self.assertListEqual(expected, observed, 'expected sort order does not match the order written to file') + + def testV_primerTmsWereCorrect(self) -> None: + """verifies that the pcr product data was populated correctly during runtime + """ + # load the pairs from the pickle + with open(self.params.pickles[Parameters._PAIR_3], 'rb') as fh: + pairs:dict[tuple[Primer,Primer],dict[str,Product]] = pickle.load(fh) + + # get a set of the non-redundant primer sequences + primers = {x for p in pairs.keys() for x in p} + + # for each primer + for primer in primers: + # check hairpin Tms + self.assertAlmostEqual(primer.hairpinTm, primer3.calc_hairpin_tm(str(primer)), places=3) + self.assertAlmostEqual(primer.rcHairpin, primer3.calc_hairpin_tm(str(primer.reverseComplement())), places=3) + + # check homodimer Tms + self.assertAlmostEqual(primer.homodimerTm, primer3.calc_homodimer_tm(str(primer)), places=3) + self.assertAlmostEqual(primer.rcHomodimer, primer3.calc_homodimer_tm(str(primer.reverseComplement())), places=3) + + def testW_productTmsWereCorrect(self) -> None: + # load the pairs from the pickle + with open(self.params.pickles[Parameters._PAIR_3], 'rb') as fh: + pairs:dict[tuple[Primer,Primer],dict[str,Product]] = pickle.load(fh) + + # for each pair + for fwd,rev in pairs.keys(): + # calculate the heterodimer Tm + dimerTm = primer3.calc_heterodimer_tm(str(fwd), str(rev)) + + # for each product + for name in pairs[(fwd,rev)].keys(): + product = pairs[(fwd,rev)][name] + + # check that the dimer Tm was saved correctly + self.assertAlmostEqual(dimerTm, product.dimerTm, places=3) if __name__ == "__main__": From d60bda0493acafb3395885ef833bf739bfa3b151 Mon Sep 17 00:00:00 2001 From: dr-joe-wirth Date: Thu, 12 Sep 2024 15:14:41 -0400 Subject: [PATCH 15/17] checking file writability in a safer way --- bin/Parameters.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/bin/Parameters.py b/bin/Parameters.py index b2366af..ed8b458 100644 --- a/bin/Parameters.py +++ b/bin/Parameters.py @@ -177,11 
+177,9 @@ def __checkOutputFile(fn:str) -> None:
         if proceed == YN[1]:
             raise FileExistsError(f"{WARN_MSG_A}{fn}{WARN_MSG_B}")

-    # make sure the file can be written
+    # make sure we can write the file to the output directory
     try:
-        fh = open(fn, 'w')
-        fh.close()
-        os.remove(fn)
+        os.access(os.path.dirname(fn), os.W_OK)
     except:
         raise ValueError(f"{ERR_MSG}{fn}")


From 4fb2483afbe3ac9e420911aca118e62cc0552b4b Mon Sep 17 00:00:00 2001
From: dr-joe-wirth
Date: Thu, 12 Sep 2024 15:43:03 -0400
Subject: [PATCH 16/17] reduced redundant operations; better documentation

---
 bin/unit_tests/results_test.py | 36 +++++++++++++++-------------------
 1 file changed, 16 insertions(+), 20 deletions(-)

diff --git a/bin/unit_tests/results_test.py b/bin/unit_tests/results_test.py
index 2e9ed4f..2a2e2dc 100644
--- a/bin/unit_tests/results_test.py
+++ b/bin/unit_tests/results_test.py
@@ -111,6 +111,12 @@ def setUpClass(cls) -> None:
         cls.sequences:dict[str,dict[str,dict[str,Seq]]] = ResultsTest._loadGenomeSequences()
         clock.printDone()

+        # load the pairs from the pickle
+        clock.printStart('unpickling pairs')
+        with open(ResultsTest.params.pickles[Parameters._PAIR_3], 'rb') as fh:
+            cls.pairs:dict[tuple[Primer,Primer],dict[str,Product]] = pickle.load(fh)
+        clock.printDone()
+
         # calculate the binding sites
         clock.printStart('getting binding sites for all kmers and primers', end='...\n', spin=False)
         cls.bindingSites:dict[Seq,dict[str,dict[str,dict[str,list[int]]]]] = ResultsTest._getKmerBindingSites(cls.results, cls.sequences, cls.params.minLen, cls.params.maxLen)
@@ -1011,12 +1017,8 @@ def testT_allPairsWritten(self) -> None:
     def testU_arePairsSortedCorrectly(self) -> None:
         """verifies that the pairs were written in the correct order
         """
-        # load the pairs from the pickle
-        with open(self.params.pickles[Parameters._PAIR_3], 'rb') as fh:
-            pairs:dict[tuple[Primer,Primer],dict[str,Product]] = pickle.load(fh)
-
         # get the expected sort order as a tuples of Seq
-        expected = [(f.seq,r.seq) for f,r in _sortPairs(self.params, pairs)]
+        expected = [(f.seq,r.seq) for f,r in _sortPairs(self.params, self.pairs)]

         # get the observed order that was written
         observed = sorted(self.results.keys(), key=lambda x: self.results[x].rowNum)
@@ -1025,14 +1027,10 @@ def testU_arePairsSortedCorrectly(self) -> None:
         self.assertListEqual(expected, observed, 'expected sort order does not match the order written to file')

     def testV_primerTmsWereCorrect(self) -> None:
-        """verifies that the pcr product data was populated correctly during runtime
+        """verifies that the homodimer Tms and hairpin Tms were stored correctly
         """
-        # load the pairs from the pickle
-        with open(self.params.pickles[Parameters._PAIR_3], 'rb') as fh:
-            pairs:dict[tuple[Primer,Primer],dict[str,Product]] = pickle.load(fh)
-
         # get a set of the non-redundant primer sequences
-        primers = {x for p in pairs.keys() for x in p}
+        primers = {x for p in self.pairs.keys() for x in p}

         # for each primer
         for primer in primers:
@@ -1045,22 +1043,20 @@ def testV_primerTmsWereCorrect(self) -> None:
             self.assertAlmostEqual(primer.rcHomodimer, primer3.calc_homodimer_tm(str(primer.reverseComplement())), places=3)

     def testW_productTmsWereCorrect(self) -> None:
-        # load the pairs from the pickle
-        with open(self.params.pickles[Parameters._PAIR_3], 'rb') as fh:
-            pairs:dict[tuple[Primer,Primer],dict[str,Product]] = pickle.load(fh)
-
+        """verifies that heterodimer Tm was correct for all PCR products
+        """
         # for each pair
-        for fwd,rev in pairs.keys():
-            # calculate the heterodimer Tm
+        for fwd,rev in self.pairs.keys():
+            # calculate the heterodimer Tm for this pair
             dimerTm = primer3.calc_heterodimer_tm(str(fwd), str(rev))

             # for each product
-            for name in pairs[(fwd,rev)].keys():
-                product = pairs[(fwd,rev)][name]
+            for name in self.pairs[(fwd,rev)].keys():
+                product = self.pairs[(fwd,rev)][name]

                 # check that the dimer Tm was saved correctly
                 self.assertAlmostEqual(dimerTm, product.dimerTm, places=3)

-
+# entrypoint
 if __name__ == "__main__":
     unittest.main()
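A note on the writability check introduced in PATCH 15 above: os.access reports permission as a boolean return value and does not raise, so wrapping the call in try/except will not, by itself, detect an unwritable output directory. Below is a small sketch of a variant that inspects the return value; the helper name and error message are illustrative and not taken from the codebase.

import os

def checkOutputDirWritable(fn: str) -> None:
    # resolve the directory that will hold the output file
    outDir = os.path.dirname(fn) or "."

    # os.access returns False instead of raising, so the result must be checked explicitly
    if not os.access(outDir, os.W_OK):
        raise ValueError(f"cannot write output file '{fn}'")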
From f3d0f90b13bc1f0478d45c83a1c981e36e01494d Mon Sep 17 00:00:00 2001
From: dr-joe-wirth
Date: Fri, 13 Sep 2024 14:08:10 -0400
Subject: [PATCH 17/17] first draft of a docker github action

---
 .github/workflows/docker-build-push.yml | 41 +++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 .github/workflows/docker-build-push.yml

diff --git a/.github/workflows/docker-build-push.yml b/.github/workflows/docker-build-push.yml
new file mode 100644
index 0000000..b0be019
--- /dev/null
+++ b/.github/workflows/docker-build-push.yml
@@ -0,0 +1,41 @@
+name: Build and Push Docker Image
+
+on:
+  push:
+    tags:
+      - 'v*.*.*'
+
+env:
+  DOCKER_USERNAME: jwirth
+  DOCKER_IMAGE_NAME: primerforge
+
+jobs:
+  docker-build-push:
+    name: docker build and push
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2
+
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ env.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+
+      - name: Extract version from tag
+        run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_ENV
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker
+          file: ./docker/Dockerfile
+          push: true
+          tags: |
+            ${{ env.DOCKER_USERNAME }}/${{ env.DOCKER_IMAGE_NAME }}:v${{ env.VERSION }}
+            ${{ env.DOCKER_USERNAME }}/${{ env.DOCKER_IMAGE_NAME }}:latest
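The Product class (bin/Product.py) that these patches build on is used throughout the series: contig, size, fbin, rbin, and dimerTm are read or written in getPrimerPairs.py, removeOutgroupPrimers.py, sortPrimerPairs.py, validatePrimers.py, and the tests, but its definition is never shown here. The sketch below is only an inference from those usages, offered for orientation while reading the diffs; the real class may differ.

from typing import Union

class Product:
    """a PCR product in one genome: location, size, bin pair, and a cached heterodimer Tm"""
    __slots__ = ("contig", "size", "fbin", "rbin", "dimerTm")

    def __init__(self, contig: str, size: Union[int, str], fbin: int, rbin: int, dimerTm: float) -> None:
        self.contig = contig    # contig name; 'NA' or a comma-separated list for outgroup hits
        self.size = size        # product length, or a comma-separated list of lengths
        self.fbin = fbin        # bin number of the forward primer
        self.rbin = rbin        # bin number of the reverse primer
        self.dimerTm = dimerTm  # heterodimer Tm cached for O(1) lookups while sorting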
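Assuming the workflow above behaves like a standard tag-triggered GitHub Action, publishing a release image should only require pushing a matching version tag (for example, running git tag v1.2.3 followed by git push origin v1.2.3 on the release commit); the job then builds ./docker/Dockerfile with the ./docker context and pushes both jwirth/primerforge:v1.2.3 and jwirth/primerforge:latest. The login step also assumes a DOCKER_PASSWORD repository secret has been configured to go with the hard-coded jwirth username.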