From 2905d1355d816e64de9be7a389fa3eec2b435d35 Mon Sep 17 00:00:00 2001 From: dr-joe-wirth Date: Thu, 29 Aug 2024 15:14:07 -0400 Subject: [PATCH 01/17] parallelized isPcr calls (#30) --- bin/validatePrimers.py | 113 ++++++++++++++++++++++++++++++----------- 1 file changed, 84 insertions(+), 29 deletions(-) diff --git a/bin/validatePrimers.py b/bin/validatePrimers.py index e5172c6..a89e5fa 100644 --- a/bin/validatePrimers.py +++ b/bin/validatePrimers.py @@ -1,9 +1,10 @@ -import os, subprocess from Bio import SeqIO from bin.Clock import Clock +from typing import Generator from bin.Primer import Primer from Bio.SeqRecord import SeqRecord from bin.Parameters import Parameters +import multiprocessing, os, subprocess # global constants @@ -38,38 +39,45 @@ def __makeFastaFile(params:Parameters) -> None: SeqIO.write(SeqRecord(contig.seq, contig.id, '', ''), fh, OUT_FORMAT) -def __makeQueryFile(params:Parameters, pairs:dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]) -> None: - """makes the query file for +def __makeQueryString(pairs:list[tuple[Primer,Primer]]) -> str: + """makes the query string from a list of primer pairs Args: - params (Parameters): a Parameters object - pairs (dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]): key=Primer pair; dict:key=genome name; val=tuple: contig, pcr product size, bin pair (contig, num1, num2) + pairs (list[tuple[Primer,Primer]]): a list of primer pairs + + Returns: + str: the query string """ - # open the file - with open(params.queryFn, 'w') as fh: - for pair in pairs.keys(): - # recast the pair as a strings - fwd,rev = map(str, pair) - - # the "key" column will be the pair separated by a pipe - key = __KEY_SEP.join((fwd,rev)) - - # write the data for each pair to file; one per line - fh.write(__ROW_SEP.join((key,fwd,rev)) + "\n") + # initialize the output + out = "" + + # for each pair + for pair in pairs: + # recast the pair as a strings + fwd,rev = map(str, pair) + + # the "key" column will be the pair separated by a pipe + key = __KEY_SEP.join((fwd,rev)) + + # save the data for each pair; one per line + out += __ROW_SEP.join((key,fwd,rev)) + "\n" + + return out -def __runPcr(params:Parameters) -> dict[tuple[str,str],set[tuple[str,int]]]: - """runs isPcr on the primer pairs +def __runOnePcr(allContigsFna:str, query:str) -> dict[tuple[str,str],set[tuple[str,int]]]: + """runs isPcr on a properly formatted query string Args: - params (Parameters): a Parameters object + allContigsFna (str): the filename containing all the contigs + query (str): the query to search Returns: - dict[tuple[str,str],set[tuple[str,int]]]: key=primer pair (as strings); val=pcr product size + dict[tuple[str,str],set[tuple[str,int]]]: key=primer pair (as strings); val=(contig, pcr product size) """ # constants CMD = 'isPcr' - ARGS = ['stdout', '-out=bed', '-minGood=6', '-minPerfect=8'] + ARGS = ['stdin', 'stdout', '-out=bed', '-minGood=6', '-minPerfect=8'] TILE = '-tileSize=' EMPTY = [''] @@ -77,15 +85,17 @@ def __runPcr(params:Parameters) -> dict[tuple[str,str],set[tuple[str,int]]]: out = dict() # build command - cmd = [CMD, params.allContigsFna, params.queryFn] + cmd = [CMD, allContigsFna] cmd.extend(ARGS) cmd.append(TILE + str(Parameters._MIN_LEN)) - # run the command - results = subprocess.run(cmd, capture_output=True, check=True) + # run the command; inject the query string using a pipe + results = subprocess.run(cmd, input=query.encode(), capture_output=True, check=True) + + # convert the results to a collection of rows (lists of 
strings) + results = map(lambda x: x.split(__ROW_SEP), results.stdout.decode().split('\n')) - # convert the results to a list of values; do not keep empty lines - results = list(map(lambda x: x.split(__ROW_SEP), results.stdout.decode().split('\n'))) + # do not keep empty rows results = [x for x in results if x != EMPTY] # for each result @@ -102,6 +112,50 @@ def __runPcr(params:Parameters) -> dict[tuple[str,str],set[tuple[str,int]]]: return out +def __runAllPcrs(params:Parameters, pairs:list[tuple[Primer,Primer]]) -> dict[tuple[str,str],set[tuple[str,int]]]: + """runs all pcrs in parallel + + Args: + params (Parameters): a Parameters object + pairs (list[tuple[Primer,Primer]]): a list of primer pairs + + Returns: + dict[tuple[str,str],set[tuple[str,int]]]: key=primer pair; val=(contig, pcr product size) + """ + # helper function + def genArgs() -> Generator[tuple[str,str],None,None]: + """generates arguments for __runOnePcr + """ + # constant + MAX_CHUNK = 2000 + + # initialize values for iteration + numPairs = len(pairs) + chunk = min((MAX_CHUNK, numPairs // params.numThreads)) + start = 0 + end = chunk + + # keep iterating until all the pairs have been evaluated + while start <= numPairs: + # make the query for this chunk + query = __makeQueryString(pairs[start:end]) + + # update the start and end for the next chunk + start = end + end += chunk + + yield (params.allContigsFna, query) + + # run pcrs in parallel + pool = multiprocessing.Pool(params.numThreads) + results = pool.starmap(__runOnePcr, genArgs()) + pool.close() + pool.join() + + # combine the results and return them + return {k:v for r in results for k,v in r.items()} + + def __filterPairs(pairs:dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]], pcrs:dict[tuple[str,str],list[tuple[str,int]]]) -> None: """filters primer pairs based on isPcr results; does not return @@ -143,10 +197,11 @@ def _validatePrimerPairs(params:Parameters, pairs:dict[tuple[Primer,Primer],dict clock.printStart(MSG_1) params.log.info(MSG_1) - # create the fasta and query files and run isPcr + # create the fasta file __makeFastaFile(params) - __makeQueryFile(params, pairs) - pcrs = __runPcr(params) + + # run isPcr in parallel + pcrs = __runAllPcrs(params, list(pairs.keys())) # filter out bad pairs __filterPairs(pairs, pcrs) From 38e129fca9d9d12621092aa9702a92f50fb501d0 Mon Sep 17 00:00:00 2001 From: dr-joe-wirth Date: Thu, 5 Sep 2024 13:32:00 -0400 Subject: [PATCH 02/17] removed unused "query file" param --- bin/Parameters.py | 5 ----- bin/main.py | 1 - 2 files changed, 6 deletions(-) diff --git a/bin/Parameters.py b/bin/Parameters.py index 5049974..b2366af 100644 --- a/bin/Parameters.py +++ b/bin/Parameters.py @@ -12,7 +12,6 @@ class Parameters(): _MIN_LEN = 10 __ALLOWED_FORMATS = ('genbank', 'fasta') __ALL_CONTIGS_FNA = ".all_contigs.fna" - __QUERY_FN = ".ispcr_query.tsv" __PICKLE_DIR = "_pickles" _PARAMS = 0 _SHARED = 1 @@ -69,7 +68,6 @@ def __init__(self, author:str, version:str, initializeLog:bool=True) -> Paramete self.pickles:dict[int,str] self.keepIntermediateFiles:bool self.allContigsFna:str - self.queryFn:str # save author and version as private attributes self.__author:str = author @@ -93,9 +91,6 @@ def __init__(self, author:str, version:str, initializeLog:bool=True) -> Paramete # get the all contigs fasta filename self.allContigsFna = os.path.join(os.path.dirname(self.resultsFn), Parameters.__ALL_CONTIGS_FNA) - - # get the ispcr query filename - self.queryFn = os.path.join(os.path.dirname(self.resultsFn), 
Parameters.__QUERY_FN) def __eq__(self, other:Parameters) -> bool: """equality overload diff --git a/bin/main.py b/bin/main.py index ce7f66a..af275b4 100755 --- a/bin/main.py +++ b/bin/main.py @@ -288,7 +288,6 @@ def __removeIntermediateFiles(params:Parameters) -> None: # remove the isPcr files os.remove(params.allContigsFna) - os.remove(params.queryFn) def _runner(params:Parameters) -> None: From 73479a5e694072d8cbeaee87553ae7c5d1890360 Mon Sep 17 00:00:00 2001 From: dr-joe-wirth Date: Thu, 5 Sep 2024 13:38:19 -0400 Subject: [PATCH 03/17] sleeker cleanup --- bin/main.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/bin/main.py b/bin/main.py index af275b4..b9e2936 100755 --- a/bin/main.py +++ b/bin/main.py @@ -1,4 +1,4 @@ -import os +import os, shutil from Bio import SeqIO from bin.Clock import Clock from typing import Generator @@ -279,14 +279,10 @@ def __removeIntermediateFiles(params:Parameters) -> None: Args: params (Parameters): a Parameters object """ - # remove all the pickle files - for fn in params.pickles.values(): - os.remove(fn) + # remove the pickle directory along with any pickles + shutil.rmtree(os.path.dirname(next(iter(params.pickles.values())))) - # remove the pickle directory - os.rmdir(os.path.dirname(fn)) - - # remove the isPcr files + # remove the isPcr query file os.remove(params.allContigsFna) From 5a1da0e0c310b3ac35c140a90a086da1adc382a4 Mon Sep 17 00:00:00 2001 From: dr-joe-wirth Date: Thu, 5 Sep 2024 15:43:28 -0400 Subject: [PATCH 04/17] increment version --- bin/main.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/main.py b/bin/main.py index b9e2936..02e7a46 100755 --- a/bin/main.py +++ b/bin/main.py @@ -11,7 +11,7 @@ from bin.removeOutgroupPrimers import _removeOutgroupPrimers # global constants -__version__ = "1.3.6" +__version__ = "1.3.7" __author__ = "Joseph S. Wirth" # functions diff --git a/setup.py b/setup.py index 8eeb356..e9af4b3 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ setup( name='primerforge', - version='1.3.6', + version='1.3.7', author='Joseph S. 
Wirth', packages=find_packages(), description='software to identify primers that can be used to distinguish genomes', From 4dd9d9cf89f2c30e2168712d6fc6e4c5ef40e4df Mon Sep 17 00:00:00 2001 From: dr-joe-wirth Date: Tue, 10 Sep 2024 11:08:36 -0400 Subject: [PATCH 05/17] implemented a sort method before writing results to file (#31) --- bin/main.py | 6 +- bin/sortPrimerPairs.py | 139 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 144 insertions(+), 1 deletion(-) create mode 100644 bin/sortPrimerPairs.py diff --git a/bin/main.py b/bin/main.py index 02e7a46..ebabf15 100755 --- a/bin/main.py +++ b/bin/main.py @@ -5,6 +5,7 @@ from bin.Primer import Primer from Bio.SeqRecord import SeqRecord from bin.Parameters import Parameters +from bin.sortPrimerPairs import _sortPairs from bin.getPrimerPairs import _getPrimerPairs from bin.validatePrimers import _validatePrimerPairs from bin.getCandidateKmers import _getAllCandidateKmers @@ -241,6 +242,9 @@ def getHeaders(names) -> list[str]: names = list(next(iter(pairs.values())).keys()) headers = getHeaders(names) + # sort the primer pairs + sortedPairs = _sortPairs(params, pairs) + # open the file with open(params.resultsFn, 'w') as fh: # write the headers @@ -248,7 +252,7 @@ def getHeaders(names) -> list[str]: fh.flush() # for each primer pair - for fwd,rev in pairs.keys(): + for fwd,rev in sortedPairs: # save the primer pair data row = [fwd.seq, round(fwd.Tm, NUM_DEC), diff --git a/bin/sortPrimerPairs.py b/bin/sortPrimerPairs.py new file mode 100644 index 0000000..2f73ee3 --- /dev/null +++ b/bin/sortPrimerPairs.py @@ -0,0 +1,139 @@ +import os, numpy, primer3 +from bin.Primer import Primer +from bin.Parameters import Parameters + + +def __getHomopolymerTemps(pair:tuple[Primer,Primer]) -> float: + """returns the sum of the homodimer melting temps for a primer pair + + Args: + pair (tuple[Primer,Primer]): a pair of primers + + Returns: + float: the sum of both primer's homodimer melting temperatures + """ + # parse the pair as strings + fwd,rev = map(str, pair) + + # calculate dimer tms + fDimer = primer3.calc_homodimer_tm(fwd) + rDimer = primer3.calc_homodimer_tm(rev) + + # return the sum of these temps + return fDimer + rDimer + + +def __getHairpinTemps(pair:tuple[Primer,Primer]) -> float: + """gets the sume of hairpin melting tempertaures for a primer pair + + Args: + pair (tuple[Primer,Primer]): a pair of primer + + Returns: + float: the sum of the hairpin melting tempertures + """ + # parse the pair as strings + fwd,rev = map(str, pair) + + # calculate hairpin tms + fHairpin = primer3.calc_hairpin_tm(fwd) + rHairpin = primer3.calc_hairpin_tm(rev) + + # return the sum of these temps + return fHairpin + rHairpin + + +def __getHeteroDimerTemp(pair:tuple[Primer,Primer]) -> float: + """gets the heterodimer melting temperature for a pair of primers + + Args: + pair (tuple[Primer,Primer]): a pair of primers + + Returns: + float: the heterodimer melting temperature + """ + + # parse the pair as strings + fwd,rev = map(str, pair) + + return primer3.calc_heterodimer_tm(fwd, rev) + + +def __getGcDiff(pair:tuple[Primer,Primer]) -> float: + """finds the G+C content (mol %) difference between a pair of primers + + Args: + pair (tuple[Primer,Primer]): a pair of primers + + Returns: + float: the difference in G+C content (mol %) + """ + # parse the pair + fwd,rev = pair + + return abs(fwd.gcPer - rev.gcPer) + + +def __getTmDiff(pair:tuple[Primer,Primer]) -> float: + """finds the melting temperature difference between a pair of primers + + Args: + pair 
(tuple[Primer,Primer]): a pair of primers + + Returns: + float: the difference between melting temperatures + """ + # parse the pair + fwd,rev = pair + + return abs(fwd.Tm - rev.Tm) + + +def __getMedianIngroupProductSize(metadata:dict[str,tuple[str,int,tuple[str,int,int]]], ingroup:list[str]) -> float: + """calculates the median product size for the ingroup genomes + + Args: + metadata (dict[str,tuple[str,int,tuple[str,int,int]]]): the metadata for a primer pair + + Returns: + float: the median PCR product size for the ingroup genomes + """ + # determine the genome names from the ingroup file names + ingroup = set(map(os.path.basename, ingroup)) + + # extract the sizes from the metadata for the pair + sizes = [size for name,(contig,size,binpair) in metadata.items() if name in ingroup] + + # negate the median so that pairs will be sorted largest to smallest + return -numpy.median(sizes) + + +def _sortPairs(params:Parameters, pairs:dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]) -> list[tuple[Primer,Primer]]: + """sorts pairs of primers based on several characteristics: + * first by difference in GC + * next by difference in Tm + * next by primer-dimer Tm + * next by homopolymer Tm + * next by hairpin Tm + * finally by product size (largest to smallest) + + Args: + params (Parameters): a Parameters object + pairs (dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]): key=primer pair; val=dict; key=genome name; val=(contig, pcr size, binpair) + + Returns: + list[tuple[Primer,Primer]]: a list of sorted primer pairs + """ + # sort the results: + # * first by difference in GC + # * next by difference in Tm + # * next by primer-dimer Tm + # * next by homopolymer Tm + # * next by hairpin Tm + # * finally by product size (largest to smallest) + return sorted(pairs.keys(), key=lambda p: (__getGcDiff(p), + __getTmDiff(p), + __getHeteroDimerTemp(p), + __getHomopolymerTemps(p), + __getHairpinTemps(p), + __getMedianIngroupProductSize(pairs[p], params.ingroupFns))) From de650ca9e7f75fb84f51ceba14f02f56d10799d7 Mon Sep 17 00:00:00 2001 From: dr-joe-wirth Date: Tue, 10 Sep 2024 11:09:23 -0400 Subject: [PATCH 06/17] added tests to ensure sort method is functioning properly (#31) --- bin/unit_tests/results_test.py | 43 ++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/bin/unit_tests/results_test.py b/bin/unit_tests/results_test.py index 744c443..88db4cc 100644 --- a/bin/unit_tests/results_test.py +++ b/bin/unit_tests/results_test.py @@ -12,13 +12,15 @@ from bin.Primer import Primer from Bio.SeqRecord import SeqRecord from bin.Parameters import Parameters +from bin.sortPrimerPairs import _sortPairs from bin.getPrimerPairs import _formsDimers import gzip, os, pickle, primer3, signal, subprocess, time, unittest class Result(): """class to save results for easy lookup """ - def __init__(self, fwdTm:float, fwdGc:float, revTm:float, revGc:float, additional:dict[str,dict[str,list]]) -> Result: + def __init__(self, rowNum:int, fwdTm:float, fwdGc:float, revTm:float, revGc:float, additional:dict[str,dict[str,list]]) -> Result: + self.rowNum:int = rowNum self.fwdTm:float = fwdTm self.fwdGc:float = fwdGc self.revTm:float = revTm @@ -361,10 +363,14 @@ def _parseResultsFile(fn:str) -> dict[tuple[Seq,Seq],Result]: out = dict() key = dict() header = True + rowNum = -1 # go through each line in the file with open(fn, 'r') as fh: for line in fh: + # increment the row number + rowNum += 1 + # convert the line to a row of columns line = 
line.rstrip() row = line.split(SEP_1) @@ -411,7 +417,7 @@ def _parseResultsFile(fn:str) -> dict[tuple[Seq,Seq],Result]: additional[name][ResultsTest.CONTIG] = row[idx].split(SEP_2) # store the result for this primer pair - out[(fwd,rev)] = Result(ftm,fgc,rtm,rgc,additional) + out[(fwd,rev)] = Result(rowNum,ftm,fgc,rtm,rgc,additional) return out @@ -990,6 +996,39 @@ def testS_isProductLengthCorrect(self) -> None: # and the pcr lengths should not be disallowed for plen in tru[contig]: self.assertNotIn(plen, self.params.disallowedLens, f"disallowed sizes in {name} with {fwd},{rev}") + + def testT_allPairsWritten(self) -> None: + """verifies that all primer pairs were written to file + """ + # unpickle the pairs + with open(self.params.pickles[Parameters._PAIR_3], 'rb') as fh: + pairs:dict[tuple[Primer,Primer]] = pickle.load(fh) + + # convert the pairs to a set of pairs as Seq + expected = {(f.seq,r.seq) for f,r in pairs.keys()} + + # get the pairs saved in the results as a set + observed = set(self.results.keys()) + + # make sure that the pairs match + self.assertEqual(len(expected), len(observed), f'expected {len(expected)} primer pairs but observed {len(observed)}') + self.assertSetEqual(expected, observed, 'expected pairs do not match those written to file') + + def testU_arePairsSortedCorrectly(self) -> None: + """verifies that the pairs were written in the correct order + """ + # load the pairs from the pickle + with open(self.params.pickles[Parameters._PAIR_3], 'rb') as fh: + pairs:dict[tuple[Primer,Primer]] = pickle.load(fh) + + # get the expected sort order as a tuples of Seq + expected = [(f.seq,r.seq) for f,r in _sortPairs(self.params, pairs)] + + # get the observed order that was written + observed = sorted(self.results.keys(), key=lambda x: self.results[x].rowNum) + + # check that the order is conserved + self.assertListEqual(expected, observed, 'expected sort order does not match the order written to file') if __name__ == "__main__": From c8b4584e5d1f0387d49c0ef4692d638f17262470 Mon Sep 17 00:00:00 2001 From: dr-joe-wirth Date: Wed, 11 Sep 2024 11:16:54 -0400 Subject: [PATCH 07/17] added sorting for outgroup products; improved ingroup product sorting --- bin/sortPrimerPairs.py | 115 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 101 insertions(+), 14 deletions(-) diff --git a/bin/sortPrimerPairs.py b/bin/sortPrimerPairs.py index 2f73ee3..f94d8ac 100644 --- a/bin/sortPrimerPairs.py +++ b/bin/sortPrimerPairs.py @@ -1,8 +1,87 @@ +from typing import Union import os, numpy, primer3 from bin.Primer import Primer from bin.Parameters import Parameters +def __parseOutgroupProductSizes(product:Union[str,int]) -> list[int]: + """parses the PCR product sizes for an outgroup genome + + Args: + product (Union[str,int]): a comma-separated list of ints (str) or a single int (int) + + Returns: + list[int]: a list of PCR product sizes + """ + # constant + SEP = "," + + # convert comma-separated list (str) to a list of ints + if type(product) is str: + out = [int(x) for x in product.split(SEP)] + + # if it just an int, then make it a list + elif type(product) is int: + out = [product] + + return out + + +def __minDiffFromRange(intgr:int, rng:range) -> int: + """calculates the minimum distance of an integer from a range + + Args: + intgr (int): an integer + rng (range): a range + + Returns: + int: the minimum distance of the integer from the range + """ + # calculate the differnces from the upper and lower limits + lower = abs(intgr - min(rng)) + upper = abs(intgr - max(rng)) + + # return 
the minimum distance from the disallowed range + return min((lower, upper)) + + +def __sortOutgroupProducts(metadata:dict[str,tuple[str,int,tuple[str,int,int]]], outgroup:list[str], disallowed:range) -> tuple[int,int]: + """evaluates the outgroup products for a primer pair in order to sort the pair + + Args: + metadata (dict[str,tuple[str,int,tuple[str,int,int]]]): the metadata for a primer pair + outgroup (list[str]): a list of outgroup filenames + disallowed (range): the range that outgroup products are not allowed to be in + + Returns: + tuple[int,int]: minimum distance from the range (negated); the number of outgroup products + """ + # initialize variables + products = list() + + # determine the genome names from the ingroup file names + outgroup = frozenset(map(os.path.basename, outgroup)) + + # get the outgroup products that are >0bp + products = [v[1] for k,v in metadata.items() if k in outgroup and v != 0] + + # handle situations where there are no products (best case) + if products == []: + minDiff = float('inf') # largest number possible + + # otherwise, need to calculate the minimum distance from the disallowed range + else: + # parse the outgroup products so that we have a list ints + products = list(map(__parseOutgroupProductSizes, products)) + products = [y for x in products for y in x] + + # get the smallest value distance from the disallowed range + minDiff = min(map(lambda x:__minDiffFromRange(x, disallowed), products)) + + # return the minimum distance (negated so larger differences are sorted first) and the number of products + return -minDiff, len(products) + + def __getHomopolymerTemps(pair:tuple[Primer,Primer]) -> float: """returns the sum of the homodimer melting temps for a primer pair @@ -89,7 +168,7 @@ def __getTmDiff(pair:tuple[Primer,Primer]) -> float: return abs(fwd.Tm - rev.Tm) -def __getMedianIngroupProductSize(metadata:dict[str,tuple[str,int,tuple[str,int,int]]], ingroup:list[str]) -> float: +def __sortIngroupProducts(metadata:dict[str,tuple[str,int,tuple[str,int,int]]], ingroup:list[str]) -> tuple[float,float]: """calculates the median product size for the ingroup genomes Args: @@ -99,13 +178,17 @@ def __getMedianIngroupProductSize(metadata:dict[str,tuple[str,int,tuple[str,int, float: the median PCR product size for the ingroup genomes """ # determine the genome names from the ingroup file names - ingroup = set(map(os.path.basename, ingroup)) + ingroup = frozenset(map(os.path.basename, ingroup)) # extract the sizes from the metadata for the pair - sizes = [size for name,(contig,size,binpair) in metadata.items() if name in ingroup] + products = [size for name,(contig,size,binpair) in metadata.items() if name in ingroup] + + # calculate the variance and the median of the + variance = numpy.var(products) + median = numpy.var(products) - # negate the median so that pairs will be sorted largest to smallest - return -numpy.median(sizes) + # return the variance and the median (negated to prioritze larger product sizes) + return variance, -median def _sortPairs(params:Parameters, pairs:dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]) -> list[tuple[Primer,Primer]]: @@ -124,16 +207,20 @@ def _sortPairs(params:Parameters, pairs:dict[tuple[Primer,Primer],dict[str,tuple Returns: list[tuple[Primer,Primer]]: a list of sorted primer pairs """ - # sort the results: - # * first by difference in GC - # * next by difference in Tm - # * next by primer-dimer Tm - # * next by homopolymer Tm - # * next by hairpin Tm - # * finally by product size (largest to 
smallest) - return sorted(pairs.keys(), key=lambda p: (__getGcDiff(p), + # sort the results using the following criteria (in order): + # * largest difference from the disallowed range for outgroup PCR products + # * fewest number of outgroup PCR products + # * lowest G+C difference between the pair + # * lowest Tm difference between the pair + # * lowest Tm for heterodimers + # * lowest Tm (sum) for homodimers + # * lowest Tm (sum) for hairpins + # * lowest variance in ingroup PCR product sizes + # * largest median ingroup PCR product size + return sorted(pairs.keys(), key=lambda p: (__sortOutgroupProducts(pairs[p], params.outgroupFns, params.disallowedLens), + __getGcDiff(p), __getTmDiff(p), __getHeteroDimerTemp(p), __getHomopolymerTemps(p), __getHairpinTemps(p), - __getMedianIngroupProductSize(pairs[p], params.ingroupFns))) + __sortIngroupProducts(pairs[p], params.ingroupFns))) From 7cf398cc2d4b3c741fe1c32c76310a772324cf0b Mon Sep 17 00:00:00 2001 From: dr-joe-wirth Date: Wed, 11 Sep 2024 11:33:20 -0400 Subject: [PATCH 08/17] several changes: * added Results section describing sort method * added sort step to workflow * updated the example to match v1.3.7 --- README.md | 69 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 42 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index e9d8d79..b845650 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,21 @@ optional arguments: --debug run in debug mode (default: False) ``` +## Results +The results produced by `primerForge` list one primer pair per line, and these pairs are sorted based on the following criteria (in order of importance): + + * largest difference from the ingroup PCR product range for outgroup PCR products + * lowest number of outgroup PCR products + * lowest G+C difference between the pair + * lowest Tm difference between the pair + * lowest Tm for heterodimers + * lowest Tm (sum) for homodimers + * lowest Tm (sum) for hairpins + * lowest variance in ingroup PCR product sizes + * largest median ingroup PCR product size + +See [the example](#examining-the-results) for details about the file format. + ## Workflow ```mermaid @@ -208,8 +223,8 @@ flowchart TB ispcr["filter pairs using isPcr"] final(["final primer pairs"]) dump5[/"dump to file"/] - write[/"write pairs to file"/] - + write[/"sort pairs and + write to file"/] noout --> ispcr ispcr --> final @@ -223,7 +238,7 @@ This example assumes you have already installed `primerForge` [as described abov ### Motivation In this example, we will use `primerForge` to find pairs of primers between 18bp and 24bp that can be used to differentiate three strains of _Mycoplasma mycoides_ subspecies mycoides (the "ingroup") from two strains of _Mycoplasma mycoides_ subspecies capri (the "outgroup"). The primer pairs identified by `primerForge` are predicted to produce a single PCR product between 500bp and 2000bp in the ingroup. These same primer pairs are predicted to produce PCR products <300bp, >3000bp, or no PCR products at all in the outgroup. -### preparing the workspace +### Preparing the workspace In order to get started, create a directory called `mycoplasma_test` and move into it: ```bash @@ -252,7 +267,7 @@ We will use the following flags to specify specific parameters for this example: You can get a list of all available flags using the command `primerForge --help`. 
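The multi-criteria ordering listed in the Results section above amounts to sorting on a composite key tuple. A minimal, self-contained sketch of that idea is shown below; the `PrimerPair` class and the example values are hypothetical stand-ins for illustration, not primerForge's own API:

```python
# Illustrative only: a composite key tuple sorts on the first criterion and
# breaks ties with the later ones; values that should rank "largest first"
# are negated so that Python's ascending sort still applies.
from dataclasses import dataclass

@dataclass
class PrimerPair:          # hypothetical stand-in for a (fwd, rev) primer pair
    gc_diff: float         # |fwd GC - rev GC|
    tm_diff: float         # |fwd Tm - rev Tm|
    median_product: float  # median ingroup PCR product size

pairs = [PrimerPair(0.0, 0.6, 521),
         PrimerPair(8.3, 0.2, 1645),
         PrimerPair(0.0, 0.4, 788)]

# equal GC differences fall back to Tm difference, then to product size
ranked = sorted(pairs, key=lambda p: (p.gc_diff, p.tm_diff, -p.median_product))
print(ranked)
```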
-Run `primerForge` using the following command (requires at least 3 Gb of RAM): +Run `primerForge` using the following command (requires at least 2 Gb of RAM): ```bash primerForge --ingroup "./i*gbff" --outgroup "./o*gbff" --pcr_prod 500,2000 --bad_sizes 300,3000 --primer_len 18,24 @@ -261,43 +276,43 @@ primerForge --ingroup "./i*gbff" --outgroup "./o*gbff" --pcr_prod 500,2000 --bad After running the command, you should see something like this printed to the screen: ```text +dumping Parameters to '_pickles/parameters.p' ... done 00:00:00.49 identifying kmers suitable for use as primers in all 3 ingroup genome sequences - getting shared ingroup kmers that appear once in each genome ... done 00:01:19.32 - dumping shared kmers to '_pickles/sharedKmers.p' ... done 00:00:07.34 - evaluating kmers ... done 00:03:20.11 + getting shared ingroup kmers that appear once in each genome ... done 00:01:36.15 + dumping shared kmers to '_pickles/sharedKmers.p' ... done 00:00:11.22 + evaluating 2430140 kmers ... done 00:01:51.73 identified 30413 candidate kmers -done 00:04:49.18 - dumping candidate kmers to '_pickles/candidates.p' ... done 00:00:01.03 -identifying pairs of primers found in all ingroup sequences ... done 00:00:11.60 + dumping candidate kmers to '_pickles/candidates.p' ... done 00:00:01.55 +done 00:03:42.94 +identifying pairs of primers found in all ingroup sequences ... done 00:00:11.62 identified 16050 primer pairs shared in all ingroup sequences - dumping unfiltered pairs to '_pickles/pairs.p' ... done 00:00:00.54 + dumping unfiltered pairs to '_pickles/pairs.p' ... done 00:00:00.56 removing primer pairs present in the outgroup sequences - getting outgroup PCR products ... done 00:00:01.01 - filtering primer pairs ... done 00:00:00.53 - processing outgroup results ... done 00:00:00.52 + getting outgroup PCR products ... done 00:00:01.03 + filtering primer pairs ... done 00:00:00.54 + processing outgroup results ... done 00:00:00.54 removed 5905 pairs present in the outgroup (10145 pairs remaining) - dumping filtered pairs to '_pickles/pairs_noOutgroup.p' ... done 00:00:00.53 -validating primer pairs with isPcr ... done 00:00:01.97 - removed 1665 pairs not validated by isPcr (8480 pairs remaining) - dumping validated pairs to '_pickles/pairs_noOutgroup_validated.p' ... done 00:00:00.53 -writing primer pairs to 'results.tsv' ... done 00:00:00.52 + dumping filtered pairs to '_pickles/pairs_noOutgroup.p' ... done 00:00:00.54 +validating primer pairs with isPcr ... done 00:00:03.44 + removed 3659 pairs not validated by isPcr (6486 pairs remaining) + dumping validated pairs to '_pickles/pairs_noOutgroup_validated.p' ... done 00:00:00.54 +writing 6486 primer pairs to 'results.tsv' ... done 00:00:11.62 -total runtime: 00:05:08.49 +total runtime: 00:04:14.50 ``` -As we can see, `primerForge` found 30,413 kmers that were suitable for use as a primer in all three ingroup genomes. It then went on to identify 16,050 primer pairs that would produce PCR products between 500bp and 2000bp in the ingroup genomes. Next, it found that of those 16,050 pairs, 5,905 of them formed PCR products between 300bp and 3000bp in one or more of the outgroup genomes. Finally, it used `isPcr` to validate the remaining 10,145 primer pairs resulting in 8,480 primer pairs being written to file. +As we can see, `primerForge` found 30,413 kmers that were suitable for use as a primer in all three ingroup genomes. 
It then went on to identify 16,050 primer pairs that would produce PCR products between 500bp and 2000bp in the ingroup genomes. Next, it found that of those 16,050 pairs, 5,905 of them formed PCR products between 300bp and 3000bp in one or more of the outgroup genomes. Finally, it used `isPcr` to validate the remaining 10,145 primer pairs resulting in 6,486 primer pairs being written to file. +### Examining the results `primerForge` generated `results.tsv`, the file that contains the sequences and details for each primer pair, and `primerForge.log`. Here are a few lines from `results.tsv`: |fwd_seq|fwd_Tm|fwd_GC|rev_seq|rev_Tm|rev_GC|i1.gbff_contig|i1.gbff_length|i2.gbff_contig|i2.gbff_length|i3.gbff_contig|i3.gbff_length|o1.gbff_contig|o1.gbff_length|o2.gbff_contig|o2.gbff_length| |:------|:-----|:-----|:------|:-----|:-----|:------------:|:------------:|:------------:|:------------:|:------------:|:------------:|:------------:|:------------:|:------------:|:------------:| -|TATGCAACTAATCCCGAGTATCAC|56.1|41.7|TGTAAGTGGCGTTGTATCCC|55.5|50|NZ_LAUX01000130.1|521|NZ_LAUV01000074.1|521|NZ_LAUY01000118.1|521|NA|0|NA|0| -|TGTTCCTTCACACTCAATAACAGC|57.3|41.7|AGAAGGAACAGTCGCTGAAG|55.4|50|NZ_LAUX01000081.1|1645|NZ_LAUV01000036.1|1645|NZ_LAUY01000073.1|1645|NZ_LS483503.1|3091|NZ_CP065583.1|3103| -|AAGGAGAGTATCGCTTAGTTGATG|56|41.7|CAACAGCAGATGGTTTAGAAAGTG|56.4|41.7|NZ_LAUX01000079.1|788|NZ_LAUV01000035.1|788|NZ_LAUY01000072.1|788|NA|0|NA|0| -|AAGGAGAGTATCGCTTAGTTGATG|56|41.7|ACTCCAATTGCTCTTCCTGAAG|56.2|45.5|NZ_LAUX01000079.1|1053|NZ_LAUV01000035.1|1053|NZ_LAUY01000072.1|1053|NA|0|NA|0| -|TGAAATCACCAGCTATTGCATCAG|57.5|41.7|ACATTGCAACTCCTGAGATTTG|55.1|40.9|NZ_LAUX01000081.1|1998|NZ_LAUV01000036.1|1998|NZ_LAUY01000073.1|1998|NA|0|NA|0| +|AGAAGCAAAGGATATGGGAACAAC|57.1|41.7|AAATCAACACCCTCAATAAGCTCC|57.1|41.7|NZ_LAUX01000078.1|804|NZ_LAUV01000035.1|804|NZ_LAUY01000092.1|804|NA|0|NA|0| +|TCCATCTAATGAAGATCAACCAGG|55.9|41.7|CCCTAATTGTGATGAGTTGACAAC|55.9|41.7|NZ_LAUX01000010.1|710|NZ_LAUV01000042.1|713|NZ_LAUY01000078.1|710|NA|0|NA|0| +|CATCAGCTGTTGTAAATAACCCAC|56.2|41.7|GTGGAGCTATGAAACCAATATCAG|55.3|41.7|NZ_LAUX01000117.1|1694|NZ_LAUV01000064.1|1694|NZ_LAUY01000018.1|1694|NZ_LS483503.1|3212|NA|0| -The first six columns show the forward and reverse sequences (5' --> 3') as well as their melting temperatures and their G+C %. Next, for each genome it lists the contig and the PCR product size that is predicted be produced by this pair. For example, the first pair of primers are predicted to produce PCR products of 521bp the ingroup genomes, and the binding sites for this primer pair in the files `i1.gbff`, `i2.gbff`, and `i3.gbff` can be found in contigs `NZ_LAUX01000130.1`, `NZ_LAUV01000074.1`, and `NZ_LAUY01000118.1`, respectively. This same pair is not predicted to produce any PCR products in either outgroup genome. Similarly, the next pair is predicted to produce a PCR product size of 1,645bp in all three ingroup genomes and PCR products sizes of >3,000bp in both the outgroup genomes. +The first six columns show the forward and reverse sequences (5' --> 3') as well as their melting temperatures and their G+C content (mol %). Next, for each genome it lists the contig and the PCR product size that is predicted be produced by this pair. 
For example, the first pair of primers are predicted to produce PCR products of 804bp the ingroup genomes, and the binding sites for this primer pair in the files `i1.gbff`, `i2.gbff`, and `i3.gbff` can be found in contigs `NZ_LAUX01000078.1`, `NZ_LAUV01000035.1`, and `NZ_LAUY01000092.1`, respectively. This same pair is not predicted to produce any PCR products in either outgroup genome. Similarly, the third pair is predicted to produce a PCR product size of 1,694bp in all three ingroup genomes, no products `o2.gbff`, and a PCR product >3,000bp in `o1.gbff`. If a primer pair is predicted to produce multiple products in an outgroup genome, then the contig column and the size column will list contigs and sizes in a comma-separated list linked by position. For example, if a primer pair was expected to produce a product of 1,990bp in `contig_1` and 2,024bp in `contig_2` in the genome file `example.gbff`, then the columns for this genome would look like this: From 3e2131f10bd01473b8c1dc535399ba389b59b34c Mon Sep 17 00:00:00 2001 From: dr-joe-wirth Date: Wed, 11 Sep 2024 11:33:53 -0400 Subject: [PATCH 09/17] revised sort method for outgroup products --- bin/sortPrimerPairs.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/bin/sortPrimerPairs.py b/bin/sortPrimerPairs.py index f94d8ac..324d76a 100644 --- a/bin/sortPrimerPairs.py +++ b/bin/sortPrimerPairs.py @@ -45,13 +45,13 @@ def __minDiffFromRange(intgr:int, rng:range) -> int: return min((lower, upper)) -def __sortOutgroupProducts(metadata:dict[str,tuple[str,int,tuple[str,int,int]]], outgroup:list[str], disallowed:range) -> tuple[int,int]: +def __sortOutgroupProducts(metadata:dict[str,tuple[str,int,tuple[str,int,int]]], outgroup:list[str], ingroupPcrLens:range) -> tuple[int,int]: """evaluates the outgroup products for a primer pair in order to sort the pair Args: metadata (dict[str,tuple[str,int,tuple[str,int,int]]]): the metadata for a primer pair outgroup (list[str]): a list of outgroup filenames - disallowed (range): the range that outgroup products are not allowed to be in + ingroupPcrLens (range): the range that ingroup PCR products are within Returns: tuple[int,int]: minimum distance from the range (negated); the number of outgroup products @@ -76,7 +76,7 @@ def __sortOutgroupProducts(metadata:dict[str,tuple[str,int,tuple[str,int,int]]], products = [y for x in products for y in x] # get the smallest value distance from the disallowed range - minDiff = min(map(lambda x:__minDiffFromRange(x, disallowed), products)) + minDiff = min(map(lambda x:__minDiffFromRange(x, ingroupPcrLens), products)) # return the minimum distance (negated so larger differences are sorted first) and the number of products return -minDiff, len(products) @@ -208,7 +208,7 @@ def _sortPairs(params:Parameters, pairs:dict[tuple[Primer,Primer],dict[str,tuple list[tuple[Primer,Primer]]: a list of sorted primer pairs """ # sort the results using the following criteria (in order): - # * largest difference from the disallowed range for outgroup PCR products + # * largest difference from the ingroup product range for outgroup PCR products # * fewest number of outgroup PCR products # * lowest G+C difference between the pair # * lowest Tm difference between the pair @@ -217,7 +217,9 @@ def _sortPairs(params:Parameters, pairs:dict[tuple[Primer,Primer],dict[str,tuple # * lowest Tm (sum) for hairpins # * lowest variance in ingroup PCR product sizes # * largest median ingroup PCR product size - return sorted(pairs.keys(), key=lambda p: 
(__sortOutgroupProducts(pairs[p], params.outgroupFns, params.disallowedLens), + return sorted(pairs.keys(), key=lambda p: (__sortOutgroupProducts(pairs[p], + params.outgroupFns, + range(params.minPcr, params.maxPcr+1)), __getGcDiff(p), __getTmDiff(p), __getHeteroDimerTemp(p), From 79f726fc1c897511ab18beedd1444b94c1e85c70 Mon Sep 17 00:00:00 2001 From: dr-joe-wirth Date: Wed, 11 Sep 2024 12:47:44 -0400 Subject: [PATCH 10/17] only ask for dataset when downloading genomes --- bin/unit_tests/results_test.py | 78 +++++++++++++++------------------- 1 file changed, 34 insertions(+), 44 deletions(-) diff --git a/bin/unit_tests/results_test.py b/bin/unit_tests/results_test.py index 88db4cc..7f98d19 100644 --- a/bin/unit_tests/results_test.py +++ b/bin/unit_tests/results_test.py @@ -39,20 +39,22 @@ class ResultsTest(unittest.TestCase): FAKE_FN = 'fakefile' PCRLEN = "length" CONTIG = "contig" + INGROUP_FILES = ('i1.gbff', 'i2.gbff', 'i3.gbff') + OUTGROUP_FILES = ('o1.gbff', 'o2.gbff') # default ingroup and outgroup genomes (Mycoplasma mycoides) - DEFAULT_INGROUP = {'i1.gbff': 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/034/305/GCF_003034305.1_ASM303430v1/GCF_003034305.1_ASM303430v1_genomic.gbff.gz', - 'i2.gbff': 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/034/275/GCF_003034275.1_ASM303427v1/GCF_003034275.1_ASM303427v1_genomic.gbff.gz', - 'i3.gbff': 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/034/345/GCF_003034345.1_ASM303434v1/GCF_003034345.1_ASM303434v1_genomic.gbff.gz'} - DEFAULT_OUTGROUP = {'o1.gbff': 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/900/489/555/GCF_900489555.1_MMC68/GCF_900489555.1_MMC68_genomic.gbff.gz', - 'o2.gbff': 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/018/389/745/GCF_018389745.1_ASM1838974v1/GCF_018389745.1_ASM1838974v1_genomic.gbff.gz'} + DEFAULT_INGROUP = {INGROUP_FILES[0]: 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/034/305/GCF_003034305.1_ASM303430v1/GCF_003034305.1_ASM303430v1_genomic.gbff.gz', + INGROUP_FILES[1]: 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/034/275/GCF_003034275.1_ASM303427v1/GCF_003034275.1_ASM303427v1_genomic.gbff.gz', + INGROUP_FILES[2]: 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/034/345/GCF_003034345.1_ASM303434v1/GCF_003034345.1_ASM303434v1_genomic.gbff.gz'} + DEFAULT_OUTGROUP = {OUTGROUP_FILES[0]: 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/900/489/555/GCF_900489555.1_MMC68/GCF_900489555.1_MMC68_genomic.gbff.gz', + OUTGROUP_FILES[1]: 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/018/389/745/GCF_018389745.1_ASM1838974v1/GCF_018389745.1_ASM1838974v1_genomic.gbff.gz'} # alternate ingroup and outgroup genomes (Escherichia coli) - ALTRNTE_INGROUP = {'i1.gbff': 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/650/295/GCF_001650295.1_ASM165029v1/GCF_001650295.1_ASM165029v1_genomic.gbff.gz', - 'i2.gbff': 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/008/727/175/GCF_008727175.1_ASM872717v1/GCF_008727175.1_ASM872717v1_genomic.gbff.gz', - 'i3.gbff': 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/208/865/GCF_002208865.2_ASM220886v2/GCF_002208865.2_ASM220886v2_genomic.gbff.gz'} - ALTRNTE_OUTGROUP = {'o1.gbff': 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_genomic.gbff.gz', - 'o2.gbff': 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/006/945/GCF_000006945.2_ASM694v2/GCF_000006945.2_ASM694v2_genomic.gbff.gz'} + ALTRNTE_INGROUP = {INGROUP_FILES[0]: 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/650/295/GCF_001650295.1_ASM165029v1/GCF_001650295.1_ASM165029v1_genomic.gbff.gz', + 
INGROUP_FILES[1]: 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/008/727/175/GCF_008727175.1_ASM872717v1/GCF_008727175.1_ASM872717v1_genomic.gbff.gz', + INGROUP_FILES[2]: 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/208/865/GCF_002208865.2_ASM220886v2/GCF_002208865.2_ASM220886v2_genomic.gbff.gz'} + ALTRNTE_OUTGROUP = {OUTGROUP_FILES[0]: 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_genomic.gbff.gz', + OUTGROUP_FILES[1]: 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/006/945/GCF_000006945.2_ASM694v2/GCF_000006945.2_ASM694v2_genomic.gbff.gz'} @classmethod def setUpClass(cls) -> None: @@ -64,24 +66,23 @@ def setUpClass(cls) -> None: # get the number of threads to use numThreads = ResultsTest._getNumThreads() - # determine which dataset to use - cls.ingroup:dict[str,str] - cls.outgroup:dict[str,str] - cls.ingroup, cls.outgroup = ResultsTest._selectDataset() - # make the test directory if it doesn't exist if not os.path.isdir(ResultsTest.TEST_DIR): os.mkdir(ResultsTest.TEST_DIR) - # get a list of all the genome files - allFiles = list(ResultsTest.ingroup.keys()) + list(ResultsTest.outgroup.keys()) - allFiles = map(os.path.join, [ResultsTest.TEST_DIR]*len(allFiles), allFiles) + # get a list of the genome filenames + cls.seqFiles:list[str] = list(map(lambda x: os.path.join(ResultsTest.TEST_DIR, x), + ResultsTest.INGROUP_FILES + ResultsTest.OUTGROUP_FILES)) # download genome files if necessary - if not all(map(os.path.exists, allFiles)): + if not all(map(os.path.exists, ResultsTest.seqFiles)): + # determine which dataset to use + ingroupFtp, outgroupFtp = ResultsTest._selectDataset() + + # download the files clock.printStart('downloading genomes') - ResultsTest._downloadGenomesForGroup(ResultsTest.ingroup) - ResultsTest._downloadGenomesForGroup(ResultsTest.outgroup) + ResultsTest._downloadGenomesForGroup(ingroupFtp) + ResultsTest._downloadGenomesForGroup(outgroupFtp) clock.printDone() # get the parameters @@ -91,13 +92,13 @@ def setUpClass(cls) -> None: # run primerForge if results file does not exist if not os.path.exists(ResultsTest.params.resultsFn): # run primerForge - clock.printStart('running primerForge', end=' ...\n', spin=False) + clock.printStart('running primerForge', end=' ...\n\n', spin=False) _runner(ResultsTest.params) clock.printDone() # otherwise using existing file else: - print('running tests on existing files') + print('running tests on existing files\n') # load the results file into memory clock.printStart('reading results into memory') @@ -447,18 +448,7 @@ def _loadGenomeSequences() -> dict[str,dict[str,dict[str,Seq]]]: Returns: dict[str,dict[str,dict[str,Seq]]]: key=genome name; val=dict: key=contig name; val=dict: key=strand; val=sequence """ - # initialize output - out = dict() - - # load ingroup sequences - for fn in ResultsTest.ingroup.keys(): - out[fn] = ResultsTest._loadOneGenomeSequence(fn) - - # load outgroup sequences - for fn in ResultsTest.outgroup.keys(): - out[fn] = ResultsTest._loadOneGenomeSequence(fn) - - return out + return {os.path.basename(fn):ResultsTest._loadOneGenomeSequence(fn) for fn in ResultsTest.seqFiles} def _kmerListToDict(klist:list[tuple[str,int]]) -> dict[str,list[int]]: """converts a list of kmers to a dictionary @@ -777,7 +767,7 @@ def testL_ingroupHasOneProduct(self) -> None: # for each pair in the ingroup for pair in self.results.keys(): - for name in ResultsTest.ingroup.keys(): + for name in ResultsTest.INGROUP_FILES: # there should be exactly one pcr product size for each 
ingroup genome self.assertEqual(len(self.results[pair].additional[name][ResultsTest.PCRLEN]), 1, f"{FAIL_MSG_A}{pair}{FAIL_MSG_B}{name}") self.assertEqual(len(self.results[pair].additional[name][ResultsTest.CONTIG]), 1, f"{FAIL_MSG_A}{pair}{FAIL_MSG_B}{name}") @@ -791,7 +781,7 @@ def testM_ingroupProductsWithinRange(self) -> None: # for each pari in the ingroup for pair in self.results.keys(): - for name in ResultsTest.ingroup.keys(): + for name in ResultsTest.INGROUP_FILES: # make sure the pcr products are within the allowed range self.assertGreaterEqual(self.results[pair].additional[name][ResultsTest.PCRLEN][0], self.params.minPcr, f"{FAIL_MSG_A}{pair}{FAIL_MSG_B}{name}") self.assertLessEqual(self.results[pair].additional[name][ResultsTest.PCRLEN][0], self.params.maxPcr, f"{FAIL_MSG_A}{pair}{FAIL_MSG_B}{name}") @@ -805,7 +795,7 @@ def testN_outgroupProductsSavedCorrectly(self) -> None: # for each pair in the outgroup for pair in self.results.keys(): - for name in ResultsTest.outgroup.keys(): + for name in ResultsTest.OUTGROUP_FILES: # the length of the contig and pcr size lists should be equal numContigs = len(self.results[pair].additional[name][ResultsTest.CONTIG]) numPcrLens = len(self.results[pair].additional[name][ResultsTest.PCRLEN]) @@ -821,7 +811,7 @@ def testO_outgroupProductsNaAreZero(self) -> None: # for each pair in the outgroup for pair in self.results.keys(): - for name in ResultsTest.outgroup.keys(): + for name in ResultsTest.OUTGROUP_FILES: # for each pcr product saved for idx in range(len(self.results[pair].additional[name][ResultsTest.CONTIG])): # if the contig is NA, then the pcr size should be zero @@ -841,7 +831,7 @@ def testP_outgroupProductsNotWithinDisallowedRange(self) -> None: # for each pair in the outgroup for pair in self.results.keys(): - for name in ResultsTest.outgroup.keys(): + for name in ResultsTest.OUTGROUP_FILES: # each pcr product size should not be in the disallowed range for pcrLen in self.results[pair].additional[name][ResultsTest.PCRLEN]: self.assertNotIn(pcrLen, self.params.disallowedLens, f"{FAIL_MSG_A}{pair}{FAIL_MSG_B}{name}") @@ -858,7 +848,7 @@ def testQ_isRepeatedInIngroup(self) -> None: rbind = self.bindingSites[rev] # for each ingroup genome - for name in ResultsTest.ingroup.keys(): + for name in ResultsTest.INGROUP_FILES: # count the number of binding sites in the genome for each primer fCount = 0 rCount = 0 @@ -880,7 +870,7 @@ def testR_onSeparateStrandsInIngroup(self) -> None: rbind = self.bindingSites[rev] # only evaluate for the ingroup - for name in ResultsTest.ingroup.keys(): + for name in ResultsTest.INGROUP_FILES: for contig in fbind[name].keys(): # if the forward primer is on the (+) strand if fbind[name][contig][Primer.PLUS] != []: @@ -914,7 +904,7 @@ def testS_isProductLengthCorrect(self) -> None: rbind = self.bindingSites[rev] # for each ingroup genome - for name in ResultsTest.ingroup.keys(): + for name in ResultsTest.INGROUP_FILES: # extract the stored pcr length and the contig pcrLen = self.results[(fwd,rev)].additional[name][ResultsTest.PCRLEN][0] contig = self.results[(fwd,rev)].additional[name][ResultsTest.CONTIG][0] @@ -939,7 +929,7 @@ def testS_isProductLengthCorrect(self) -> None: self.assertEqual(pcrLen, truLen, f"bad pcr product sizes in {name} for {fwd}, {rev}") # for each outgroup genome - for name in ResultsTest.outgroup.keys(): + for name in ResultsTest.OUTGROUP_FILES: # get the list of pcr products and their contigs pcrLens = self.results[(fwd,rev)].additional[name][ResultsTest.PCRLEN] contigs = 
self.results[(fwd,rev)].additional[name][ResultsTest.CONTIG] From 1c1f24767f9c5c0469885b6616c31917f8f7db0d Mon Sep 17 00:00:00 2001 From: dr-joe-wirth Date: Thu, 12 Sep 2024 15:03:39 -0400 Subject: [PATCH 11/17] created new class to store PCR product data --- bin/Product.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 bin/Product.py diff --git a/bin/Product.py b/bin/Product.py new file mode 100644 index 0000000..96a25a4 --- /dev/null +++ b/bin/Product.py @@ -0,0 +1,32 @@ +from __future__ import annotations +from typing import Union + +class Product: + """class for storing PCR product data for a pair of primers + """ + def __init__(self, contig:str, size:Union[int,str], fbin:int, rbin:int, dimerTm:float) -> Product: + """constructor + + Args: + contig (str): the contig where the product resides + size (Union[int,str]): the contig size (will be str if multiple sizes) + fbin (int): the bin the forward primer belongs to + rbin (int): the bin the reverse primer belongs to + dimerTm (float): the heterodimer Tm for the primer pair + + Returns: + Product: a Product object + """ + # set attributes + self.contig:str = contig + self.size:Union[int,str] = size + self.dimerTm:float = dimerTm + self.fbin:int = fbin + self.rbin:int = rbin + + # overloads + def __str__(self) -> str: + return f'{self.contig}: {self.size} bp' + + def __repr__(self) -> str: + return str(self) From b599e4d46597fc39c2aaa715711c8edaa473d872 Mon Sep 17 00:00:00 2001 From: dr-joe-wirth Date: Thu, 12 Sep 2024 15:07:43 -0400 Subject: [PATCH 12/17] saving primer homodimer/hairpin Tms for O(1) lookup during sorting (#31) --- bin/Primer.py | 10 ++++++++++ bin/getCandidateKmers.py | 38 +++++++++++++++++++++++++++++++------- 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/bin/Primer.py b/bin/Primer.py index b985f0b..60e2886 100644 --- a/bin/Primer.py +++ b/bin/Primer.py @@ -33,6 +33,10 @@ def __init__(self, seq:Seq, contig:str, start:int, length:int, strand:str) -> Pr self.strand:str = strand self.Tm:float = None self.gcPer:float = None + self.hairpinTm:float = None + self.rcHairpin:float = None + self.homodimerTm:float = None + self.rcHomodimer:float = None self.__length:int = length # flip start and end if on the minus strand @@ -111,6 +115,12 @@ def reverseComplement(self) -> Primer: new = Primer(self.seq.reverse_complement(), self.contig, self.start, len(self), Primer.MINUS) else: new = Primer(self.seq.reverse_complement(), self.contig, self.end, len(self), Primer.PLUS) + + # flip the hairpin/dimer Tms + new.hairpinTm = self.rcHairpin + new.rcHairpin = self.hairpinTm + new.homodimerTm = self.rcHomodimer + new.rcHomodimer = self.homodimerTm # make the new object return new diff --git a/bin/getCandidateKmers.py b/bin/getCandidateKmers.py index 782501e..f729d9a 100644 --- a/bin/getCandidateKmers.py +++ b/bin/getCandidateKmers.py @@ -316,18 +316,26 @@ def noLongRepeats(primer:Primer) -> bool: def noHairpins(primer:Primer) -> bool: """verifies that the primer does not form hairpins """ + # save the hairpin Tms + primer.hairpinTm = primer3.calc_hairpin_tm(str(primer)) + primer.rcHairpin = primer3.calc_hairpin_tm(str(primer.reverseComplement())) + # hairpin tm should be less than (minTm - 5°); need to check both strands - fwdOk = primer3.calc_hairpin_tm(str(primer)) < (minTm - FIVE_DEGREES) - revOk = primer3.calc_hairpin_tm(str(primer.reverseComplement())) < (minTm - FIVE_DEGREES) + fwdOk = primer.hairpinTm < (minTm - FIVE_DEGREES) + revOk = primer.rcHairpin < (minTm - 
FIVE_DEGREES) return fwdOk and revOk def noHomodimers(primer:Primer) -> bool: """verifies that the primer does not form homodimers """ + # save the homodimer Tms + primer.homodimerTm = primer3.calc_homodimer_tm(str(primer)) + primer.rcHomodimer = primer3.calc_homodimer_tm(str(primer.reverseComplement())) + # homodimer tm should be less than (minTm - 5°); need to check both strands - fwdOk = primer3.calc_homodimer_tm(str(primer)) < (minTm - FIVE_DEGREES) - revOk = primer3.calc_homodimer_tm(str(primer.reverseComplement())) < (minTm - FIVE_DEGREES) + fwdOk = primer.homodimerTm < (minTm - FIVE_DEGREES) + revOk = primer.rcHomodimer < (minTm - FIVE_DEGREES) return fwdOk and revOk @@ -403,18 +411,34 @@ def __buildOutput(kmers:dict[str,dict[str,tuple[str,int,str]]], candidates:list[ # identify which sequence is present in the genome try: entry = kmers[cand.seq] + hairpinTm = cand.hairpinTm + rcHairpin = cand.rcHairpin + homodimerTm = cand.homodimerTm + rcHomodimer = cand.rcHomodimer + except: entry = kmers[cand.seq.reverse_complement()] + hairpinTm = cand.rcHairpin + rcHairpin = cand.hairpinTm + homodimerTm = cand.rcHomodimer + rcHomodimer = cand.homodimerTm # for each genome for name in entry.keys(): # extract the data from the entry contig, start, strand = entry[name] - - # save a Primer object in this contig's list + + # create the new primer for this genome + primer = Primer(cand.seq, contig, start, len(cand.seq), strand) + primer.hairpinTm = hairpinTm + primer.rcHairpin = rcHairpin + primer.homodimerTm = homodimerTm + primer.rcHomodimer = rcHomodimer + + # save the Primer in this contig's list out[name] = out.get(name, dict()) out[name][contig] = out[name].get(contig, list()) - out[name][contig].append(Primer(cand.seq, contig, start, len(cand.seq), strand)) + out[name][contig].append(primer) return out From 2f2d628441a1f948dac79ec064bd29b2bf11ce22 Mon Sep 17 00:00:00 2001 From: dr-joe-wirth Date: Thu, 12 Sep 2024 15:13:02 -0400 Subject: [PATCH 13/17] major overhauls to speed up sorting (#31) * implemented Product class to replace tuples of values * saving heterodimer Tm in Product class for O(1) lookup during sorting * utilizing new Tm fields in the Primer class * significantly sped up sorting thanks to Product and Primer changes --- bin/getPrimerPairs.py | 85 ++++++++++++++++++++++++------------ bin/main.py | 39 +++++++++-------- bin/removeOutgroupPrimers.py | 37 ++++++++++------ bin/sortPrimerPairs.py | 52 +++++++++------------- bin/validatePrimers.py | 17 ++++---- 5 files changed, 133 insertions(+), 97 deletions(-) diff --git a/bin/getPrimerPairs.py b/bin/getPrimerPairs.py index 504d77b..050adf8 100644 --- a/bin/getPrimerPairs.py +++ b/bin/getPrimerPairs.py @@ -1,8 +1,9 @@ from Bio.Seq import Seq -from typing import Generator from bin.Primer import Primer +from bin.Product import Product from itertools import combinations import multiprocessing, os, primer3 +from typing import Generator, Union from bin.Parameters import Parameters @@ -144,7 +145,7 @@ def __getBinPairs(binned:dict[str,dict[int,list[Primer]]], minPrimerLen:int, min return out -def _formsDimers(fwd:str, rev:str, fwdTm:float, revTm:float) -> bool: +def _formsDimers(fwd:str, rev:str, fwdTm:float, revTm:float) -> tuple[bool, float]: """evaluates if two primers may form primer dimers Args: @@ -154,13 +155,17 @@ def _formsDimers(fwd:str, rev:str, fwdTm:float, revTm:float) -> bool: revTm (float): the reverse primer melting temperature Returns: - bool: indicates if primer dimer formation is possible + tuple[bool, float]: indicates 
if primer dimer formation is possible; the dimer Tm """ # constant FIVE_DEGREES = 5 - + + dimerTm = primer3.calc_heterodimer_tm(fwd, rev) + # dimer formation is a concern if the Tm is sufficiently high - return primer3.calc_heterodimer_tm(fwd, rev) >= (min(fwdTm, revTm) - FIVE_DEGREES) + highEnough = dimerTm >= (min(fwdTm, revTm) - FIVE_DEGREES) + + return highEnough, dimerTm def __isPairSuitable(fwd:Primer, rev:Primer, minPcr:int, maxPcr:int, maxTmDiff:float) -> tuple[bool,int]: @@ -188,7 +193,7 @@ def __isPairSuitable(fwd:Primer, rev:Primer, minPcr:int, maxPcr:int, maxTmDiff:f return True, pcrLen -def __evaluateOnePair(fwd:str, rev:str, fTm:float, rTm:float, binPair:str) -> str: +def __evaluateOnePair(fwd:str, rev:str, fTm:float, rTm:float, binPair:str) -> Union[tuple[str, float],None]: """evaluates a primer pair; designed to work in parallel; only returns if the pair passes evaluation Args: @@ -199,13 +204,16 @@ def __evaluateOnePair(fwd:str, rev:str, fTm:float, rTm:float, binPair:str) -> st binPair (str): the bin pair for this pair Returns: - str: the bin pair for this pair + tuple[str, float] | None: the bin pair for this pair, the heterodimer Tm; None if the pair does not pass evaluation """ + # get the dimer Tm and if it forms dimers + forms, dimerTm = _formsDimers(fwd, rev, fTm, rTm) + # return acceptable pairs and their pcr product sizes - if not _formsDimers(fwd, rev, fTm, rTm): return binPair + if not forms: return binPair, dimerTm -def __getCandidatePrimerPairs(binPairs:list[tuple[str,int,int]], bins:dict[str,dict[int,list[Primer]]], params:Parameters) -> list[tuple[Primer,Primer,int,tuple[str,int,int]]]: +def __getCandidatePrimerPairs(binPairs:list[tuple[str,int,int]], bins:dict[str,dict[int,list[Primer]]], params:Parameters) -> list[tuple[Primer,Primer,Product]]: """gets candidate primer pairs from pairs of bins Args: @@ -214,7 +222,7 @@ def __getCandidatePrimerPairs(binPairs:list[tuple[str,int,int]], bins:dict[str,d params (Parameters): a Parameters object Returns: - list[tuple[Primer,Primer,int,tuple[str,int,int]]]: a list of primer pairs, the corresponding pcr product size, and the bin pair (contig, bin1, bin2) + list[tuple[Primer,Primer,Product]]: a list of (forward primer, reverse primer, pcr product) """ # constants FWD = 'forward' @@ -279,12 +287,12 @@ def generateArgs() -> Generator[tuple[str, str, float, float, str],None,None]: # evaluate pairs of primers for heterodimer potential in parallel pool = multiprocessing.Pool(params.numThreads) - pairs = pool.starmap(__evaluateOnePair, generateArgs()) + results = pool.starmap(__evaluateOnePair, generateArgs()) pool.close() pool.join() # for each valid pair - for pair in [x for x in pairs if x is not None]: + for pair,dimerTm in [r for r in results if r is not None]: # recast the bin pair from a string to its component parts contig, fbin, rbin = eval(pair) @@ -296,7 +304,7 @@ def generateArgs() -> Generator[tuple[str, str, float, float, str],None,None]: rev:Primer = bins[contig][rbin][jdx] # save this pair in the output - out.append((fwd,rev.reverseComplement(),pcrLen,(contig,fbin,rbin))) + out.append((fwd,rev.reverseComplement(), Product(contig, pcrLen, fbin, rbin, dimerTm))) return out @@ -322,18 +330,21 @@ def __restructureCandidateKmerData(candidates:dict[str,list[Primer]]) -> dict[Se return out -def __getAllSharedPrimerPairs(firstName:str, candidateKmers:dict[str,dict[str,list[Primer]]], candidatePairs:list[tuple[Primer,Primer,int,tuple[str,int,int]]], params:Parameters) -> 
dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]: +def __getAllSharedPrimerPairs(firstName:str, candidateKmers:dict[str,dict[str,list[Primer]]], candidatePairs:list[tuple[Primer,Primer,Product]], params:Parameters) -> dict[tuple[Primer,Primer],dict[str,Product]]: """gets all the primer pairs that are shared in all the genomes Args: firstName (str): the name of the genome that has already been evaluated candidateKmers (dict[str,dict[str,list[Primer]]]): key=genome name; val=dict: key=contig name; val=list of candidate kmers - candidatePairs (list[tuple[Primer,Primer,int,tuple[str,int,int]]]): the list produced by __getCandidatePrimerPairs + candidatePairs (list[tuple[Primer,Primer,Product]): the list produced by __getCandidatePrimerPairs params (Parameters): a Parameters object Returns: - dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]: key=pair of Primers; val=dict: key=genome name; val=tuple: contig name, PCR product length, bin pair (contig, bin1, bin2) + dict[tuple[Primer,Primer],dict[str,Product]]: key=pair of Primers; val=dict: key=genome name; val=Product object """ + # constant + TEMP_BIN = -1 + # initialize variables k1:Primer k2:Primer @@ -351,30 +362,46 @@ def __getAllSharedPrimerPairs(firstName:str, candidateKmers:dict[str,dict[str,li kmers[name] = __restructureCandidateKmerData(candidateKmers[name]) # for each pair of primers in the candidate pairs - for p1,p2,length,binPair in candidatePairs: + for p1,p2,product in candidatePairs: # store the data for the genome (firstName) that has already been evaluated out[(p1,p2)] = dict() - out[(p1,p2)][firstName] = (p1.contig,length,binPair) + out[(p1,p2)][firstName] = product # for each unprocessed genome for name in remaining: - # get the first kmer (may be rev comp) + # get the first kmer (may be rev comp) and its Tms try: - k1 = kmers[name][p1.seq] + k1:Primer = kmers[name][p1.seq] k1_rev = False + k1.hairpinTm = p1.hairpinTm + k1.rcHairpin = p1.rcHairpin + k1.homodimerTm = p1.homodimerTm + k1.rcHomodimer = p1.rcHomodimer except KeyError: k1 = kmers[name][p1.seq.reverse_complement()] k1_rev = True + k1.hairpinTm = p1.rcHairpin + k1.rcHairpin = p1.hairpinTm + k1.homodimerTm = p1.rcHomodimer + k1.rcHomodimer = p1.homodimerTm - # get the second kmer (may be rev comp) + # get the second kmer (may be rev comp) and its Tms try: k2 = kmers[name][p2.seq] k2_rev = False + k2.hairpinTm = p2.hairpinTm + k2.rcHairpin = p2.rcHairpin + k2.homodimerTm = p2.homodimerTm + k2.rcHomodimer = p2.rcHomodimer except KeyError: k2 = kmers[name][p2.seq.reverse_complement()] k2_rev = True + k2.hairpinTm = p2.rcHairpin + k2.rcHairpin = p2.hairpinTm + k2.homodimerTm = p2.rcHomodimer + k2.rcHomodimer = p2.homodimerTm # both primers need to be on the same contig if k1.contig == k2.contig: @@ -400,7 +427,8 @@ def __getAllSharedPrimerPairs(firstName:str, candidateKmers:dict[str,dict[str,li # only save the pair if the length falls within the acceptable range length = rev.end - fwd.start + 1 if length in allowedLengths: - out[(p1,p2)][name] = (fwd.contig, length, binPair) + # the bin numbers are incorrect and need to be fixed later + out[(p1,p2)][name] = Product(fwd.contig, length, TEMP_BIN, TEMP_BIN, product.dimerTm) # remove any pairs that are not universally suitable in all genomes for pair in set(out.keys()): @@ -410,13 +438,13 @@ def __getAllSharedPrimerPairs(firstName:str, candidateKmers:dict[str,dict[str,li return out -def __updateBinsForUnprocessedGenomes(name:str, kmers:dict[str,list[Primer]], 
pairs:dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]) -> None: +def __updateBinsForUnprocessedGenomes(name:str, kmers:dict[str,list[Primer]], pairs:dict[tuple[Primer,Primer],dict[str,Product]]) -> None: """updates the bin pairs for the unprocessed genomes to ensure that all redundant primers are removed Args: name (str): the name of an unprocessed genome kmers (dict[str,list[Primer]]): candidate kmers for this genome (key=contig; val=list of candidate kmers) - pairs (dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]): the dictionary produced by __getAllSharedPrimerPairs + pairs (dict[tuple[Primer,Primer],dict[str,Product]]): the dictionary produced by __getAllSharedPrimerPairs """ # bin the candidate kmers for this genome binned = __binCandidateKmers(kmers) @@ -440,11 +468,12 @@ def __updateBinsForUnprocessedGenomes(name:str, kmers:dict[str,list[Primer]], pa contig, fbin = primerToBin[fwd.reverseComplement()] contig, rbin = primerToBin[rev] - # replace the bin pair with the data for this genome - pairs[(fwd,rev)][name] = pairs[(fwd,rev)][name][:-1] + ((contig, fbin, rbin),) + # update the bin numbers for this pair + pairs[(fwd,rev)][name].fbin = fbin + pairs[(fwd,rev)][name].rbin = rbin -def _getPrimerPairs(candidateKmers:dict[str,dict[str,list[Primer]]], params:Parameters) -> dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]: +def _getPrimerPairs(candidateKmers:dict[str,dict[str,list[Primer]]], params:Parameters) -> dict[tuple[Primer,Primer],dict[str,Product]]: """gets primer pairs found in all the ingroup genomes Args: @@ -456,7 +485,7 @@ def _getPrimerPairs(candidateKmers:dict[str,dict[str,list[Primer]]], params:Para RuntimeError: unable to identify primer pairs in every ingroup genome Returns: - dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]: key=Primer pairs; val=dict: key=genome name; val=tuple: contig, pcr product size, bin pair (contig, bin1, bin2) + dict[tuple[Primer,Primer],dict[str,Product]]: key=Primer pairs; val=dict: key=genome name; val=Product """ # messages ERR_MSG_1 = "could not identify suitable primer pairs from the candidate kmers" diff --git a/bin/main.py b/bin/main.py index ebabf15..c8d02a5 100755 --- a/bin/main.py +++ b/bin/main.py @@ -3,6 +3,7 @@ from bin.Clock import Clock from typing import Generator from bin.Primer import Primer +from bin.Product import Product from Bio.SeqRecord import SeqRecord from bin.Parameters import Parameters from bin.sortPrimerPairs import _sortPairs @@ -90,7 +91,7 @@ def __getCandidates(params:Parameters, sharedExists:bool, clock:Clock) -> dict[s return candidateKmers -def __getUnfilteredPairs(params:Parameters, candidateKmers:dict[str,dict[str,list[Primer]]], clock:Clock) -> dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]: +def __getUnfilteredPairs(params:Parameters, candidateKmers:dict[str,dict[str,list[Primer]]], clock:Clock) -> dict[tuple[Primer,Primer],dict[str,Product]]: """gets the unfiltered pairs Args: @@ -99,7 +100,7 @@ def __getUnfilteredPairs(params:Parameters, candidateKmers:dict[str,dict[str,lis clock (Clock): a Clock object Returns: - dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]: key=Primer pairs; val=dict: key=genome name; val=tuple: contig, pcr product size, bin pair (contig, bin1, bin2) + dict[tuple[Primer,Primer],dict[str,Product]]: key=Primer pairs; val=dict: key=genome name; val=Product """ # messages MSG_1 = "identifying pairs of primers found in all ingroup sequences" @@ 
-126,12 +127,12 @@ def __getUnfilteredPairs(params:Parameters, candidateKmers:dict[str,dict[str,lis return pairs -def __removeOutgroup(params:Parameters, pairs:dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]) -> None: +def __removeOutgroup(params:Parameters, pairs:dict[tuple[Primer,Primer],dict[str,Product]]) -> None: """removes outgroup primers from the pairs dictionary. does not return Args: params (Parameters): a Parameters object - pairs (dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]): the dictionary produced by __getUnfilteredPairs + pairs (dict[tuple[Primer,Primer],dict[str,Product]]): the dictionary produced by __getUnfilteredPairs """ # messages GAP = " "*4 @@ -165,12 +166,12 @@ def __removeOutgroup(params:Parameters, pairs:dict[tuple[Primer,Primer],dict[str params.dumpObj(pairs, params.pickles[Parameters._PAIR_2], "filtered pairs", prefix=GAP) -def __validatePairs(params:Parameters, pairs:dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]) -> None: +def __validatePairs(params:Parameters, pairs:dict[tuple[Primer,Primer],dict[str,Product]]) -> None: """validates primer pairs with in silico PCR and removes invalid pairs Args: params (Parameters): a Parameters object - pairs (dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]): the dictionary produced by __getUnfilteredPairs + pairs (dict[tuple[Primer,Primer],dict[str,Product]]): the dictionary produced by __getUnfilteredPairs """ # messages GAP = ' '*4 @@ -196,17 +197,18 @@ def __validatePairs(params:Parameters, pairs:dict[tuple[Primer,Primer],dict[str, params.dumpObj(pairs, params.pickles[Parameters._PAIR_3], "validated pairs", prefix=GAP) -def __writePrimerPairs(params:Parameters, pairs:dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]], clock:Clock) -> None: +def __writePrimerPairs(params:Parameters, pairs:dict[tuple[Primer,Primer],dict[str,Product]], clock:Clock) -> None: """writes pairs of primers to file Args: params (Parameters): the Parameters object for the run - pairs (dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]])): key=primer pair; val=dict: key=genome name; val=tuple: contig; pcr len; bin pair + pairs (dict[tuple[Primer,Primer],dict[str,Product]])): key=primer pair; val=dict: key=genome name; val=Product clock (Clock): a Clock object """ # messages - MSG_A = "writing " - MSG_B = " primer pairs to " + MSG_1 = "sorting primer pairs" + MSG_2A = "writing " + MSG_2B = " primer pairs to " # contants EOL = "\n" @@ -235,8 +237,8 @@ def getHeaders(names) -> list[str]: # print status params.log.rename(__writePrimerPairs.__name__) - params.log.info(f"{MSG_A}{len(pairs)}{MSG_B}'{os.path.relpath(params.resultsFn)}'") - clock.printStart(f"{MSG_A}{len(pairs)}{MSG_B}'{os.path.relpath(params.resultsFn)}'") + params.log.info(MSG_1) + clock.printStart(MSG_1) # get the names of genomes, and create headers with the same order names = list(next(iter(pairs.values())).keys()) @@ -245,6 +247,11 @@ def getHeaders(names) -> list[str]: # sort the primer pairs sortedPairs = _sortPairs(params, pairs) + # print status + clock.printDone() + params.log.info(f"{MSG_2A}{len(pairs)}{MSG_2B}'{os.path.relpath(params.resultsFn)}'") + clock.printStart(f"{MSG_2A}{len(pairs)}{MSG_2B}'{os.path.relpath(params.resultsFn)}'") + # open the file with open(params.resultsFn, 'w') as fh: # write the headers @@ -263,10 +270,8 @@ def getHeaders(names) -> list[str]: # then save the contig name and PCR product length for each genome for name in 
names: - contig = pairs[(fwd,rev)][name][0] - pcrLen = pairs[(fwd,rev)][name][1] - row.append(contig) - row.append(pcrLen) + row.append(pairs[(fwd,rev)][name].contig) + row.append(pairs[(fwd,rev)][name].size) # write the data to the file fh.write(f"{SEP.join(map(str, row))}{EOL}") @@ -274,7 +279,7 @@ def getHeaders(names) -> list[str]: # print status clock.printDone() - params.log.info(f"done {clock.getTimeString()}") + params.log.info(f"done {clock.getTimeString()}") def __removeIntermediateFiles(params:Parameters) -> None: diff --git a/bin/removeOutgroupPrimers.py b/bin/removeOutgroupPrimers.py index f6819aa..6ae4fd8 100644 --- a/bin/removeOutgroupPrimers.py +++ b/bin/removeOutgroupPrimers.py @@ -1,12 +1,13 @@ from bin.Clock import Clock from typing import Generator from bin.Primer import Primer +from bin.Product import Product from ahocorasick import Automaton from Bio.SeqRecord import SeqRecord from bin.Parameters import Parameters # global constant -__NULL_PRODUCT = ("NA", 0, ()) +__NULL_PRODUCT = ("NA", 0) # functions @@ -23,7 +24,7 @@ def __getAutomaton(pairs:list[tuple[Primer,Primer]]) -> Automaton: out = Automaton() # add each kmer in the pairs to the automaton once and make the automaton - for kmer in {str(x) for y in pairs for x in y}: + for kmer in {str(primer) for pair in pairs for primer in pair}: out.add_word(kmer, kmer) out.make_automaton() @@ -121,16 +122,22 @@ def __getOutgroupProductSizes(kmers:dict[str,dict[str,list[int]]], fwd:str, rev: return out -def __processOutgroupResults(outgroupProducts:dict[str,dict[tuple[Primer,Primer],set[tuple[str,int,tuple]]]], pairs:dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]) -> None: +def __processOutgroupResults(outgroupProducts:dict[str,dict[tuple[Primer,Primer],set[tuple[str,int]]]], pairs:dict[tuple[Primer,Primer],dict[str,Product]]) -> None: """adds the outgroup results to the pairs dictionary Args: - outgroupProducts (dict[str,dict[tuple[Primer,Primer],set[tuple[str,int,tuple]]]]): key=genome name; val=dict: key=primer pair; val=set of tuples: contig, pcrLen, binpair - pairs (dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]): key=Primer pair; val=dict: key=genome name; val=tuple: contig, pcrLen, binpair + outgroupProducts (dict[str,dict[tuple[Primer,Primer],set[tuple[str,int]]]]): key=genome name; val=dict: key=primer pair; val=set of tuples: contig, pcrLen + pairs (dict[tuple[Primer,Primer],dict[str,Product]]): key=Primer pair; val=dict: key=genome name; val=Product Returns: does not return. 
modifies the pairs dictionary """ + # constant + FAKE_BIN = -1 + + # get the name of an ingroup genome + ingroupName = next(iter([n for v in pairs.values() for n in v.keys() if n not in outgroupProducts.keys()])) + # for each pair remaining to process for pair in pairs.keys(): # for each outgroup genome @@ -140,7 +147,7 @@ def __processOutgroupResults(outgroupProducts:dict[str,dict[tuple[Primer,Primer] # if there is only one primer size, then save it if len(result) == 1: - pairs[pair][name] = result.pop() + contig, size = result.pop() # otherwise else: @@ -150,27 +157,31 @@ def __processOutgroupResults(outgroupProducts:dict[str,dict[tuple[Primer,Primer] # if there is only one primer size, then save it if len(result) == 1: - pairs[pair][name] = result.pop() + contig, size = result.pop() # otherwise else: # combine all contigs and pcrLens into separate lists contigs = list() pcrLens = list() - for contig,pcrLen,binPair in result: + for contig,pcrLen in result: contigs.append(contig) pcrLens.append(pcrLen) - # convert the contigs and lengths to comma-separated strings; add empty bin pair - pairs[pair][name] = (",".join(contigs), ",".join(map(str,pcrLens)), ()) + # convert the contigs and lengths to comma-separated strings + contig = ",".join(contigs) + size = ",".join(map(str, pcrLens)) + + # create and save the Product for this pair + pairs[pair][name] = Product(contig, size, FAKE_BIN, FAKE_BIN, pairs[pair][ingroupName].dimerTm) -def _removeOutgroupPrimers(outgroup:dict[str,Generator[SeqRecord,None,None]], pairs:dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]], params:Parameters) -> None: +def _removeOutgroupPrimers(outgroup:dict[str,Generator[SeqRecord,None,None]], pairs:dict[tuple[Primer,Primer],dict[str,Product]], params:Parameters) -> None: """removes primers found in the outgroup that produce disallowed product sizes Args: outgroup (dict[str,Generator[SeqRecord,None,None]]): key=genome name; val=contig generator - pairs (dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]): key=Primer pair; dict:key=genome name; val=tuple: contig, pcr product size, bin pair (contig, num1, num2) + pairs (dict[tuple[Primer,Primer],dict[str,Product]]): key=Primer pair; dict:key=genome name; val=Product params (Parameters): a Parameters object Raises: @@ -274,7 +285,7 @@ def _removeOutgroupPrimers(outgroup:dict[str,Generator[SeqRecord,None,None]], pa # if the pcr product lengths are not disallowed, then save them in the dictionary if not done: - outgroupProducts[name][(fwd,rev)].update({(contig, x, ()) for x in products}) + outgroupProducts[name][(fwd,rev)].update({(contig, x) for x in products}) # log the number of pairs removed and remaining from the last genome if debugging params.log.debug(f"{MSG_5A}{startNumPairs - len(pairs)}{MSG_5B}{name}{MSG_5C}{len(pairs)}{MSG_5D}") diff --git a/bin/sortPrimerPairs.py b/bin/sortPrimerPairs.py index 324d76a..a1e5141 100644 --- a/bin/sortPrimerPairs.py +++ b/bin/sortPrimerPairs.py @@ -1,6 +1,7 @@ +import os, numpy from typing import Union -import os, numpy, primer3 from bin.Primer import Primer +from bin.Product import Product from bin.Parameters import Parameters @@ -45,11 +46,11 @@ def __minDiffFromRange(intgr:int, rng:range) -> int: return min((lower, upper)) -def __sortOutgroupProducts(metadata:dict[str,tuple[str,int,tuple[str,int,int]]], outgroup:list[str], ingroupPcrLens:range) -> tuple[int,int]: +def __sortOutgroupProducts(metadata:dict[str,Product], outgroup:list[str], ingroupPcrLens:range) -> tuple[int,int]: 
"""evaluates the outgroup products for a primer pair in order to sort the pair Args: - metadata (dict[str,tuple[str,int,tuple[str,int,int]]]): the metadata for a primer pair + metadata (dict[str,Product]): the metadata for a primer pair outgroup (list[str]): a list of outgroup filenames ingroupPcrLens (range): the range that ingroup PCR products are within @@ -63,7 +64,7 @@ def __sortOutgroupProducts(metadata:dict[str,tuple[str,int,tuple[str,int,int]]], outgroup = frozenset(map(os.path.basename, outgroup)) # get the outgroup products that are >0bp - products = [v[1] for k,v in metadata.items() if k in outgroup and v != 0] + products = [v.size for k,v in metadata.items() if k in outgroup and v.size != 0] # handle situations where there are no products (best case) if products == []: @@ -92,14 +93,10 @@ def __getHomopolymerTemps(pair:tuple[Primer,Primer]) -> float: float: the sum of both primer's homodimer melting temperatures """ # parse the pair as strings - fwd,rev = map(str, pair) - - # calculate dimer tms - fDimer = primer3.calc_homodimer_tm(fwd) - rDimer = primer3.calc_homodimer_tm(rev) + fwd,rev = pair - # return the sum of these temps - return fDimer + rDimer + # return the sum of the dimer temps + return fwd.homodimerTm + rev.homodimerTm def __getHairpinTemps(pair:tuple[Primer,Primer]) -> float: @@ -112,17 +109,13 @@ def __getHairpinTemps(pair:tuple[Primer,Primer]) -> float: float: the sum of the hairpin melting tempertures """ # parse the pair as strings - fwd,rev = map(str, pair) - - # calculate hairpin tms - fHairpin = primer3.calc_hairpin_tm(fwd) - rHairpin = primer3.calc_hairpin_tm(rev) + fwd,rev = pair - # return the sum of these temps - return fHairpin + rHairpin + # return the sum of the hairpin temps + return fwd.hairpinTm + rev.hairpinTm -def __getHeteroDimerTemp(pair:tuple[Primer,Primer]) -> float: +def __getHeteroDimerTemp(products:dict[str,Product]) -> float: """gets the heterodimer melting temperature for a pair of primers Args: @@ -131,11 +124,8 @@ def __getHeteroDimerTemp(pair:tuple[Primer,Primer]) -> float: Returns: float: the heterodimer melting temperature """ - - # parse the pair as strings - fwd,rev = map(str, pair) - - return primer3.calc_heterodimer_tm(fwd, rev) + # all products have the same dimerTm; extract it from the first one + return next(iter(products.values())).dimerTm def __getGcDiff(pair:tuple[Primer,Primer]) -> float: @@ -168,30 +158,30 @@ def __getTmDiff(pair:tuple[Primer,Primer]) -> float: return abs(fwd.Tm - rev.Tm) -def __sortIngroupProducts(metadata:dict[str,tuple[str,int,tuple[str,int,int]]], ingroup:list[str]) -> tuple[float,float]: +def __sortIngroupProducts(metadata:dict[str,Product], ingroup:list[str]) -> tuple[float,float]: """calculates the median product size for the ingroup genomes Args: - metadata (dict[str,tuple[str,int,tuple[str,int,int]]]): the metadata for a primer pair + metadata (dict[str,Product]): the metadata for a primer pair Returns: - float: the median PCR product size for the ingroup genomes + tuple[float,float]: variance of product sizes; median product size (negated) """ # determine the genome names from the ingroup file names ingroup = frozenset(map(os.path.basename, ingroup)) # extract the sizes from the metadata for the pair - products = [size for name,(contig,size,binpair) in metadata.items() if name in ingroup] + products = [prod.size for name,prod in metadata.items() if name in ingroup] # calculate the variance and the median of the variance = numpy.var(products) median = numpy.var(products) - # return the variance 
diff --git a/bin/validatePrimers.py b/bin/validatePrimers.py
index a89e5fa..a408675 100644
--- a/bin/validatePrimers.py
+++ b/bin/validatePrimers.py
@@ -2,6 +2,7 @@
 from bin.Clock import Clock
 from typing import Generator
 from bin.Primer import Primer
+from bin.Product import Product
 from Bio.SeqRecord import SeqRecord
 from bin.Parameters import Parameters
 import multiprocessing, os, subprocess
@@ -117,10 +118,10 @@ def __runAllPcrs(params:Parameters, pairs:list[tuple[Primer,Primer]]) -> dict[tu

     Args:
         params (Parameters): a Parameters object
-        pairs (list[tuple[Primer,Primer]]): a list of primer pairs
+        pairs (list[tuple[Primer,Primer]]): a list of Primer pairs

     Returns:
-        dict[tuple[str,str],set[tuple[str,int]]]: key=primer pair; val=(contig, pcr product size)
+        dict[tuple[str,str],set[tuple[str,int]]]: key=Primer pair; val=(contig, pcr product size)
     """
     # helper function
     def genArgs() -> Generator[tuple[str,str],None,None]:
@@ -156,29 +157,29 @@ def genArgs() -> Generator[tuple[str,str],None,None]:
     return {k:v for r in results for k,v in r.items()}


-def __filterPairs(pairs:dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]], pcrs:dict[tuple[str,str],list[tuple[str,int]]]) -> None:
+def __filterPairs(pairs:dict[tuple[Primer,Primer],dict[str,Product]], pcrs:dict[tuple[str,str],list[tuple[str,int]]]) -> None:
     """filters primer pairs based on isPcr results; does not return

     Args:
-        pairs (dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]): key=Primer pair; dict:key=genome name; val=tuple: contig, pcr product size, bin pair (contig, num1, num2)
-        pcrs (dict[tuple[str,str],list[tuple[str,int]]]): the dictionary produced by __runPcr
+        pairs (dict[tuple[Primer,Primer],dict[str,Product]]): key=Primer pair; dict:key=genome name; val=Product
+        pcrs (dict[tuple[str,str],list[tuple[str,int]]]): the dictionary produced by __runAllPcrs
     """
     # for each pair
     for pair,observed in pcrs.items():
         # get a set of the expected product sizes (contig, size)
-        expected = {x[:2] for x in pairs[pair].values() if x[0] != 'NA'}
+        expected = {(x.contig, x.size) for x in pairs[pair].values() if x.contig != 'NA'}

         # remove any pairs that do not match the observed product sizes
         if expected != observed:
             del pairs[pair]


-def _validatePrimerPairs(params:Parameters, pairs:dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]) -> None:
+def _validatePrimerPairs(params:Parameters, pairs:dict[tuple[Primer,Primer],dict[str,Product]]) -> None:
     """uses in silico PCR to validate primer pairs

     Args:
         params (Parameters): a Parameters object
-        pairs (dict[tuple[Primer,Primer],dict[str,tuple[str,int,tuple[str,int,int]]]]):
key=Primer pair; dict:key=genome name; val=tuple: contig, pcr product size, bin pair (contig, num1, num2) + pairs (dict[tuple[Primer,Primer],dict[str,Product]]): key=Primer pair; dict:key=genome name; val=Product Raises: RuntimeError: no valid primer pairs From 18071dae68148680a5ba6342482f9739cd66e4f9 Mon Sep 17 00:00:00 2001 From: dr-joe-wirth Date: Thu, 12 Sep 2024 15:14:18 -0400 Subject: [PATCH 14/17] added additional tests to ensure Tms are stored correctly --- bin/unit_tests/primer_test.py | 32 ++++++++++++++++++++++++ bin/unit_tests/results_test.py | 45 ++++++++++++++++++++++++++++++++-- 2 files changed, 75 insertions(+), 2 deletions(-) diff --git a/bin/unit_tests/primer_test.py b/bin/unit_tests/primer_test.py index 4a47968..efc88f1 100644 --- a/bin/unit_tests/primer_test.py +++ b/bin/unit_tests/primer_test.py @@ -28,6 +28,26 @@ def setUp(self) -> None: self.fwd_2 = Primer(PrimerTest.FWD_SEQ, PrimerTest.CNTG_2, PrimerTest.FSTR_2, len(PrimerTest.FWD_SEQ), Primer.MINUS) self.rev_1 = Primer(PrimerTest.REV_SEQ, PrimerTest.CNTG_1, PrimerTest.RSTR_1, len(PrimerTest.REV_SEQ), Primer.MINUS) self.rev_2 = Primer(PrimerTest.REV_SEQ, PrimerTest.CNTG_2, PrimerTest.RSTR_2, len(PrimerTest.REV_SEQ), Primer.PLUS) + + # save the hairpin Tms + self.fwd_1.hairpinTm = primer3.calc_hairpin_tm(PrimerTest.FWD_SEQ) + self.fwd_1.rcHairpin = primer3.calc_hairpin_tm(str(Seq(PrimerTest.FWD_SEQ).reverse_complement())) + self.fwd_2.hairpinTm = primer3.calc_hairpin_tm(PrimerTest.FWD_SEQ) + self.fwd_2.rcHairpin = primer3.calc_hairpin_tm(str(Seq(PrimerTest.FWD_SEQ).reverse_complement())) + self.rev_1.hairpinTm = primer3.calc_hairpin_tm(PrimerTest.REV_SEQ) + self.rev_1.rcHairpin = primer3.calc_hairpin_tm(str(Seq(PrimerTest.REV_SEQ).reverse_complement())) + self.rev_2.hairpinTm = primer3.calc_hairpin_tm(PrimerTest.REV_SEQ) + self.rev_2.rcHairpin = primer3.calc_hairpin_tm(str(Seq(PrimerTest.REV_SEQ).reverse_complement())) + + # save the homodimer Tms + self.fwd_1.homodimerTm = primer3.calc_homodimer_tm(PrimerTest.FWD_SEQ) + self.fwd_1.rcHomodimer = primer3.calc_homodimer_tm(str(Seq(PrimerTest.FWD_SEQ).reverse_complement())) + self.fwd_2.homodimerTm = primer3.calc_homodimer_tm(PrimerTest.FWD_SEQ) + self.fwd_2.rcHomodimer = primer3.calc_homodimer_tm(str(Seq(PrimerTest.FWD_SEQ).reverse_complement())) + self.rev_1.homodimerTm = primer3.calc_homodimer_tm(PrimerTest.REV_SEQ) + self.rev_1.rcHomodimer = primer3.calc_homodimer_tm(str(Seq(PrimerTest.REV_SEQ).reverse_complement())) + self.rev_2.homodimerTm = primer3.calc_homodimer_tm(PrimerTest.REV_SEQ) + self.rev_2.rcHomodimer = primer3.calc_homodimer_tm(str(Seq(PrimerTest.REV_SEQ).reverse_complement())) def tearDown(self) -> None: return super().tearDown() @@ -235,6 +255,18 @@ def testN_revComp(self) -> None: self.assertEqual(rc_f2.contig, self.fwd_2.contig) self.assertEqual(rc_r1.contig, self.rev_1.contig) self.assertEqual(rc_r2.contig, self.rev_2.contig) + + # check hairpin Tm + self.assertEqual(rc_f1.hairpinTm, self.fwd_1.rcHairpin) + self.assertEqual(rc_f2.hairpinTm, self.fwd_2.rcHairpin) + self.assertEqual(rc_f1.rcHairpin, self.fwd_1.hairpinTm) + self.assertEqual(rc_f2.rcHairpin, self.fwd_2.hairpinTm) + + # check homodimer Tm + self.assertEqual(rc_f1.homodimerTm, self.fwd_1.rcHomodimer) + self.assertEqual(rc_f2.homodimerTm, self.fwd_2.rcHomodimer) + self.assertEqual(rc_f1.rcHomodimer, self.fwd_1.homodimerTm) + self.assertEqual(rc_f2.rcHomodimer, self.fwd_2.homodimerTm) if __name__ == "__main__": diff --git a/bin/unit_tests/results_test.py b/bin/unit_tests/results_test.py 
index 7f98d19..2e9ed4f 100644 --- a/bin/unit_tests/results_test.py +++ b/bin/unit_tests/results_test.py @@ -10,6 +10,7 @@ from bin.main import _runner from khmer import Countgraph from bin.Primer import Primer +from bin.Product import Product from Bio.SeqRecord import SeqRecord from bin.Parameters import Parameters from bin.sortPrimerPairs import _sortPairs @@ -755,8 +756,11 @@ def testK_noPrimerDimers(self) -> None: fwd = Primer(fwd, '', 100, len(fwd), Primer.PLUS) rev = Primer(rev, '', 100, len(rev), Primer.MINUS) + # check dimer Tm + forms,tm = _formsDimers(str(fwd),str(rev.reverseComplement()),fwd.Tm, rev.Tm) + # check for primer dimers; both primers need to be on the same strand - self.assertFalse(_formsDimers(str(fwd),str(rev.reverseComplement()),fwd.Tm, rev.Tm), f'primer dimers present in {fwd}, {rev}') + self.assertFalse(forms, f'primer dimers present in {fwd}, {rev}') def testL_ingroupHasOneProduct(self) -> None: """do all pairs produce one product size in the ingroup @@ -1009,7 +1013,7 @@ def testU_arePairsSortedCorrectly(self) -> None: """ # load the pairs from the pickle with open(self.params.pickles[Parameters._PAIR_3], 'rb') as fh: - pairs:dict[tuple[Primer,Primer]] = pickle.load(fh) + pairs:dict[tuple[Primer,Primer],dict[str,Product]] = pickle.load(fh) # get the expected sort order as a tuples of Seq expected = [(f.seq,r.seq) for f,r in _sortPairs(self.params, pairs)] @@ -1019,6 +1023,43 @@ def testU_arePairsSortedCorrectly(self) -> None: # check that the order is conserved self.assertListEqual(expected, observed, 'expected sort order does not match the order written to file') + + def testV_primerTmsWereCorrect(self) -> None: + """verifies that the pcr product data was populated correctly during runtime + """ + # load the pairs from the pickle + with open(self.params.pickles[Parameters._PAIR_3], 'rb') as fh: + pairs:dict[tuple[Primer,Primer],dict[str,Product]] = pickle.load(fh) + + # get a set of the non-redundant primer sequences + primers = {x for p in pairs.keys() for x in p} + + # for each primer + for primer in primers: + # check hairpin Tms + self.assertAlmostEqual(primer.hairpinTm, primer3.calc_hairpin_tm(str(primer)), places=3) + self.assertAlmostEqual(primer.rcHairpin, primer3.calc_hairpin_tm(str(primer.reverseComplement())), places=3) + + # check homodimer Tms + self.assertAlmostEqual(primer.homodimerTm, primer3.calc_homodimer_tm(str(primer)), places=3) + self.assertAlmostEqual(primer.rcHomodimer, primer3.calc_homodimer_tm(str(primer.reverseComplement())), places=3) + + def testW_productTmsWereCorrect(self) -> None: + # load the pairs from the pickle + with open(self.params.pickles[Parameters._PAIR_3], 'rb') as fh: + pairs:dict[tuple[Primer,Primer],dict[str,Product]] = pickle.load(fh) + + # for each pair + for fwd,rev in pairs.keys(): + # calculate the heterodimer Tm + dimerTm = primer3.calc_heterodimer_tm(str(fwd), str(rev)) + + # for each product + for name in pairs[(fwd,rev)].keys(): + product = pairs[(fwd,rev)][name] + + # check that the dimer Tm was saved correctly + self.assertAlmostEqual(dimerTm, product.dimerTm, places=3) if __name__ == "__main__": From d60bda0493acafb3395885ef833bf739bfa3b151 Mon Sep 17 00:00:00 2001 From: dr-joe-wirth Date: Thu, 12 Sep 2024 15:14:41 -0400 Subject: [PATCH 15/17] checking file writability in a safer way --- bin/Parameters.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/bin/Parameters.py b/bin/Parameters.py index b2366af..ed8b458 100644 --- a/bin/Parameters.py +++ b/bin/Parameters.py @@ -177,11 
+177,9 @@ def __checkOutputFile(fn:str) -> None:
         if proceed == YN[1]:
             raise FileExistsError(f"{WARN_MSG_A}{fn}{WARN_MSG_B}")

-    # make sure the file can be written
+    # make sure we can write the file to the output directory
     try:
-        fh = open(fn, 'w')
-        fh.close()
-        os.remove(fn)
+        os.access(os.path.dirname(fn), os.W_OK)
     except:
         raise ValueError(f"{ERR_MSG}{fn}")


From 4fb2483afbe3ac9e420911aca118e62cc0552b4b Mon Sep 17 00:00:00 2001
From: dr-joe-wirth
Date: Thu, 12 Sep 2024 15:43:03 -0400
Subject: [PATCH 16/17] reduced redundant operations; better documentation

---
 bin/unit_tests/results_test.py | 36 +++++++++++++++-------------------
 1 file changed, 16 insertions(+), 20 deletions(-)

diff --git a/bin/unit_tests/results_test.py b/bin/unit_tests/results_test.py
index 2e9ed4f..2a2e2dc 100644
--- a/bin/unit_tests/results_test.py
+++ b/bin/unit_tests/results_test.py
@@ -111,6 +111,12 @@ def setUpClass(cls) -> None:
         cls.sequences:dict[str,dict[str,dict[str,Seq]]] = ResultsTest._loadGenomeSequences()
         clock.printDone()

+        # load the pairs from the pickle
+        clock.printStart('unpickling pairs')
+        with open(ResultsTest.params.pickles[Parameters._PAIR_3], 'rb') as fh:
+            cls.pairs:dict[tuple[Primer,Primer],dict[str,Product]] = pickle.load(fh)
+        clock.printDone()
+
         # calculate the binding sites
         clock.printStart('getting binding sites for all kmers and primers', end='...\n', spin=False)
         cls.bindingSites:dict[Seq,dict[str,dict[str,dict[str,list[int]]]]] = ResultsTest._getKmerBindingSites(cls.results, cls.sequences, cls.params.minLen, cls.params.maxLen)
@@ -1011,12 +1017,8 @@ def testT_allPairsWritten(self) -> None:
     def testU_arePairsSortedCorrectly(self) -> None:
         """verifies that the pairs were written in the correct order
         """
-        # load the pairs from the pickle
-        with open(self.params.pickles[Parameters._PAIR_3], 'rb') as fh:
-            pairs:dict[tuple[Primer,Primer],dict[str,Product]] = pickle.load(fh)
-
         # get the expected sort order as a tuples of Seq
-        expected = [(f.seq,r.seq) for f,r in _sortPairs(self.params, pairs)]
+        expected = [(f.seq,r.seq) for f,r in _sortPairs(self.params, self.pairs)]

         # get the observed order that was written
         observed = sorted(self.results.keys(), key=lambda x: self.results[x].rowNum)
@@ -1025,14 +1027,10 @@ def testU_arePairsSortedCorrectly(self) -> None:
         self.assertListEqual(expected, observed, 'expected sort order does not match the order written to file')

     def testV_primerTmsWereCorrect(self) -> None:
-        """verifies that the pcr product data was populated correctly during runtime
+        """verifies that the homodimer Tms and hairpin Tms were stored correctly
         """
-        # load the pairs from the pickle
-        with open(self.params.pickles[Parameters._PAIR_3], 'rb') as fh:
-            pairs:dict[tuple[Primer,Primer],dict[str,Product]] = pickle.load(fh)
-
         # get a set of the non-redundant primer sequences
-        primers = {x for p in pairs.keys() for x in p}
+        primers = {x for p in self.pairs.keys() for x in p}

         # for each primer
         for primer in primers:
@@ -1045,22 +1043,20 @@ def testV_primerTmsWereCorrect(self) -> None:
             self.assertAlmostEqual(primer.rcHomodimer, primer3.calc_homodimer_tm(str(primer.reverseComplement())), places=3)

     def testW_productTmsWereCorrect(self) -> None:
-        # load the pairs from the pickle
-        with open(self.params.pickles[Parameters._PAIR_3], 'rb') as fh:
-            pairs:dict[tuple[Primer,Primer],dict[str,Product]] = pickle.load(fh)
-
+        """verifies that heterodimer Tm was correct for all PCR products
+        """
         # for each pair
-        for fwd,rev in pairs.keys():
-            # calculate the heterodimer Tm
+        for fwd,rev in self.pairs.keys():
+            # calculate the heterodimer Tm for this pair
             dimerTm = primer3.calc_heterodimer_tm(str(fwd), str(rev))

             # for each product
-            for name in pairs[(fwd,rev)].keys():
-                product = pairs[(fwd,rev)][name]
+            for name in self.pairs[(fwd,rev)].keys():
+                product = self.pairs[(fwd,rev)][name]

                 # check that the dimer Tm was saved correctly
                 self.assertAlmostEqual(dimerTm, product.dimerTm, places=3)

-
+# entrypoint
 if __name__ == "__main__":
     unittest.main()
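A note on the writability check introduced in PATCH 15 above: os.access reports permission as a boolean return value and does not raise, so wrapping the call in try/except will not, by itself, detect an unwritable output directory. Below is a small sketch of a variant that inspects the return value; the helper name and error message are illustrative and not taken from the codebase.

import os

def checkOutputDirWritable(fn: str) -> None:
    # resolve the directory that will hold the output file
    outDir = os.path.dirname(fn) or "."

    # os.access returns False instead of raising, so the result must be checked explicitly
    if not os.access(outDir, os.W_OK):
        raise ValueError(f"cannot write output file '{fn}'")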
From f3d0f90b13bc1f0478d45c83a1c981e36e01494d Mon Sep 17 00:00:00 2001
From: dr-joe-wirth
Date: Fri, 13 Sep 2024 14:08:10 -0400
Subject: [PATCH 17/17] first draft of a docker github action

---
 .github/workflows/docker-build-push.yml | 41 +++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 .github/workflows/docker-build-push.yml

diff --git a/.github/workflows/docker-build-push.yml b/.github/workflows/docker-build-push.yml
new file mode 100644
index 0000000..b0be019
--- /dev/null
+++ b/.github/workflows/docker-build-push.yml
@@ -0,0 +1,41 @@
+name: Build and Push Docker Image
+
+on:
+  push:
+    tags:
+      - 'v*.*.*'
+
+env:
+  DOCKER_USERNAME: jwirth
+  DOCKER_IMAGE_NAME: primerforge
+
+jobs:
+  docker-build-push:
+    name: docker build and push
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2
+
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ env.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+
+      - name: Extract version from tag
+        run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_ENV
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker
+          file: ./docker/Dockerfile
+          push: true
+          tags: |
+            ${{ env.DOCKER_USERNAME }}/${{ env.DOCKER_IMAGE_NAME }}:v${{ env.VERSION }}
+            ${{ env.DOCKER_USERNAME }}/${{ env.DOCKER_IMAGE_NAME }}:latest
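The Product class (bin/Product.py) that these patches build on is used throughout the series: contig, size, fbin, rbin, and dimerTm are read or written in getPrimerPairs.py, removeOutgroupPrimers.py, sortPrimerPairs.py, validatePrimers.py, and the tests, but its definition is never shown here. The sketch below is only an inference from those usages, offered for orientation while reading the diffs; the real class may differ.

from typing import Union

class Product:
    """a PCR product in one genome: location, size, bin pair, and a cached heterodimer Tm"""
    __slots__ = ("contig", "size", "fbin", "rbin", "dimerTm")

    def __init__(self, contig: str, size: Union[int, str], fbin: int, rbin: int, dimerTm: float) -> None:
        self.contig = contig    # contig name; 'NA' or a comma-separated list for outgroup hits
        self.size = size        # product length, or a comma-separated list of lengths
        self.fbin = fbin        # bin number of the forward primer
        self.rbin = rbin        # bin number of the reverse primer
        self.dimerTm = dimerTm  # heterodimer Tm cached for O(1) lookups while sorting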
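Assuming the workflow above behaves like a standard tag-triggered GitHub Action, publishing a release image should only require pushing a matching version tag (for example, running git tag v1.2.3 followed by git push origin v1.2.3 on the release commit); the job then builds ./docker/Dockerfile with the ./docker context and pushes both jwirth/primerforge:v1.2.3 and jwirth/primerforge:latest. The login step also assumes a DOCKER_PASSWORD repository secret has been configured to go with the hard-coded jwirth username.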