Skip to content

Commit

Permalink
bugfixes and adding code to simulate primer-directed read1
Browse files Browse the repository at this point in the history
  • Loading branch information
koadman committed Mar 10, 2014
1 parent 18e56ab commit baa6ea8
Showing 1 changed file with 24 additions and 3 deletions.
27 changes: 24 additions & 3 deletions simulation/hic_simulator/src/simForward.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
#!/usr/bin/env python
from Bio import SeqIO
from Bio import Alphabet
from Bio.Restriction import *
from collections import OrderedDict
import numpy
import sys
import re
from Bio.Seq import Seq

# restriction enzymes used in reaction
cut4 = Restriction.NlaIII
Expand All @@ -15,14 +18,27 @@ def findRestrictionSites(enzyme, seq):
"""
return enzyme.search(seq,linear=False)

def findPrimingSites(oligo, seq):
"""For supplied priming sequence, find positions of all matches in a given sequence
returns list of sites.
"""
array = []
for m in re.finditer(oligo, seq.tostring()):
array.append(m.end())
rc_oligo = Seq(oligo)
rc_oligo.reverse_complement()
for m in re.finditer(rc_oligo.tostring(), seq.tostring()):
array.append(m.end())
return array

def drawDelta(minLength,maxLength):
"""Draws from a distribution only accepting values between min and max."""
# as this relates to a circular chromosome, the min and max could be considered one value.
# could do this modulo length of chromosome.
geometricProbability = 1.0e-4
geometricProbability = 1.0e-5

This comment has been minimized.

Copy link
@koadman

koadman Mar 10, 2014

Author Owner

looking at the results this appeared to be a better match to the Beitel et al data

This comment has been minimized.

Copy link
@cerebis

cerebis Mar 10, 2014

Collaborator

Ok great. My intention was to move that parameter out to the CLI as I expected it would need calibration.


delta = numpy.random.geometric(p=geometricProbability,size=1)
while (delta < minLength and delta > maxLength):
while (delta < minLength or delta > maxLength):
delta = numpy.random.geometric(p=geometricProbability,size=1)
return int(delta)

Expand Down Expand Up @@ -136,6 +152,7 @@ def __init__(self, name, parentCell, sequence):
self.parentCell = parentCell
self.sequence = sequence
self.cutSites = {}
self.cutSites['515F'] = numpy.array(findPrimingSites("GTGCCAGC[AC]GCCGCGGTAA",sequence.seq))
self.cutSites['4cut'] = numpy.array(findRestrictionSites(cut4, sequence.seq))
self.cutSites['6cut'] = numpy.array(findRestrictionSites(cut6, sequence.seq))

Expand Down Expand Up @@ -390,6 +407,7 @@ def getCDF(self):
def makeUnconstrainedPartA():
rep = comm.getRepliconByIndex(comm.pickReplicon())
pos6c = rep.randomCutSite('6cut')
# pos6c = rep.randomCutSite('515F')
fwd = comm.isForwardStrand()
pos4c = rep.nearestCutSiteAbove('4cut',pos6c)
seq = rep.subSeq(pos6c,pos4c, True)
Expand Down Expand Up @@ -460,6 +478,9 @@ def makeConstrainedPartB(partA):
if len(fragment) < 200: # fudge minimum length of total fragment
skipCount += 1
continue
if len(fragment) > 1000: # fudge maximum length of total fragment

This comment has been minimized.

Copy link
@koadman

koadman Mar 10, 2014

Author Owner

frags bigger than 1k don't sequence very well on the MiSeq

This comment has been minimized.

Copy link
@cerebis

cerebis Mar 10, 2014

Collaborator

Practicalities are good.

skipCount += 1
continue

read1 = makeRead(fragment, True, readLength)
read1.id = "frg" + str(fragCount) + "fwd"
Expand All @@ -476,4 +497,4 @@ def makeConstrainedPartB(partA):
# Close output file before exit
hOutput.close()

print "Ignored " + str(skipCount) + " fragments due to short length"
print "Ignored " + str(skipCount) + " fragments due to length restrictions"

0 comments on commit baa6ea8

Please sign in to comment.