Skip to content

Commit

Permalink
Made interactiveAnalysis an option in the main script.
Browse files Browse the repository at this point in the history
  • Loading branch information
ecbush committed Jun 11, 2018
1 parent 8794555 commit 1db650c
Show file tree
Hide file tree
Showing 4 changed files with 153 additions and 167 deletions.
36 changes: 36 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,42 @@ The last two steps, printAnalysis and createIslandBed make the output files rele

* ``createIslandBed`` creates a subdirectory called bed/ containing bed files for each genome showing the islands in different colors.

Interactive analysis
~~~~~~~~~~~~~~~~~~~~

After you have done runAll, it is possible to bring up the interpreter for interactive analysis::

xenoGI params.py interactiveAnalysis
From within python, you can then run functions such as

* printIslandsAtNode

Usage::

printIslandsAtNode('i0') # All islands at node i0
printIslandsAtNode('E_coli_K12') # All islands on the E. coli K12 branch

* findIsland

Usage::
findIsland('gadA') # Find an island associated with a gene name or description``
* printIsland

If we've identified an island of interest (for example island number 3500) then we can print it like this::

printIsland(3500,10) # First argument is island id, second is the number of genes to print to each side
printIsland prints the island in each strain where it's present. Its output includes the island and family numbers for each gene, an error score for the family of each gene, the most recent common ancestor (mrca) of the family, and a description of the gene. The error score is intended to indicate confidence in the correctness of the family. 0 means more confident, higher numbers less confident.

* printFam

Print scores within a particular gene family, and also with similar genes not in the family::
printFam(3500)



Additional files
Expand Down
127 changes: 0 additions & 127 deletions misc/interactiveAnalysis.py

This file was deleted.

37 changes: 0 additions & 37 deletions misc/miscDocumentation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -40,43 +40,6 @@ Tools for visualization with the IGB browser
python3 path-to-xenoGI-github-repository/misc/createIslandGffs.py params.py


interactiveAnalysis.py
----------------------

This script does some interactive analysis from within the interpreter::

python3 -i path-to-xenoGI-github-repository/misc/interactiveAnalysis.py params.py

From within python, you can then run functions such as

* printIslandsAtNode

Usage::

printIslandsAtNode('i0') # All islands at node i0
printIslandsAtNode('E_coli_K12') # All islands on the E. coli K12 branch

* findIsland

Usage::
findIsland('gadA') # Find an island associated with a gene name or description``
* printIsland

If we've identified an island of interest (for example island number 3500) then we can print it like this::

printIsland(3500,10) # First argument is island id, second is the number of genes to print to each side
printIsland prints the island in each strain where it's present. Its output includes the island and family numbers for each gene, an error score for the family of each gene, the most recent common ancestor (mrca) of the family, and a description of the gene. The error score is intended to indicate confidence in the correctness of the family. 0 means more confident, higher numbers less confident.

* printFam

Print scores within a particular gene family, and also with similar genes not in the family::
printFam(3500)


Obtaining a tree if you don't already have one
-----------------------------------------------

Expand Down
120 changes: 117 additions & 3 deletions xenoGI/xenoGI.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,14 @@ def main():
assert(len(sys.argv) == 3)
paramFN=sys.argv[1]
task = sys.argv[2]
assert(task in ['parseGenbank', 'runBlast', 'calcScores', 'makeFamilies', 'makeIslands', 'printAnalysis', 'createIslandBed', 'runAll'])
assert(task in ['parseGenbank', 'runBlast', 'calcScores', 'makeFamilies', 'makeIslands', 'printAnalysis', 'createIslandBed', 'interactiveAnalysis', 'runAll'])

except:
print(
"""
Exactly two arguments required.
1. A path to a parameter file.
2. The task to be run which must be one of: parseGenbank, runBlast, calcScores, makeFamilies, makeIslands, printAnalysis, createIslandBed, or runAll.
2. The task to be run which must be one of: parseGenbank, runBlast, calcScores, makeFamilies, makeIslands, printAnalysis, createIslandBed, interactiveAnalysis, or runAll.
For example:
xenoGI params.py parseGenbank
Expand Down Expand Up @@ -56,7 +56,7 @@ def main():
#### createIslandBed
elif task == 'createIslandBed':
createIslandBedWrapper(paramD)

#### runAll
elif task == 'runAll':
parseGenbankWrapper(paramD)
Expand All @@ -67,6 +67,10 @@ def main():
printAnalysisWrapper(paramD)
createIslandBedWrapper(paramD)

#### interactiveAnalysis
elif task == 'interactiveAnalysis':
interactiveAnalysisWrapper(paramD)


######## Task related functions

Expand Down Expand Up @@ -215,3 +219,113 @@ def createIslandBedWrapper(paramD):

islandBed.createAllBeds(islandByStrainD,geneInfoD,tree,strainNum2StrD,paramD['bedFilePath'],paramD['scoreNodeMapD'],paramD['potentialRgbL'],paramD['bedNumTries'])

def interactiveAnalysisWrapper(paramD):
"""Enter interactive mode."""

## Set up the modules a bit differently for interactive mode
import code,sys
from xenoGI.analysis import createGene2FamD,createFam2IslandD,printScoreMatrix,matchFamilyIsland,printIslandNeighb,vPrintIslands,coreNonCoreCtAtNode,printOutsideFamilyScores

## Wrapper analysis functions. For convenience these assume a
## bunch of global variables.

def printFam(familyNum,fileF=sys.stdout):
'''This is a wrapper to provide an easy way to print relevant info on
a family. For ease of use, we take only one argument, assuming all the
other required stuff is available at the top level. familyNum is the
numerical identifier of a family.
'''

print("Family error score (count of possibly misassigned genes):",familyL[familyNum].possibleErrorCt,file=fileF)

print(file=fileF)
print("Matrix of raw similarity scores [0,1] between genes in the family",file=fileF)
printScoreMatrix(familyNum,subtreeL,familyL,geneNames,scoresO,'rawSc',fileF)
print(file=fileF)
print(file=fileF)

print(file=fileF)
print("Matrix of normalized similarity scores between genes in the family",file=fileF)
printScoreMatrix(familyNum,subtreeL,familyL,geneNames,scoresO,'normSc',fileF)
print(file=fileF)
print(file=fileF)

print("Matrix of core synteny scores [0,1] between genes in the family",file=fileF)
printScoreMatrix(familyNum,subtreeL,familyL,geneNames,scoresO,'coreSynSc',fileF)
print(file=fileF)
print(file=fileF)

print("Matrix of synteny scores between genes in the family",file=fileF)
printScoreMatrix(familyNum,subtreeL,familyL,geneNames,scoresO,'synSc',fileF)
print(file=fileF)
print(file=fileF)

printOutsideFamilyScores(familyNum,subtreeL,familyL,geneNames,scoresO,fileF)
print(file=fileF)
print(file=fileF)

def findIsland(searchStr,fileF=sys.stdout):
'''Print the gene, family and island associated with searchStr. This
is a wrapper that assumes various required objects are present at the
top level.'''
L=matchFamilyIsland(geneInfoD,geneNames,gene2FamD,fam2IslandD,searchStr)
for geneName,fam,isl in L:
print("<gene:"+str(geneName)+">","<family:"+str(fam)+">","<island:"+str(isl)+">",file=fileF)


def printIsland(islandNum,synWSize,fileF=sys.stdout):
'''Print the island and its genomic context in each species. We
include synWSize/2 genes in either direction beyond the island.
'''
printIslandNeighb(islandNum,synWSize,subtreeL,islandByNodeL,familyL,geneOrderT,gene2FamD,fam2IslandD,geneInfoD,geneNames,strainNum2StrD,fileF)


def printIslandsAtNode(nodeStr,fileF=sys.stdout):
'''This is a wrapper to provide an easy way to print all the islands
at a particular node in the tree. For ease of use, we take only a node
number as argument, assuming all the other required stuff is available
at the top level.
'''
node = strainStr2NumD[nodeStr]
vPrintIslands(islandByNodeL[node],subtreeL,familyL,strainNum2StrD,geneNames,geneInfoD,fileF)


def printCoreNonCoreByNode(fileF=sys.stdout):
'''For each node in the focal clade, print the number of core and
non-core families. That is, for that node, we get all families present
in descendent species. We then look at their mrcas. Those with an mrca
below the node in question are non-core, others are core.'''

focTree = trees.subtree(tree,strainStr2NumD[paramD['rootFocalClade']])
focInodesL=trees.iNodeList(focTree)
familyByNodeL=createFamilyByNodeL(geneOrderT,gene2FamD)

rowL=[]
rowL.append(['Node','Core','Non-Core','Total','% Non-Core'])
rowL.append(['----','----','--------','-----','----------'])
for node in focInodesL:
nonCore,core=coreNonCoreCtAtNode(tree,node,familyByNodeL,familyL)
rowL.append([strainNum2StrD[node],str(core),str(nonCore),str(core+nonCore),str(format(nonCore/(core+nonCore),".3f"))])
printTable(rowL,fileF=fileF)
print(file=fileF)
return


## Load data

tree,strainStr2NumD,strainNum2StrD,geneNames,subtreeL,geneOrderT = loadMiscDataStructures(paramD)
nodesL=trees.nodeList(tree)
geneNames = genomes.geneNames(paramD['geneOrderFN'],strainStr2NumD,strainNum2StrD)
geneInfoD = genomes.readGeneInfoD(paramD['geneInfoFN'])
familyL = families.readFamilies(paramD['familyFN'],tree,geneNames,strainStr2NumD)
islandByNodeL=islands.readIslands(paramD['islandOutFN'],tree,strainStr2NumD)
gene2FamD=createGene2FamD(familyL)
fam2IslandD=createFam2IslandD(islandByNodeL)
geneOrderT=genomes.createGeneOrderTs(paramD['geneOrderFN'],geneNames,subtreeL,strainStr2NumD)
scoresO = scores.readScores(paramD['scoresFN'],geneNames)
scoresO.createNodeConnectL(geneNames) # make nodeConnectL attribute
# calc family error scores
families.calcErrorScores(familyL,scoresO,paramD['minNormThresh'],paramD['minCoreSynThresh'],paramD['minSynThresh'],paramD['famErrorScoreIncrementD'])

code.interact(local=locals())

0 comments on commit 1db650c

Please sign in to comment.