From 1db650c2860d9be028ffccccd463cfb25d454f1c Mon Sep 17 00:00:00 2001 From: Eliot Bush Date: Mon, 11 Jun 2018 09:32:48 -0400 Subject: [PATCH] Made interactiveAnalysis an option in the main script. --- README.rst | 36 ++++++++++ misc/interactiveAnalysis.py | 127 ------------------------------------ misc/miscDocumentation.rst | 37 ----------- xenoGI/xenoGI.py | 120 +++++++++++++++++++++++++++++++++- 4 files changed, 153 insertions(+), 167 deletions(-) delete mode 100644 misc/interactiveAnalysis.py diff --git a/README.rst b/README.rst index 8715544..4374028 100644 --- a/README.rst +++ b/README.rst @@ -114,6 +114,42 @@ The last two steps, printAnalysis and createIslandBed make the output files rele * ``createIslandBed`` creates a subdirectory called bed/ containing bed files for each genome showing the islands in different colors. +Interactive analysis +~~~~~~~~~~~~~~~~~~~~ + +After you have done runAll, it is possible to bring up the interpreter for interactive analysis:: + + xenoGI params.py interactiveAnalysis + +From within python, you can then run functions such as + +* printIslandsAtNode + + Usage:: + + printIslandsAtNode('i0') # All islands at node i0 + printIslandsAtNode('E_coli_K12') # All islands on the E. coli K12 branch + +* findIsland + + Usage:: + + findIsland('gadA') # Find an island associated with a gene name or description`` + +* printIsland + + If we've identified an island of interest (for example island number 3500) then we can print it like this:: + + printIsland(3500,10) # First argument is island id, second is the number of genes to print to each side + + printIsland prints the island in each strain where it's present. Its output includes the island and family numbers for each gene, an error score for the family of each gene, the most recent common ancestor (mrca) of the family, and a description of the gene. The error score is intended to indicate confidence in the correctness of the family. 0 means more confident, higher numbers less confident. + +* printFam + + Print scores within a particular gene family, and also with similar genes not in the family:: + + printFam(3500) + Additional files diff --git a/misc/interactiveAnalysis.py b/misc/interactiveAnalysis.py deleted file mode 100644 index 5f9fd6a..0000000 --- a/misc/interactiveAnalysis.py +++ /dev/null @@ -1,127 +0,0 @@ -import sys,os -sys.path.append(os.path.join(sys.path[0],'..')) -from xenoGI import parameters,genomes,trees,families,scores,islands -from xenoGI.analysis import * - -## Wrapper functions -# these should be here, as they assume a bunch of global variables. - -def printFam(familyNum,fileF=sys.stdout): - '''This is a wrapper to provide an easy way to print relevant info on -a family. For ease of use, we take only one argument, assuming all the -other required stuff is available at the top level. familyNum is the -numerical identifier of a family. - ''' - - print("Family error score (count of possibly misassigned genes):",familyL[familyNum].possibleErrorCt,file=fileF) - - print(file=fileF) - print("Matrix of raw similarity scores [0,1] between genes in the family",file=fileF) - printScoreMatrix(familyNum,subtreeL,familyL,geneNames,scoresO,'rawSc',fileF) - print(file=fileF) - print(file=fileF) - - print(file=fileF) - print("Matrix of normalized similarity scores between genes in the family",file=fileF) - printScoreMatrix(familyNum,subtreeL,familyL,geneNames,scoresO,'normSc',fileF) - print(file=fileF) - print(file=fileF) - - print("Matrix of core synteny scores [0,1] between genes in the family",file=fileF) - printScoreMatrix(familyNum,subtreeL,familyL,geneNames,scoresO,'coreSynSc',fileF) - print(file=fileF) - print(file=fileF) - - print("Matrix of synteny scores between genes in the family",file=fileF) - printScoreMatrix(familyNum,subtreeL,familyL,geneNames,scoresO,'synSc',fileF) - print(file=fileF) - print(file=fileF) - - printOutsideFamilyScores(familyNum,subtreeL,familyL,geneNames,scoresO,fileF) - print(file=fileF) - print(file=fileF) - -def findIsland(searchStr,fileF=sys.stdout): - '''Print the gene, family and island associated with searchStr. This -is a wrapper that assumes various required objects are present at the -top level.''' - L=matchFamilyIsland(geneInfoD,geneNames,gene2FamD,fam2IslandD,searchStr) - for geneName,fam,isl in L: - print("","","",file=fileF) - - -def printIsland(islandNum,synWSize,fileF=sys.stdout): - '''Print the island and its genomic context in each species. We - include synWSize/2 genes in either direction beyond the island. - ''' - printIslandNeighb(islandNum,synWSize,subtreeL,islandByNodeL,familyL,geneOrderT,gene2FamD,fam2IslandD,geneInfoD,geneNames,strainNum2StrD,fileF) - - -def printIslandsAtNode(nodeStr,fileF=sys.stdout): - '''This is a wrapper to provide an easy way to print all the islands -at a particular node in the tree. For ease of use, we take only a node -number as argument, assuming all the other required stuff is available -at the top level. - ''' - node = strainStr2NumD[nodeStr] - vPrintIslands(islandByNodeL[node],subtreeL,familyL,strainNum2StrD,geneNames,fileF) - - -def printCoreNonCoreByNode(fileF=sys.stdout): - '''For each node in the focal clade, print the number of core and -non-core families. That is, for that node, we get all families present -in descendent species. We then look at their mrcas. Those with an mrca -below the node in question are non-core, others are core.''' - - focTree = trees.subtree(tree,strainStr2NumD[paramD['rootFocalClade']]) - focInodesL=trees.iNodeList(focTree) - familyByNodeL=createFamilyByNodeL(geneOrderT,gene2FamD) - - rowL=[] - rowL.append(['Node','Core','Non-Core','Total','% Non-Core']) - rowL.append(['----','----','--------','-----','----------']) - for node in focInodesL: - nonCore,core=coreNonCoreCtAtNode(tree,node,familyByNodeL,familyL) - rowL.append([strainNum2StrD[node],str(core),str(nonCore),str(core+nonCore),str(format(nonCore/(core+nonCore),".3f"))]) - printTable(rowL,fileF=fileF) - print(file=fileF) - return - - -if __name__ == "__main__": - - paramFN=sys.argv[1] - paramD = parameters.loadParametersD(paramFN) - - tree,strainStr2NumD,strainNum2StrD = trees.readTree(paramD['treeFN']) - - # load islands - islandByNodeL=islands.readIslands(paramD['islandOutFN'],tree,strainStr2NumD) - - # get familyL etc. - geneNames = genomes.geneNames(paramD['geneOrderFN'],strainStr2NumD,strainNum2StrD) - - - geneInfoD = genomes.readGeneInfoD(paramD['geneInfoFN']) - - familyL = families.readFamilies(paramD['familyFN'],tree,geneNames,strainStr2NumD) - - gene2FamD=createGene2FamD(familyL) - fam2IslandD=createFam2IslandD(islandByNodeL) - - # subtree list - subtreeL=trees.createSubtreeL(tree) - subtreeL.sort() - nodesL=trees.nodeList(tree) - - - geneOrderT=genomes.createGeneOrderTs(paramD['geneOrderFN'],geneNames,subtreeL,strainStr2NumD) - - # scores - scoresO = scores.readScores(paramD['scoresFN'],geneNames) - scoresO.createNodeConnectL(geneNames) # make nodeConnectL attribute - - # calc family error scores - families.calcErrorScores(familyL,scoresO,paramD['minNormThresh'],paramD['minCoreSynThresh'],paramD['minSynThresh'],paramD['famErrorScoreIncrementD']) - - diff --git a/misc/miscDocumentation.rst b/misc/miscDocumentation.rst index cc0f099..1340f8c 100644 --- a/misc/miscDocumentation.rst +++ b/misc/miscDocumentation.rst @@ -40,43 +40,6 @@ Tools for visualization with the IGB browser python3 path-to-xenoGI-github-repository/misc/createIslandGffs.py params.py -interactiveAnalysis.py ----------------------- - -This script does some interactive analysis from within the interpreter:: - - python3 -i path-to-xenoGI-github-repository/misc/interactiveAnalysis.py params.py - -From within python, you can then run functions such as - -* printIslandsAtNode - - Usage:: - - printIslandsAtNode('i0') # All islands at node i0 - printIslandsAtNode('E_coli_K12') # All islands on the E. coli K12 branch - -* findIsland - - Usage:: - - findIsland('gadA') # Find an island associated with a gene name or description`` - -* printIsland - - If we've identified an island of interest (for example island number 3500) then we can print it like this:: - - printIsland(3500,10) # First argument is island id, second is the number of genes to print to each side - - printIsland prints the island in each strain where it's present. Its output includes the island and family numbers for each gene, an error score for the family of each gene, the most recent common ancestor (mrca) of the family, and a description of the gene. The error score is intended to indicate confidence in the correctness of the family. 0 means more confident, higher numbers less confident. - -* printFam - - Print scores within a particular gene family, and also with similar genes not in the family:: - - printFam(3500) - - Obtaining a tree if you don't already have one ----------------------------------------------- diff --git a/xenoGI/xenoGI.py b/xenoGI/xenoGI.py index 43343d8..d0d8477 100644 --- a/xenoGI/xenoGI.py +++ b/xenoGI/xenoGI.py @@ -10,14 +10,14 @@ def main(): assert(len(sys.argv) == 3) paramFN=sys.argv[1] task = sys.argv[2] - assert(task in ['parseGenbank', 'runBlast', 'calcScores', 'makeFamilies', 'makeIslands', 'printAnalysis', 'createIslandBed', 'runAll']) + assert(task in ['parseGenbank', 'runBlast', 'calcScores', 'makeFamilies', 'makeIslands', 'printAnalysis', 'createIslandBed', 'interactiveAnalysis', 'runAll']) except: print( """ Exactly two arguments required. 1. A path to a parameter file. - 2. The task to be run which must be one of: parseGenbank, runBlast, calcScores, makeFamilies, makeIslands, printAnalysis, createIslandBed, or runAll. + 2. The task to be run which must be one of: parseGenbank, runBlast, calcScores, makeFamilies, makeIslands, printAnalysis, createIslandBed, interactiveAnalysis, or runAll. For example: xenoGI params.py parseGenbank @@ -56,7 +56,7 @@ def main(): #### createIslandBed elif task == 'createIslandBed': createIslandBedWrapper(paramD) - + #### runAll elif task == 'runAll': parseGenbankWrapper(paramD) @@ -67,6 +67,10 @@ def main(): printAnalysisWrapper(paramD) createIslandBedWrapper(paramD) + #### interactiveAnalysis + elif task == 'interactiveAnalysis': + interactiveAnalysisWrapper(paramD) + ######## Task related functions @@ -215,3 +219,113 @@ def createIslandBedWrapper(paramD): islandBed.createAllBeds(islandByStrainD,geneInfoD,tree,strainNum2StrD,paramD['bedFilePath'],paramD['scoreNodeMapD'],paramD['potentialRgbL'],paramD['bedNumTries']) +def interactiveAnalysisWrapper(paramD): + """Enter interactive mode.""" + + ## Set up the modules a bit differently for interactive mode + import code,sys + from xenoGI.analysis import createGene2FamD,createFam2IslandD,printScoreMatrix,matchFamilyIsland,printIslandNeighb,vPrintIslands,coreNonCoreCtAtNode,printOutsideFamilyScores + + ## Wrapper analysis functions. For convenience these assume a + ## bunch of global variables. + + def printFam(familyNum,fileF=sys.stdout): + '''This is a wrapper to provide an easy way to print relevant info on + a family. For ease of use, we take only one argument, assuming all the + other required stuff is available at the top level. familyNum is the + numerical identifier of a family. + ''' + + print("Family error score (count of possibly misassigned genes):",familyL[familyNum].possibleErrorCt,file=fileF) + + print(file=fileF) + print("Matrix of raw similarity scores [0,1] between genes in the family",file=fileF) + printScoreMatrix(familyNum,subtreeL,familyL,geneNames,scoresO,'rawSc',fileF) + print(file=fileF) + print(file=fileF) + + print(file=fileF) + print("Matrix of normalized similarity scores between genes in the family",file=fileF) + printScoreMatrix(familyNum,subtreeL,familyL,geneNames,scoresO,'normSc',fileF) + print(file=fileF) + print(file=fileF) + + print("Matrix of core synteny scores [0,1] between genes in the family",file=fileF) + printScoreMatrix(familyNum,subtreeL,familyL,geneNames,scoresO,'coreSynSc',fileF) + print(file=fileF) + print(file=fileF) + + print("Matrix of synteny scores between genes in the family",file=fileF) + printScoreMatrix(familyNum,subtreeL,familyL,geneNames,scoresO,'synSc',fileF) + print(file=fileF) + print(file=fileF) + + printOutsideFamilyScores(familyNum,subtreeL,familyL,geneNames,scoresO,fileF) + print(file=fileF) + print(file=fileF) + + def findIsland(searchStr,fileF=sys.stdout): + '''Print the gene, family and island associated with searchStr. This + is a wrapper that assumes various required objects are present at the + top level.''' + L=matchFamilyIsland(geneInfoD,geneNames,gene2FamD,fam2IslandD,searchStr) + for geneName,fam,isl in L: + print("","","",file=fileF) + + + def printIsland(islandNum,synWSize,fileF=sys.stdout): + '''Print the island and its genomic context in each species. We + include synWSize/2 genes in either direction beyond the island. + ''' + printIslandNeighb(islandNum,synWSize,subtreeL,islandByNodeL,familyL,geneOrderT,gene2FamD,fam2IslandD,geneInfoD,geneNames,strainNum2StrD,fileF) + + + def printIslandsAtNode(nodeStr,fileF=sys.stdout): + '''This is a wrapper to provide an easy way to print all the islands + at a particular node in the tree. For ease of use, we take only a node + number as argument, assuming all the other required stuff is available + at the top level. + ''' + node = strainStr2NumD[nodeStr] + vPrintIslands(islandByNodeL[node],subtreeL,familyL,strainNum2StrD,geneNames,geneInfoD,fileF) + + + def printCoreNonCoreByNode(fileF=sys.stdout): + '''For each node in the focal clade, print the number of core and + non-core families. That is, for that node, we get all families present + in descendent species. We then look at their mrcas. Those with an mrca + below the node in question are non-core, others are core.''' + + focTree = trees.subtree(tree,strainStr2NumD[paramD['rootFocalClade']]) + focInodesL=trees.iNodeList(focTree) + familyByNodeL=createFamilyByNodeL(geneOrderT,gene2FamD) + + rowL=[] + rowL.append(['Node','Core','Non-Core','Total','% Non-Core']) + rowL.append(['----','----','--------','-----','----------']) + for node in focInodesL: + nonCore,core=coreNonCoreCtAtNode(tree,node,familyByNodeL,familyL) + rowL.append([strainNum2StrD[node],str(core),str(nonCore),str(core+nonCore),str(format(nonCore/(core+nonCore),".3f"))]) + printTable(rowL,fileF=fileF) + print(file=fileF) + return + + + ## Load data + + tree,strainStr2NumD,strainNum2StrD,geneNames,subtreeL,geneOrderT = loadMiscDataStructures(paramD) + nodesL=trees.nodeList(tree) + geneNames = genomes.geneNames(paramD['geneOrderFN'],strainStr2NumD,strainNum2StrD) + geneInfoD = genomes.readGeneInfoD(paramD['geneInfoFN']) + familyL = families.readFamilies(paramD['familyFN'],tree,geneNames,strainStr2NumD) + islandByNodeL=islands.readIslands(paramD['islandOutFN'],tree,strainStr2NumD) + gene2FamD=createGene2FamD(familyL) + fam2IslandD=createFam2IslandD(islandByNodeL) + geneOrderT=genomes.createGeneOrderTs(paramD['geneOrderFN'],geneNames,subtreeL,strainStr2NumD) + scoresO = scores.readScores(paramD['scoresFN'],geneNames) + scoresO.createNodeConnectL(geneNames) # make nodeConnectL attribute + # calc family error scores + families.calcErrorScores(familyL,scoresO,paramD['minNormThresh'],paramD['minCoreSynThresh'],paramD['minSynThresh'],paramD['famErrorScoreIncrementD']) + + code.interact(local=locals()) +