Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial code review of changes for hg38 alt contigs #63

Open
wants to merge 20 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 61 additions & 12 deletions SNAPLib/BaseAligner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ BaseAligner::BaseAligner(
genomeIndex(i_genomeIndex), maxHitsToConsider(i_maxHitsToConsider), maxK(i_maxK),
maxReadSize(i_maxReadSize), maxSeedsToUseFromCommandLine(i_maxSeedsToUseFromCommandLine),
maxSeedCoverage(i_maxSeedCoverage), readId(-1), extraSearchDepth(i_extraSearchDepth),
explorePopularSeeds(false), stopOnFirstHit(false), stats(i_stats),
explorePopularSeeds(false), stopOnFirstHit(false), stats(i_stats), allMatches(NULL),
noUkkonen(i_noUkkonen), noOrderedEvaluation(i_noOrderedEvaluation), noTruncation(i_noTruncation),
minWeightToCheck(max(1u, i_minWeightToCheck)), maxSecondaryAlignmentsPerContig(i_maxSecondaryAlignmentsPerContig)
/*++
Expand Down Expand Up @@ -226,6 +226,10 @@ Routine Description:
}
hashTableEpoch = 0;

if (genome->hasAltContigs()) {
MatchInfo* entries = (MatchInfo*)allocator->allocate(sizeof(MatchInfo)* maxHitsToConsider);
allMatches = new MatchInfoVector(entries, maxHitsToConsider);
}

}

Expand Down Expand Up @@ -652,6 +656,36 @@ Return Value:
return;
}

/**
* Add up the highest-probability matches of all overlapping alternates
*/
double
BaseAligner::computeLiftedCandidateProbability(
GenomeDistance length)
{
std::sort(allMatches->begin(), allMatches->end(), matchInfoComparator);
double totalProbability = 0.0;
MatchInfo best(0, 0, 0);
GenomeLocation farthest;
for (int i = 0; i <= allMatches->size(); i++) {
MatchInfo m(0, 0, 0);
if (i == allMatches->size() || (m = (*allMatches)[i]).liftedLocation > farthest) {
totalProbability += best.matchProbability;
best = m;
}
else {
if (m.matchProbability > best.matchProbability) {
best = m;
}
GenomeLocation e = m.liftedLocation + length - 1;
if (e > farthest) {
farthest = e;
}
}
}
return totalProbability;
}

bool
BaseAligner::score(
bool forceResult,
Expand Down Expand Up @@ -744,6 +778,10 @@ Return Value:
#endif

unsigned weightListToCheck = highestUsedWeightList;
bool anyAltMatches = false;
if (allMatches != NULL) {
allMatches->clear();
}

do {
//
Expand All @@ -764,6 +802,9 @@ Return Value:
primaryResult->score = bestScore;
if (bestScore <= maxK) {
primaryResult->location = bestScoreGenomeLocation;
if (anyAltMatches) {
probabilityOfAllCandidates = computeLiftedCandidateProbability(read[0]->getDataLength());
}
primaryResult->mapq = computeMAPQ(probabilityOfAllCandidates, probabilityOfBestCandidate, bestScore, popularSeedsSkipped);
if (primaryResult->mapq >= MAPQ_LIMIT_FOR_SINGLE_HIT) {
primaryResult->status = SingleHit;
Expand Down Expand Up @@ -913,6 +954,14 @@ Return Value:
// We could mark as scored anything in between the old and new genome offsets, but it's probably not worth the effort since this is
// so rare and all it would do is same time.
//

// remember in case there are alt matches
if (allMatches != NULL) {
if ((! anyAltMatches) && genome->getLiftedLocation(genomeLocation) != genomeLocation) {
anyAltMatches = true;
}
allMatches->push_back(MatchInfo(genomeLocation, genome->getLiftedLocation(genomeLocation), matchProbability));
}
}
}
} else { // if we had genome data to compare against
Expand Down Expand Up @@ -1114,7 +1163,6 @@ Return Value:
return false;
}


void
BaseAligner::prefetchHashTableBucket(GenomeLocation genomeLocation, Direction direction)
{
Expand Down Expand Up @@ -1455,17 +1503,18 @@ BaseAligner::getBigAllocatorReservation(GenomeIndex *index, bool ownLandauVishki
}

return
contigCounters +
sizeof(_uint64) * 14 + // allow for alignment
sizeof(BaseAligner) + // our own member variables
contigCounters +
sizeof(_uint64)* 14 + // allow for alignment
sizeof(BaseAligner)+ // our own member variables
(ownLandauVishkin ?
LandauVishkin<>::getBigAllocatorReservation() +
LandauVishkin<-1>::getBigAllocatorReservation() : 0) + // our LandauVishkin objects
sizeof(char) * maxReadSize * 2 + // rcReadData
sizeof(char) * maxReadSize * 4 + 2 * MAX_K + // reversed read (both)
sizeof(BYTE) * (maxReadSize + 7 + 128) / 8 + // seed used
sizeof(HashTableElement) * hashTableElementPoolSize + // hash table element pool
sizeof(HashTableAnchor) * candidateHashTablesSize * 2 + // candidate hash table (both)
LandauVishkin<>::getBigAllocatorReservation() +
LandauVishkin<-1>::getBigAllocatorReservation() : 0) + // our LandauVishkin objects
sizeof(char)* maxReadSize * 2 + // rcReadData
sizeof(char)* maxReadSize * 4 + 2 * MAX_K + // reversed read (both)
sizeof(BYTE)* (maxReadSize + 7 + 128) / 8 + // seed used
sizeof(HashTableElement)* hashTableElementPoolSize + // hash table element pool
sizeof(HashTableAnchor)* candidateHashTablesSize * 2 + // candidate hash table (both)
(index->getGenome()->hasAltContigs() ? sizeof(MatchInfo) * maxHitsToConsider : 0) + // matches for alt contigs
sizeof(HashTableElement) * (maxSeedsToUse + 1); // weight lists
}

Expand Down
22 changes: 22 additions & 0 deletions SNAPLib/BaseAligner.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ Revision History:
#include "AlignerStats.h"
#include "directions.h"
#include "GenomeIndex.h"
#include "VariableSizeVector.h"

extern bool doAlignerPrefetch;

Expand Down Expand Up @@ -326,6 +327,27 @@ class BaseAligner {

AlignerStats *stats;

typedef struct MatchInfo
{
GenomeLocation location;
GenomeLocation liftedLocation;
double matchProbability;

MatchInfo(GenomeLocation _loc, GenomeLocation _lifted, double _p) :
location(_loc), liftedLocation(_lifted), matchProbability(_p) {}
} MatchInfo;

static bool matchInfoComparator(const BaseAligner::MatchInfo& a, const BaseAligner::MatchInfo& b)
{
return a.liftedLocation < b.liftedLocation;
}

typedef VariableSizeVector<MatchInfo,0> MatchInfoVector;

MatchInfoVector* allMatches;

double computeLiftedCandidateProbability(GenomeDistance length);

unsigned *hitCountByExtraSearchDepth; // How many hits at each depth bigger than the current best edit distance.
// So if the current best hit has edit distance 2, then hitCountByExtraSearchDepth[0] would
// be the count of hits at edit distance 2, while hitCountByExtraSearchDepth[2] would be the count
Expand Down
125 changes: 100 additions & 25 deletions SNAPLib/FASTA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,10 @@ ReadFASTAGenome(
const char *fileName,
const char *pieceNameTerminatorCharacters,
bool spaceIsAPieceNameTerminator,
unsigned chromosomePaddingSize)
unsigned chromosomePaddingSize,
const char *chrTag,
const char *chrMapFilename,
AltContigMap* altMap)
{
//
// We need to know a bound on the size of the genome before we create the Genome object.
Expand All @@ -53,15 +56,38 @@ ReadFASTAGenome(
isValidGenomeCharacter['A'] = isValidGenomeCharacter['T'] = isValidGenomeCharacter['C'] = isValidGenomeCharacter['G'] = isValidGenomeCharacter['N'] = true;
isValidGenomeCharacter['a'] = isValidGenomeCharacter['t'] = isValidGenomeCharacter['c'] = isValidGenomeCharacter['g'] = isValidGenomeCharacter['n'] = true;

int lineBufferSize = 0;
char *lineBuffer;

map<string, string> chrMap;
if (chrMapFilename != NULL) {
FILE* mapFile = fopen(chrMapFilename, "r");
if (mapFile == NULL) {
WriteErrorMessage("Unable to open -chrmap file '%s'\n", chrMapFilename);
return NULL;
}
while (NULL != reallocatingFgets(&lineBuffer, &lineBufferSize, mapFile)) {
if (lineBuffer[0] == '#') {
continue;
}
string chrom;
for (char * token = strtok(lineBuffer, "\t\r\n"); token != NULL; token = strtok(NULL, "\t\r\n")) {
if (token == lineBuffer) {
chrom = string(token);
} else {
chrMap[string(token)] = chrom;
}
}
}
fclose(mapFile);
}

FILE *fastaFile = fopen(fileName, "r");
if (fastaFile == NULL) {
WriteErrorMessage("Unable to open FASTA file '%s' (even though we already got its size)\n",fileName);
return NULL;
}

int lineBufferSize = 0;
char *lineBuffer;

//
// Count the chromosomes
//
Expand Down Expand Up @@ -96,33 +122,59 @@ ReadFASTAGenome(
//
// Now supply the chromosome name.
//
if (NULL != pieceNameTerminatorCharacters) {
for (int i = 0; i < strlen(pieceNameTerminatorCharacters); i++) {
char *terminator = strchr(lineBuffer+1, pieceNameTerminatorCharacters[i]);
if (NULL != terminator) {
*terminator = '\0';
const char *chrName;
int chrNameLen;
if (chrTag == NULL) {
char * terminator = lineBuffer + strlen(lineBuffer);
char * p;
if (NULL != pieceNameTerminatorCharacters) {
for (int i = 0; i < strlen(pieceNameTerminatorCharacters); i++) {
p = strchr(lineBuffer + 1, pieceNameTerminatorCharacters[i]);
if (NULL != p && p < terminator) {
terminator = p;
}
}
}
}
if (spaceIsAPieceNameTerminator) {
char *terminator = strchr(lineBuffer, ' ');
if (NULL != terminator) {
*terminator = '\0';
if (spaceIsAPieceNameTerminator) {
p = strchr(lineBuffer, ' ');
if (NULL != p && p < terminator) {
terminator = p;
}
p = strchr(lineBuffer, '\t');
if (NULL != p && p < terminator) {
terminator = p;
}
}
terminator = strchr(lineBuffer, '\t');
if (NULL != terminator) {
*terminator = '\0';
p = strchr(lineBuffer, '\n');
if (NULL != p && p < terminator) {
terminator = p;
}
p = strchr(lineBuffer, '\r');
if (NULL != p && p < terminator) {
terminator = p;
}
chrName = lineBuffer + 1;
chrNameLen = (int) (terminator - lineBuffer - 1);
} else {
if (!FindFASTATagValue(lineBuffer, chrTag, &chrName, &chrNameLen)) {
WriteErrorMessage("Unable to find tag '%s' in contig '%s'\n", chrTag, lineBuffer + 1);
soft_exit(1);
}
if (chrMapFilename != NULL) {
map<string,string>::iterator mapped = chrMap.find(string(chrName, chrName + chrNameLen));
if (mapped != chrMap.end()) {
chrName = mapped->second.data();
chrNameLen = (int) mapped->second.length();
}
}
}
char *terminator = strchr(lineBuffer, '\n');
if (NULL != terminator) {
*terminator = '\0';
}
terminator = strchr(lineBuffer, '\r');
if (NULL != terminator) {
*terminator = '\0';
if (altMap != NULL) {
altMap->addFastaContig(lineBuffer, chrName, chrNameLen);
}
genome->startContig(lineBuffer+1);
char *contigName = (char*) malloc(chrNameLen + 1);
memcpy(contigName, chrName, chrNameLen);
contigName[chrNameLen] = '\0';
genome->startContig(contigName, altMap);
} else {
if (!inAContig) {
WriteErrorMessage("\nFASTA file doesn't beging with a contig name (i.e., the first line doesn't start with '>').\n");
Expand Down Expand Up @@ -170,6 +222,9 @@ ReadFASTAGenome(
//
genome->addData(paddingBuffer);
genome->fillInContigLengths();
if (altMap != NULL) {
genome->adjustAltContigs(altMap);
}
genome->sortContigsByName();

fclose(fastaFile);
Expand Down Expand Up @@ -198,3 +253,23 @@ bool AppendFASTAGenome(const Genome *genome, FILE *fasta, const char *prefix="")
}
return !ferror(fasta);
}

bool
FindFASTATagValue(const char* lineBuffer, const char* tagName, const char ** pTagValue, int * pValueLength)
{
const char *tag = lineBuffer;
do {
tag = strstr(tag + 1, tagName);
if (tag == NULL) {
return false;
}
} while (tag[-1] != '>' && tag[-1] != '|' && tag[strlen(tagName)] != '|');
*pTagValue = tag + strlen(tagName) + 1; // Format is "tag|value|
const char *tagValueEnd = strchr(*pTagValue, '|');
if (tagValueEnd == NULL) {
WriteErrorMessage("Badly formatted tag '%s' in contig '%s'\n", tag, lineBuffer + 1);
soft_exit(1);
}
*pValueLength = (int) (tagValueEnd - *pTagValue);
return true;
}
12 changes: 4 additions & 8 deletions SNAPLib/FASTA.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ Revision History:
#include "Genome.h"

const Genome *
ReadFASTAGenome(const char *fileName, const char *pieceNameTerminatorCharacters, bool spaceIsAPieceNameTerminator, unsigned chromosomePaddingSize);
ReadFASTAGenome(const char *fileName, const char *pieceNameTerminatorCharacters, bool spaceIsAPieceNameTerminator, unsigned chromosomePaddingSize, const char* chrTag, const char* chrMapFilename, AltContigMap* altMap);

//
// The FASTA appending functions return whether the write was successful.
Expand All @@ -40,10 +40,6 @@ ReadFASTAGenome(const char *fileName, const char *pieceNameTerminatorCharacters,
bool
AppendFASTAGenome(const Genome *, FILE *fasta);

//
// This is arbitrary; is there some existing convention?
//
inline const char *diploidFASTASexPrefix(bool male)
{
return male ? "PATERNAL|" : "MATERNAL|";
}
// utility for parsing FASTA tags
bool
FindFASTATagValue(const char* lineBuffer, const char* tag, const char ** pTagValue, int * pValueLength);
Loading