Skip to content

Commit

Permalink
remove unessary copies
Browse files Browse the repository at this point in the history
  • Loading branch information
jaebeom-kim committed Dec 8, 2023
1 parent 79acbc8 commit 79e743d
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 64 deletions.
12 changes: 2 additions & 10 deletions src/commons/KmerMatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -331,11 +331,6 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI
}
for (int k = 0; k < currMatchNum; k++) {
idx = selectedMatches[k];
// Check if candidateKmerInfos[idx].sequenceID is valid
// if (taxId2genusId.find(candidateKmerInfos[idx].sequenceID) == taxId2genusId.end() ||
// taxId2speciesId.find(candidateKmerInfos[idx].sequenceID) == taxId2speciesId.end()) {
// cout << "Error: " << candidateKmerInfos[idx].sequenceID << " is not found in the taxonomy database." << endl;
// }
matches[matchCnt] = {queryKmerList[j].info,
candidateKmerInfos[idx].sequenceID,
taxId2speciesId[candidateKmerInfos[idx].sequenceID],
Expand Down Expand Up @@ -474,10 +469,6 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI
free(splitCheckList);
queryKmerNum = 0;

#ifdef OPENMP
omp_set_num_threads(threads);
#endif

totalMatchCnt += matchBuffer->startIndexOfReserve;
return 1;
}
Expand Down Expand Up @@ -520,8 +511,9 @@ void KmerMatcher::compareDna(uint64_t query,
}

// Select target k-mers that passed hamming criteria
uint8_t maxHamming = min(minHammingSum * 2, 7);
for (size_t h = 0; h < size; h++) {
if (hammingSums[h] <= min(minHammingSum * 2, 7)) {
if (hammingSums[h] <= maxHamming) {
selectedMatches.push_back(h);
selectedHammingSum.push_back(hammingSums[h]);
if (frame < 3) { // Frame of query k-mer
Expand Down
6 changes: 3 additions & 3 deletions src/commons/LocalUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,11 @@ void LocalUtil::splitQueryFile(std::vector<SequenceBlock> & sequences, const std

int LocalUtil::getMaxCoveredLength(int queryLength) {
if (queryLength % 3 == 2) {
return queryLength - 2; // 2
return queryLength - 2;
} else if (queryLength % 3 == 1) {
return queryLength - 4; // 4
return queryLength - 4;
} else {
return queryLength - 3; // 3
return queryLength - 3;
}
}

Expand Down
7 changes: 4 additions & 3 deletions src/commons/QueryIndexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ QueryIndexer::QueryIndexer(const LocalParameters & par) {
maxRam = par.ramUsage;
threads = par.threads;
bytesPerKmer = sizeof(QueryKmer) + matchPerKmer * sizeof(Match);
// std::cout << "bytesPerKmer: " << bytesPerKmer << "\n";
readNum_1 = 0;
readNum_2 = 0;
spaceNum = par.spaceMask.length() - kmerLength;
Expand All @@ -25,6 +26,7 @@ QueryIndexer::QueryIndexer(const LocalParameters & par) {
void QueryIndexer::setAvailableRam() {
availableRam = ((size_t) maxRam * (size_t) 1024 * 1024 * 1024)
- ((size_t) 128 * 1024 * 1024 * (size_t) threads);
// std::cout << "availableRam: " << availableRam << "\n";
}

void QueryIndexer::indexQueryFile() {
Expand All @@ -40,7 +42,6 @@ void QueryIndexer::indexQueryFile() {
totalReadLength += kseq->entry.sequence.l;
size_t currentKmerCnt = LocalUtil::getQueryKmerNumber<size_t>(kseq->entry.sequence.l, spaceNum);
kmerCnt += currentKmerCnt;
// std::cout << "currentKmerCnt: " << kmerCnt << "\n";

if (bytesPerKmer * kmerCnt + ((size_t) 200 * seqCnt) > availableRam) {
querySplits.emplace_back(start, readNum_1 - 1, kmerCnt - currentKmerCnt);
Expand Down Expand Up @@ -72,7 +73,6 @@ void QueryIndexer::indexQueryFile() {
seqCnt_1++;
totalReadLength += kseq_1->entry.sequence.l;
currentKmerCnt = LocalUtil::getQueryKmerNumber<size_t>(kseq_1->entry.sequence.l, spaceNum);
kmerCnt += currentKmerCnt;
} else {
end = true;
}
Expand All @@ -82,7 +82,7 @@ void QueryIndexer::indexQueryFile() {
seqCnt_2++;
totalReadLength += kseq_2->entry.sequence.l;
currentKmerCnt += LocalUtil::getQueryKmerNumber<size_t>(kseq_2->entry.sequence.l, spaceNum);
kmerCnt += currentKmerCnt;

} else {
end = true;
}
Expand All @@ -92,6 +92,7 @@ void QueryIndexer::indexQueryFile() {
EXIT(EXIT_FAILURE);
}

kmerCnt += currentKmerCnt;
if (bytesPerKmer * kmerCnt + ((size_t) 200 * seqCnt_1) > availableRam) {
querySplits.emplace_back(start, readNum_1 - 1, kmerCnt - currentKmerCnt);
kmerCnt = currentKmerCnt;
Expand Down
70 changes: 25 additions & 45 deletions src/commons/Taxonomer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,13 +89,6 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery,
const Match *matchList,
vector<Query> & queryList,
const LocalParameters &par) {

// if (true) {
// cout << "# " << currentQuery << " " << queryList[currentQuery].name << endl;
// for (size_t i = offset; i < end + 1; i++) {
// cout << matchList[i].targetId << " " << matchList[i].qInfo.frame << " " << matchList[i].qInfo.pos << " " << int(matchList[i].hamming) << " " << int(matchList[i].redundancy) << endl;
// }
// }
// Get the best species for current query
vector<Match> speciesMatches;
speciesMatches.reserve(end - offset + 1);
Expand All @@ -109,14 +102,6 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery,
queryList[currentQuery].queryLength);
}

// if (true) {
// cout << "# " << currentQuery << " " << queryList[currentQuery].name << " filtered\n";
// for (size_t i = 0; i < genusMatches.size(); i++) {
// cout << genusMatches[i].targetId << " " << genusMatches[i].qInfo.frame << " " << genusMatches[i].qInfo.pos << " " << int(genusMatches[i].hamming) << " " << int(genusMatches[i].redundancy) << endl;
// }
// cout << "Genus score: " << genusScore.score << "\n";
// }

// If there is no proper species for current query, it is un-classified.
if (speciesScore.score == 0 || speciesScore.score < par.minScore) {
queryList[currentQuery].isClassified = false;
Expand All @@ -138,8 +123,12 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery,
return;
}

// Filter redundant matches
filterRedundantMatches(speciesMatches, queryList[currentQuery].taxCnt);
// Filter redundant matches
unordered_map<TaxID, unsigned int> taxCnt;
filterRedundantMatches(speciesMatches, taxCnt);
for (auto & tax : taxCnt) {
queryList[currentQuery].taxCnt[tax.first] = tax.second;
}

// If score is not enough, classify to the parent of the selected species
if (speciesScore.score < par.minSpScore) {
Expand All @@ -153,7 +142,7 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery,
}

// Lower rank classification
TaxID result = lowerRankClassification(queryList[currentQuery].taxCnt,
TaxID result = lowerRankClassification(taxCnt,
speciesScore.taxId,
queryList[currentQuery].queryLength + queryList[currentQuery].queryLength2);

Expand All @@ -164,21 +153,10 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery,
queryList[currentQuery].coverage = speciesScore.coverage;
queryList[currentQuery].hammingDist = speciesScore.hammingDist;
queryList[currentQuery].newSpecies = false;
// if (par.printLog) {
// cout << "# " << currentQuery << endl;
// for (size_t i = 0; i < genusMatches.size(); i++) {
// cout << i << " " << genusMatches[i].qInfo.pos << " " <<
// genusMatches[i].targetId << " " << int(genusMatches[i].hamming) << endl;
// }
// cout << "Score: " << speciesScore.score << " " << selectedSpecies << " "
// << taxonomy->getString(taxonomy->taxonNode(selectedSpecies)->rankIdx)
//
// << endl;
// }
}

void Taxonomer::filterRedundantMatches(vector<Match> & speciesMatches,
map<TaxID, int > & taxCnt) {
unordered_map<TaxID, unsigned int> & taxCnt) {
// Sort matches by the coordinate on the query
sort(speciesMatches.begin(), speciesMatches.end(),
[](const Match & a, const Match & b) { return a.qInfo.pos < b.qInfo.pos; });
Expand All @@ -204,7 +182,7 @@ void Taxonomer::filterRedundantMatches(vector<Match> & speciesMatches,
}
}

TaxID Taxonomer::lowerRankClassification(const map<TaxID, int> & taxCnt, TaxID spTaxId, int queryLength) {
TaxID Taxonomer::lowerRankClassification(const unordered_map<TaxID, unsigned int> & taxCnt, TaxID spTaxId, int queryLength) {
unsigned int maxCnt = (queryLength - 1)/denominator + 1;
unordered_map<TaxID, TaxonCounts> cladeCnt;
getSpeciesCladeCounts(taxCnt, cladeCnt, spTaxId);
Expand All @@ -225,7 +203,7 @@ TaxID Taxonomer::lowerRankClassification(const map<TaxID, int> & taxCnt, TaxID s
}
}

void Taxonomer::getSpeciesCladeCounts(const map<TaxID, int> &taxCnt,
void Taxonomer::getSpeciesCladeCounts(const unordered_map<TaxID, unsigned int> &taxCnt,
unordered_map<TaxID, TaxonCounts> & cladeCount,
TaxID speciesTaxID) {
for (auto it = taxCnt.begin(); it != taxCnt.end(); ++it) {
Expand Down Expand Up @@ -475,9 +453,11 @@ float Taxonomer::combineMatchPaths(vector<MatchPath> & matchPaths,
// 1. Add the matchPath with the highest score to combinedMatchPaths
// 2. Add the matchPath with the highest score that is not overlapped with the matchPath in combinedMatchPaths
// 3. Repeat 2 until no matchPath can be added
float score = 0;
for (size_t i = 0; i < matchPaths.size(); i++) {
if (combinedMatchPaths.empty()) {
combinedMatchPaths.push_back(matchPaths[i]);
score += matchPaths[i].score;
} else {
bool isOverlapped = false;
for (size_t j = 0; j < combinedMatchPaths.size(); j++) {
Expand All @@ -486,27 +466,26 @@ float Taxonomer::combineMatchPaths(vector<MatchPath> & matchPaths,
- max(matchPaths[i].start, combinedMatchPaths[j].start) + 1;
if (overlappedLength < 24) {
trimMatchPath(matchPaths[i], combinedMatchPaths[j], overlappedLength);

continue;
} else {
isOverlapped = true;
break;
}
// Current path trimmed too much
if (matchPaths[i].end - matchPaths[i].start + 1 < 24) {
if (matchPaths[i].end - matchPaths[i].start + 1 < 24) { // Current path trimmed too much
isOverlapped = true;
break;
}
}
}
}
if (!isOverlapped) {
combinedMatchPaths.push_back(matchPaths[i]);
score += matchPaths[i].score;
}
}
}
// Calculate the score of combinedMatchPaths
float score = 0;
for (auto & matchPath : combinedMatchPaths) {
score += matchPath.score;
// if (score / readLength >= 1) {
// break;
// }
}
return score / readLength;
}
Expand All @@ -520,12 +499,10 @@ bool Taxonomer::isMatchPathOverlapped(const MatchPath & matchPath1,
void Taxonomer::trimMatchPath(MatchPath & path1, const MatchPath & path2, int overlapLength) {
if (path1.start < path2.start) {
path1.end = path2.start - 1;
// uint8_t lastEndHamming = GET_2_BITS(path1.matches.back()->rightEndHamming);
path1.hammingDist = max(0, path1.hammingDist - path1.endMatch->getRightPartHammingDist(overlapLength/3));
path1.score = path1.score - path1.endMatch->getRightPartScore(overlapLength/3) - (overlapLength % 3);
} else {
path1.start = path2.end + 1;
// uint8_t lastEndHamming = GET_2_BITS(path1.matches.front()->rightEndHamming >> 14);
path1.hammingDist = max(0, path1.hammingDist - path1.startMatch->getLeftPartHammingDist(overlapLength/3));
path1.score = path1.score - path1.startMatch->getLeftPartScore(overlapLength/3) - (overlapLength % 3);
}
Expand Down Expand Up @@ -558,7 +535,7 @@ void Taxonomer::remainConsecutiveMatches(const vector<const Match *> & curFrameM
// Compare curPosMatches and nextPosMatches
for (auto &curPosMatch: curPosMatches) {
for (auto &nextPosMatch: nextPosMatches) {
if (curPosMatch->isConsecutive_DNA(nextPosMatch)) {
if (isConsecutive(curPosMatch, nextPosMatch)){
linkedMatches[curPosMatch].push_back(nextPosMatch);
}
}
Expand Down Expand Up @@ -586,9 +563,12 @@ void Taxonomer::remainConsecutiveMatches(const vector<const Match *> & curFrameM
// Compare curPosMatches and nextPosMatches
for (auto &curPosMatch: curPosMatches) {
for (auto &nextPosMatch: nextPosMatches) {
if (nextPosMatch->isConsecutive_DNA(curPosMatch)) {
if (isConsecutive(nextPosMatch, curPosMatch)){
linkedMatches[curPosMatch].push_back(nextPosMatch);
}
// if (nextPosMatch->isConsecutive_DNA(curPosMatch)) {
// linkedMatches[curPosMatch].push_back(nextPosMatch);
// }
}
}

Expand Down Expand Up @@ -1052,7 +1032,7 @@ TaxonScore Taxonomer::scoreSpecies(const vector<Match> &matches,
bool Taxonomer::isConsecutive(const Match * match1, const Match * match2) {
// match1 87654321 -> 08765432
// match2 98765432 -> 08765432
return (match1->rightEndHamming >> 2) == (match2->rightEndHamming & 0x3FFF);
return (match1->dnaEncoding >> 3) == (match2->dnaEncoding & 0x1FFFFF);
}

bool Taxonomer::isConsecutive_diffFrame(const Match * match1, const Match * match2) {
Expand Down
6 changes: 3 additions & 3 deletions src/commons/Taxonomer.h
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ class Taxonomer {
void trimMatchPath(MatchPath & path1, const MatchPath & path2, int overlapLength);

void filterRedundantMatches(vector<Match> & speciesMatches,
map<TaxID, int> & taxCnt);
unordered_map<TaxID, unsigned int> & taxCnt);

depthScore DFS(const vector<const Match *> &matches, const Match * curMatchIdx,
const map<const Match *, vector<const Match *>> &linkedMatches,
Expand Down Expand Up @@ -177,9 +177,9 @@ class Taxonomer {
int queryLength,
int queryLength2);

TaxID lowerRankClassification(const map<TaxID, int> & matches, TaxID speciesID, int queryLength);
TaxID lowerRankClassification(const unordered_map<TaxID, unsigned int> & taxCnt, TaxID speciesID, int queryLength);

void getSpeciesCladeCounts(const map<TaxID, int> & taxCnt,
void getSpeciesCladeCounts(const unordered_map<TaxID, unsigned int> & taxCnt,
unordered_map<TaxID, TaxonCounts> & cladeCnt,
TaxID spciesID);

Expand Down

0 comments on commit 79e743d

Please sign in to comment.