diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp index 9c87d527..7627f0aa 100644 --- a/src/commons/KmerMatcher.cpp +++ b/src/commons/KmerMatcher.cpp @@ -331,11 +331,6 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI } for (int k = 0; k < currMatchNum; k++) { idx = selectedMatches[k]; - // Check if candidateKmerInfos[idx].sequenceID is valid - // if (taxId2genusId.find(candidateKmerInfos[idx].sequenceID) == taxId2genusId.end() || - // taxId2speciesId.find(candidateKmerInfos[idx].sequenceID) == taxId2speciesId.end()) { - // cout << "Error: " << candidateKmerInfos[idx].sequenceID << " is not found in the taxonomy database." << endl; - // } matches[matchCnt] = {queryKmerList[j].info, candidateKmerInfos[idx].sequenceID, taxId2speciesId[candidateKmerInfos[idx].sequenceID], @@ -474,10 +469,6 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI free(splitCheckList); queryKmerNum = 0; -#ifdef OPENMP - omp_set_num_threads(threads); -#endif - totalMatchCnt += matchBuffer->startIndexOfReserve; return 1; } @@ -520,8 +511,9 @@ void KmerMatcher::compareDna(uint64_t query, } // Select target k-mers that passed hamming criteria + uint8_t maxHamming = min(minHammingSum * 2, 7); for (size_t h = 0; h < size; h++) { - if (hammingSums[h] <= min(minHammingSum * 2, 7)) { + if (hammingSums[h] <= maxHamming) { selectedMatches.push_back(h); selectedHammingSum.push_back(hammingSums[h]); if (frame < 3) { // Frame of query k-mer diff --git a/src/commons/LocalUtil.cpp b/src/commons/LocalUtil.cpp index d981f238..6b87319b 100644 --- a/src/commons/LocalUtil.cpp +++ b/src/commons/LocalUtil.cpp @@ -35,11 +35,11 @@ void LocalUtil::splitQueryFile(std::vector & sequences, const std int LocalUtil::getMaxCoveredLength(int queryLength) { if (queryLength % 3 == 2) { - return queryLength - 2; // 2 + return queryLength - 2; } else if (queryLength % 3 == 1) { - return queryLength - 4; // 4 + return queryLength - 4; } else { - return queryLength - 3; // 3 + return queryLength - 3; } } diff --git a/src/commons/QueryIndexer.cpp b/src/commons/QueryIndexer.cpp index 29191f8e..7846a969 100644 --- a/src/commons/QueryIndexer.cpp +++ b/src/commons/QueryIndexer.cpp @@ -14,6 +14,7 @@ QueryIndexer::QueryIndexer(const LocalParameters & par) { maxRam = par.ramUsage; threads = par.threads; bytesPerKmer = sizeof(QueryKmer) + matchPerKmer * sizeof(Match); + // std::cout << "bytesPerKmer: " << bytesPerKmer << "\n"; readNum_1 = 0; readNum_2 = 0; spaceNum = par.spaceMask.length() - kmerLength; @@ -25,6 +26,7 @@ QueryIndexer::QueryIndexer(const LocalParameters & par) { void QueryIndexer::setAvailableRam() { availableRam = ((size_t) maxRam * (size_t) 1024 * 1024 * 1024) - ((size_t) 128 * 1024 * 1024 * (size_t) threads); + // std::cout << "availableRam: " << availableRam << "\n"; } void QueryIndexer::indexQueryFile() { @@ -40,7 +42,6 @@ void QueryIndexer::indexQueryFile() { totalReadLength += kseq->entry.sequence.l; size_t currentKmerCnt = LocalUtil::getQueryKmerNumber(kseq->entry.sequence.l, spaceNum); kmerCnt += currentKmerCnt; - // std::cout << "currentKmerCnt: " << kmerCnt << "\n"; if (bytesPerKmer * kmerCnt + ((size_t) 200 * seqCnt) > availableRam) { querySplits.emplace_back(start, readNum_1 - 1, kmerCnt - currentKmerCnt); @@ -72,7 +73,6 @@ void QueryIndexer::indexQueryFile() { seqCnt_1++; totalReadLength += kseq_1->entry.sequence.l; currentKmerCnt = LocalUtil::getQueryKmerNumber(kseq_1->entry.sequence.l, spaceNum); - kmerCnt += currentKmerCnt; } else { end = true; } @@ -82,7 +82,7 @@ void QueryIndexer::indexQueryFile() { seqCnt_2++; totalReadLength += kseq_2->entry.sequence.l; currentKmerCnt += LocalUtil::getQueryKmerNumber(kseq_2->entry.sequence.l, spaceNum); - kmerCnt += currentKmerCnt; + } else { end = true; } @@ -92,6 +92,7 @@ void QueryIndexer::indexQueryFile() { EXIT(EXIT_FAILURE); } + kmerCnt += currentKmerCnt; if (bytesPerKmer * kmerCnt + ((size_t) 200 * seqCnt_1) > availableRam) { querySplits.emplace_back(start, readNum_1 - 1, kmerCnt - currentKmerCnt); kmerCnt = currentKmerCnt; diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index cff5d5c1..63fb5731 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -89,13 +89,6 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, const Match *matchList, vector & queryList, const LocalParameters &par) { - -// if (true) { -// cout << "# " << currentQuery << " " << queryList[currentQuery].name << endl; -// for (size_t i = offset; i < end + 1; i++) { -// cout << matchList[i].targetId << " " << matchList[i].qInfo.frame << " " << matchList[i].qInfo.pos << " " << int(matchList[i].hamming) << " " << int(matchList[i].redundancy) << endl; -// } -// } // Get the best species for current query vector speciesMatches; speciesMatches.reserve(end - offset + 1); @@ -109,14 +102,6 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, queryList[currentQuery].queryLength); } -// if (true) { -// cout << "# " << currentQuery << " " << queryList[currentQuery].name << " filtered\n"; -// for (size_t i = 0; i < genusMatches.size(); i++) { -// cout << genusMatches[i].targetId << " " << genusMatches[i].qInfo.frame << " " << genusMatches[i].qInfo.pos << " " << int(genusMatches[i].hamming) << " " << int(genusMatches[i].redundancy) << endl; -// } -// cout << "Genus score: " << genusScore.score << "\n"; -// } - // If there is no proper species for current query, it is un-classified. if (speciesScore.score == 0 || speciesScore.score < par.minScore) { queryList[currentQuery].isClassified = false; @@ -138,8 +123,12 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, return; } - // Filter redundant matches - filterRedundantMatches(speciesMatches, queryList[currentQuery].taxCnt); + // Filter redundant matches + unordered_map taxCnt; + filterRedundantMatches(speciesMatches, taxCnt); + for (auto & tax : taxCnt) { + queryList[currentQuery].taxCnt[tax.first] = tax.second; + } // If score is not enough, classify to the parent of the selected species if (speciesScore.score < par.minSpScore) { @@ -153,7 +142,7 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, } // Lower rank classification - TaxID result = lowerRankClassification(queryList[currentQuery].taxCnt, + TaxID result = lowerRankClassification(taxCnt, speciesScore.taxId, queryList[currentQuery].queryLength + queryList[currentQuery].queryLength2); @@ -164,21 +153,10 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, queryList[currentQuery].coverage = speciesScore.coverage; queryList[currentQuery].hammingDist = speciesScore.hammingDist; queryList[currentQuery].newSpecies = false; -// if (par.printLog) { -// cout << "# " << currentQuery << endl; -// for (size_t i = 0; i < genusMatches.size(); i++) { -// cout << i << " " << genusMatches[i].qInfo.pos << " " << -// genusMatches[i].targetId << " " << int(genusMatches[i].hamming) << endl; -// } -// cout << "Score: " << speciesScore.score << " " << selectedSpecies << " " -// << taxonomy->getString(taxonomy->taxonNode(selectedSpecies)->rankIdx) -// -// << endl; -// } } void Taxonomer::filterRedundantMatches(vector & speciesMatches, - map & taxCnt) { + unordered_map & taxCnt) { // Sort matches by the coordinate on the query sort(speciesMatches.begin(), speciesMatches.end(), [](const Match & a, const Match & b) { return a.qInfo.pos < b.qInfo.pos; }); @@ -204,7 +182,7 @@ void Taxonomer::filterRedundantMatches(vector & speciesMatches, } } -TaxID Taxonomer::lowerRankClassification(const map & taxCnt, TaxID spTaxId, int queryLength) { +TaxID Taxonomer::lowerRankClassification(const unordered_map & taxCnt, TaxID spTaxId, int queryLength) { unsigned int maxCnt = (queryLength - 1)/denominator + 1; unordered_map cladeCnt; getSpeciesCladeCounts(taxCnt, cladeCnt, spTaxId); @@ -225,7 +203,7 @@ TaxID Taxonomer::lowerRankClassification(const map & taxCnt, TaxID s } } -void Taxonomer::getSpeciesCladeCounts(const map &taxCnt, +void Taxonomer::getSpeciesCladeCounts(const unordered_map &taxCnt, unordered_map & cladeCount, TaxID speciesTaxID) { for (auto it = taxCnt.begin(); it != taxCnt.end(); ++it) { @@ -475,9 +453,11 @@ float Taxonomer::combineMatchPaths(vector & matchPaths, // 1. Add the matchPath with the highest score to combinedMatchPaths // 2. Add the matchPath with the highest score that is not overlapped with the matchPath in combinedMatchPaths // 3. Repeat 2 until no matchPath can be added + float score = 0; for (size_t i = 0; i < matchPaths.size(); i++) { if (combinedMatchPaths.empty()) { combinedMatchPaths.push_back(matchPaths[i]); + score += matchPaths[i].score; } else { bool isOverlapped = false; for (size_t j = 0; j < combinedMatchPaths.size(); j++) { @@ -486,27 +466,26 @@ float Taxonomer::combineMatchPaths(vector & matchPaths, - max(matchPaths[i].start, combinedMatchPaths[j].start) + 1; if (overlappedLength < 24) { trimMatchPath(matchPaths[i], combinedMatchPaths[j], overlappedLength); + continue; } else { isOverlapped = true; break; } - // Current path trimmed too much - if (matchPaths[i].end - matchPaths[i].start + 1 < 24) { + if (matchPaths[i].end - matchPaths[i].start + 1 < 24) { // Current path trimmed too much isOverlapped = true; break; - } + } } } if (!isOverlapped) { combinedMatchPaths.push_back(matchPaths[i]); + score += matchPaths[i].score; } } - } - // Calculate the score of combinedMatchPaths - float score = 0; - for (auto & matchPath : combinedMatchPaths) { - score += matchPath.score; + // if (score / readLength >= 1) { + // break; + // } } return score / readLength; } @@ -520,12 +499,10 @@ bool Taxonomer::isMatchPathOverlapped(const MatchPath & matchPath1, void Taxonomer::trimMatchPath(MatchPath & path1, const MatchPath & path2, int overlapLength) { if (path1.start < path2.start) { path1.end = path2.start - 1; - // uint8_t lastEndHamming = GET_2_BITS(path1.matches.back()->rightEndHamming); path1.hammingDist = max(0, path1.hammingDist - path1.endMatch->getRightPartHammingDist(overlapLength/3)); path1.score = path1.score - path1.endMatch->getRightPartScore(overlapLength/3) - (overlapLength % 3); } else { path1.start = path2.end + 1; - // uint8_t lastEndHamming = GET_2_BITS(path1.matches.front()->rightEndHamming >> 14); path1.hammingDist = max(0, path1.hammingDist - path1.startMatch->getLeftPartHammingDist(overlapLength/3)); path1.score = path1.score - path1.startMatch->getLeftPartScore(overlapLength/3) - (overlapLength % 3); } @@ -558,7 +535,7 @@ void Taxonomer::remainConsecutiveMatches(const vector & curFrameM // Compare curPosMatches and nextPosMatches for (auto &curPosMatch: curPosMatches) { for (auto &nextPosMatch: nextPosMatches) { - if (curPosMatch->isConsecutive_DNA(nextPosMatch)) { + if (isConsecutive(curPosMatch, nextPosMatch)){ linkedMatches[curPosMatch].push_back(nextPosMatch); } } @@ -586,9 +563,12 @@ void Taxonomer::remainConsecutiveMatches(const vector & curFrameM // Compare curPosMatches and nextPosMatches for (auto &curPosMatch: curPosMatches) { for (auto &nextPosMatch: nextPosMatches) { - if (nextPosMatch->isConsecutive_DNA(curPosMatch)) { + if (isConsecutive(nextPosMatch, curPosMatch)){ linkedMatches[curPosMatch].push_back(nextPosMatch); } + // if (nextPosMatch->isConsecutive_DNA(curPosMatch)) { + // linkedMatches[curPosMatch].push_back(nextPosMatch); + // } } } @@ -1052,7 +1032,7 @@ TaxonScore Taxonomer::scoreSpecies(const vector &matches, bool Taxonomer::isConsecutive(const Match * match1, const Match * match2) { // match1 87654321 -> 08765432 // match2 98765432 -> 08765432 - return (match1->rightEndHamming >> 2) == (match2->rightEndHamming & 0x3FFF); + return (match1->dnaEncoding >> 3) == (match2->dnaEncoding & 0x1FFFFF); } bool Taxonomer::isConsecutive_diffFrame(const Match * match1, const Match * match2) { diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h index a9bc0486..0279fe38 100644 --- a/src/commons/Taxonomer.h +++ b/src/commons/Taxonomer.h @@ -112,7 +112,7 @@ class Taxonomer { void trimMatchPath(MatchPath & path1, const MatchPath & path2, int overlapLength); void filterRedundantMatches(vector & speciesMatches, - map & taxCnt); + unordered_map & taxCnt); depthScore DFS(const vector &matches, const Match * curMatchIdx, const map> &linkedMatches, @@ -177,9 +177,9 @@ class Taxonomer { int queryLength, int queryLength2); - TaxID lowerRankClassification(const map & matches, TaxID speciesID, int queryLength); + TaxID lowerRankClassification(const unordered_map & taxCnt, TaxID speciesID, int queryLength); - void getSpeciesCladeCounts(const map & taxCnt, + void getSpeciesCladeCounts(const unordered_map & taxCnt, unordered_map & cladeCnt, TaxID spciesID);