remove unessary copies

steineggerlab · Dec 8, 2023 · 79e743d · 79e743d
1 parent 79acbc8
commit 79e743d
Show file tree

Hide file tree

Showing 5 changed files with 37 additions and 64 deletions.
diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp
@@ -331,11 +331,6 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI
                         }
                         for (int k = 0; k < currMatchNum; k++) {
                             idx = selectedMatches[k];
-                            // Check if candidateKmerInfos[idx].sequenceID is valid
-                            // if (taxId2genusId.find(candidateKmerInfos[idx].sequenceID) == taxId2genusId.end() ||
-                            //     taxId2speciesId.find(candidateKmerInfos[idx].sequenceID) == taxId2speciesId.end()) {
-                            //     cout << "Error: " << candidateKmerInfos[idx].sequenceID << " is not found in the taxonomy database." << endl;
-                            // }
                             matches[matchCnt] = {queryKmerList[j].info,
                                                  candidateKmerInfos[idx].sequenceID,
                                                  taxId2speciesId[candidateKmerInfos[idx].sequenceID],
@@ -474,10 +469,6 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI
     free(splitCheckList);
     queryKmerNum = 0;
 
-#ifdef OPENMP
-    omp_set_num_threads(threads);
-#endif
-
     totalMatchCnt += matchBuffer->startIndexOfReserve;
     return 1;
 }
@@ -520,8 +511,9 @@ void KmerMatcher::compareDna(uint64_t query,
     }
 
     // Select target k-mers that passed hamming criteria
+    uint8_t maxHamming = min(minHammingSum * 2, 7);
     for (size_t h = 0; h < size; h++) {
-        if (hammingSums[h] <= min(minHammingSum * 2, 7)) {
+        if (hammingSums[h] <= maxHamming) {
             selectedMatches.push_back(h);
             selectedHammingSum.push_back(hammingSums[h]);
             if (frame < 3) { // Frame of query k-mer

diff --git a/src/commons/LocalUtil.cpp b/src/commons/LocalUtil.cpp
@@ -35,11 +35,11 @@ void LocalUtil::splitQueryFile(std::vector<SequenceBlock> & sequences, const std
 
 int LocalUtil::getMaxCoveredLength(int queryLength) {
     if (queryLength % 3 == 2) {
-        return queryLength - 2; // 2
+        return queryLength - 2; 
     } else if (queryLength % 3 == 1) {
-        return queryLength - 4; // 4
+        return queryLength - 4; 
     } else {
-        return queryLength - 3; // 3
+        return queryLength - 3; 
     }
 }
 

diff --git a/src/commons/QueryIndexer.cpp b/src/commons/QueryIndexer.cpp
@@ -14,6 +14,7 @@ QueryIndexer::QueryIndexer(const LocalParameters & par) {
     maxRam = par.ramUsage;
     threads = par.threads;
     bytesPerKmer = sizeof(QueryKmer) + matchPerKmer * sizeof(Match);
+    // std::cout << "bytesPerKmer: " << bytesPerKmer << "\n";
     readNum_1 = 0;
     readNum_2 = 0;
     spaceNum = par.spaceMask.length() - kmerLength;
@@ -25,6 +26,7 @@ QueryIndexer::QueryIndexer(const LocalParameters & par) {
 void QueryIndexer::setAvailableRam() {
     availableRam = ((size_t) maxRam * (size_t) 1024 * 1024 * 1024)
                          - ((size_t) 128 * 1024 * 1024 * (size_t) threads);
+    // std::cout << "availableRam: " << availableRam << "\n";
 }
 
 void QueryIndexer::indexQueryFile() {
@@ -40,7 +42,6 @@ void QueryIndexer::indexQueryFile() {
             totalReadLength += kseq->entry.sequence.l;
             size_t currentKmerCnt = LocalUtil::getQueryKmerNumber<size_t>(kseq->entry.sequence.l, spaceNum);
             kmerCnt += currentKmerCnt;
-            // std::cout << "currentKmerCnt: " << kmerCnt << "\n";
 
             if (bytesPerKmer * kmerCnt + ((size_t) 200 * seqCnt) > availableRam) {
                 querySplits.emplace_back(start, readNum_1 - 1, kmerCnt - currentKmerCnt);
@@ -72,7 +73,6 @@ void QueryIndexer::indexQueryFile() {
                 seqCnt_1++;
                 totalReadLength += kseq_1->entry.sequence.l;
                 currentKmerCnt = LocalUtil::getQueryKmerNumber<size_t>(kseq_1->entry.sequence.l, spaceNum);
-                kmerCnt += currentKmerCnt;
             } else {
                 end = true;
             }
@@ -82,7 +82,7 @@ void QueryIndexer::indexQueryFile() {
                 seqCnt_2++;
                 totalReadLength += kseq_2->entry.sequence.l;
                 currentKmerCnt += LocalUtil::getQueryKmerNumber<size_t>(kseq_2->entry.sequence.l, spaceNum);
-                kmerCnt += currentKmerCnt;
+
             } else {
                 end = true;
             }
@@ -92,6 +92,7 @@ void QueryIndexer::indexQueryFile() {
                 EXIT(EXIT_FAILURE);
             }
 
+            kmerCnt += currentKmerCnt;
             if (bytesPerKmer * kmerCnt + ((size_t) 200 * seqCnt_1) > availableRam) {
                 querySplits.emplace_back(start, readNum_1 - 1, kmerCnt - currentKmerCnt);
                 kmerCnt = currentKmerCnt;

diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp
@@ -89,13 +89,6 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery,
                                 const Match *matchList,
                                 vector<Query> & queryList,
                                 const LocalParameters &par) {
-
-//    if (true) {
-//        cout << "# " << currentQuery << " " << queryList[currentQuery].name << endl;
-//        for (size_t i = offset; i < end + 1; i++) {
-//            cout << matchList[i].targetId << " " << matchList[i].qInfo.frame << " " << matchList[i].qInfo.pos << " " << int(matchList[i].hamming) <<  " "  << int(matchList[i].redundancy) << endl;
-//        }
-//    }
     // Get the best species for current query
     vector<Match> speciesMatches;
     speciesMatches.reserve(end - offset + 1);
@@ -109,14 +102,6 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery,
                                              queryList[currentQuery].queryLength);
     }
 
-//    if (true) {
-//        cout << "# " << currentQuery << " " << queryList[currentQuery].name << " filtered\n";
-//        for (size_t i = 0; i < genusMatches.size(); i++) {
-//           cout << genusMatches[i].targetId << " " << genusMatches[i].qInfo.frame << " " << genusMatches[i].qInfo.pos << " " << int(genusMatches[i].hamming) <<  " "  << int(genusMatches[i].redundancy) << endl;
-//         }
-//        cout << "Genus score: " << genusScore.score << "\n";
-//    }
-
     // If there is no proper species for current query, it is un-classified.
     if (speciesScore.score == 0 || speciesScore.score < par.minScore) {
         queryList[currentQuery].isClassified = false;
@@ -138,8 +123,12 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery,
         return;
     }
 
-    // Filter redundant matches  
-    filterRedundantMatches(speciesMatches, queryList[currentQuery].taxCnt);
+    // Filter redundant matches
+    unordered_map<TaxID, unsigned int> taxCnt;
+    filterRedundantMatches(speciesMatches, taxCnt);
+    for (auto & tax : taxCnt) {
+        queryList[currentQuery].taxCnt[tax.first] = tax.second;    
+    }
 
     // If score is not enough, classify to the parent of the selected species
     if (speciesScore.score < par.minSpScore) {
@@ -153,7 +142,7 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery,
     }
 
     // Lower rank classification
-    TaxID result = lowerRankClassification(queryList[currentQuery].taxCnt,
+    TaxID result = lowerRankClassification(taxCnt,
                                          speciesScore.taxId,
                                           queryList[currentQuery].queryLength + queryList[currentQuery].queryLength2);
 
@@ -164,21 +153,10 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery,
     queryList[currentQuery].coverage = speciesScore.coverage;
     queryList[currentQuery].hammingDist = speciesScore.hammingDist;
     queryList[currentQuery].newSpecies = false;
-//    if (par.printLog) {
-//        cout << "# " << currentQuery << endl;
-//        for (size_t i = 0; i < genusMatches.size(); i++) {
-//            cout << i << " " << genusMatches[i].qInfo.pos << " " <<
-//            genusMatches[i].targetId << " " << int(genusMatches[i].hamming) << endl;
-//        }
-//        cout << "Score: " << speciesScore.score << "  " << selectedSpecies << " "
-//             << taxonomy->getString(taxonomy->taxonNode(selectedSpecies)->rankIdx)
-//
-//             << endl;
-//    }
 }
 
 void Taxonomer::filterRedundantMatches(vector<Match> & speciesMatches,
-                                       map<TaxID, int > & taxCnt) {
+                                        unordered_map<TaxID, unsigned int> & taxCnt) {
     // Sort matches by the coordinate on the query
     sort(speciesMatches.begin(), speciesMatches.end(),
          [](const Match & a, const Match & b) { return a.qInfo.pos < b.qInfo.pos; });
@@ -204,7 +182,7 @@ void Taxonomer::filterRedundantMatches(vector<Match> & speciesMatches,
     }
 }
 
-TaxID Taxonomer::lowerRankClassification(const map<TaxID, int> & taxCnt, TaxID spTaxId, int queryLength) {
+TaxID Taxonomer::lowerRankClassification(const unordered_map<TaxID, unsigned int> & taxCnt, TaxID spTaxId, int queryLength) {
     unsigned int maxCnt = (queryLength - 1)/denominator + 1;
     unordered_map<TaxID, TaxonCounts> cladeCnt;
     getSpeciesCladeCounts(taxCnt, cladeCnt, spTaxId);
@@ -225,7 +203,7 @@ TaxID Taxonomer::lowerRankClassification(const map<TaxID, int> & taxCnt, TaxID s
     }
 }
 
-void Taxonomer::getSpeciesCladeCounts(const map<TaxID, int> &taxCnt,
+void Taxonomer::getSpeciesCladeCounts(const unordered_map<TaxID, unsigned int> &taxCnt,
                                        unordered_map<TaxID, TaxonCounts> & cladeCount,
                                        TaxID speciesTaxID) {
     for (auto it = taxCnt.begin(); it != taxCnt.end(); ++it) {
@@ -475,9 +453,11 @@ float Taxonomer::combineMatchPaths(vector<MatchPath> & matchPaths,
     // 1. Add the matchPath with the highest score to combinedMatchPaths
     // 2. Add the matchPath with the highest score that is not overlapped with the matchPath in combinedMatchPaths
     // 3. Repeat 2 until no matchPath can be added
+    float score = 0;
     for (size_t i = 0; i < matchPaths.size(); i++) {
         if (combinedMatchPaths.empty()) {
             combinedMatchPaths.push_back(matchPaths[i]);
+            score += matchPaths[i].score;
         } else {
             bool isOverlapped = false;
             for (size_t j = 0; j < combinedMatchPaths.size(); j++) {
@@ -486,27 +466,26 @@ float Taxonomer::combineMatchPaths(vector<MatchPath> & matchPaths,
                                             - max(matchPaths[i].start, combinedMatchPaths[j].start) + 1;
                     if (overlappedLength < 24) {
                         trimMatchPath(matchPaths[i], combinedMatchPaths[j], overlappedLength);
+
                         continue;
                     } else {
                         isOverlapped = true;
                         break;
                     }
-                    // Current path trimmed too much 
-                    if (matchPaths[i].end - matchPaths[i].start + 1 < 24) {
+                    if (matchPaths[i].end - matchPaths[i].start + 1 < 24) { // Current path trimmed too much 
                         isOverlapped = true;
                         break;
-                    }
+                    }  
                 } 
             }
             if (!isOverlapped) {
                 combinedMatchPaths.push_back(matchPaths[i]);
+                score += matchPaths[i].score;           
             }
         }
-    }
-    // Calculate the score of combinedMatchPaths
-    float score = 0;
-    for (auto & matchPath : combinedMatchPaths) {
-        score += matchPath.score;
+        // if (score / readLength >= 1) {
+        //     break;
+        // }
     }
     return score / readLength;
 }
@@ -520,12 +499,10 @@ bool Taxonomer::isMatchPathOverlapped(const MatchPath & matchPath1,
 void Taxonomer::trimMatchPath(MatchPath & path1, const MatchPath & path2, int overlapLength) {
     if (path1.start < path2.start) { 
         path1.end = path2.start - 1;
-        // uint8_t lastEndHamming = GET_2_BITS(path1.matches.back()->rightEndHamming);
         path1.hammingDist = max(0, path1.hammingDist - path1.endMatch->getRightPartHammingDist(overlapLength/3));
         path1.score = path1.score - path1.endMatch->getRightPartScore(overlapLength/3) - (overlapLength % 3);
     } else {
         path1.start = path2.end + 1;
-        // uint8_t lastEndHamming = GET_2_BITS(path1.matches.front()->rightEndHamming >> 14);
         path1.hammingDist = max(0, path1.hammingDist - path1.startMatch->getLeftPartHammingDist(overlapLength/3));
         path1.score = path1.score - path1.startMatch->getLeftPartScore(overlapLength/3) - (overlapLength % 3);
     }
@@ -558,7 +535,7 @@ void Taxonomer::remainConsecutiveMatches(const vector<const Match *> & curFrameM
                 // Compare curPosMatches and nextPosMatches
                 for (auto &curPosMatch: curPosMatches) {
                     for (auto &nextPosMatch: nextPosMatches) {
-                        if (curPosMatch->isConsecutive_DNA(nextPosMatch)) {
+                        if (isConsecutive(curPosMatch, nextPosMatch)){
                             linkedMatches[curPosMatch].push_back(nextPosMatch);
                         }
                     }
@@ -586,9 +563,12 @@ void Taxonomer::remainConsecutiveMatches(const vector<const Match *> & curFrameM
                 // Compare curPosMatches and nextPosMatches
                 for (auto &curPosMatch: curPosMatches) {
                     for (auto &nextPosMatch: nextPosMatches) {
-                        if (nextPosMatch->isConsecutive_DNA(curPosMatch)) {
+                        if (isConsecutive(nextPosMatch, curPosMatch)){
                             linkedMatches[curPosMatch].push_back(nextPosMatch);
                         }
+                        // if (nextPosMatch->isConsecutive_DNA(curPosMatch)) {
+                        //     linkedMatches[curPosMatch].push_back(nextPosMatch);
+                        // }
                     }
                 }
 
@@ -1052,7 +1032,7 @@ TaxonScore Taxonomer::scoreSpecies(const vector<Match> &matches,
 bool Taxonomer::isConsecutive(const Match * match1, const Match * match2) {
     // match1 87654321 -> 08765432
     // match2 98765432 -> 08765432
-    return (match1->rightEndHamming >> 2) == (match2->rightEndHamming & 0x3FFF);
+    return (match1->dnaEncoding >> 3) == (match2->dnaEncoding & 0x1FFFFF);
 }
 
 bool Taxonomer::isConsecutive_diffFrame(const Match * match1, const Match * match2) {

diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h
@@ -112,7 +112,7 @@ class Taxonomer {
     void trimMatchPath(MatchPath & path1, const MatchPath & path2, int overlapLength);
 
     void filterRedundantMatches(vector<Match> & speciesMatches,
-                                map<TaxID, int> & taxCnt);
+                                unordered_map<TaxID, unsigned int> & taxCnt);
 
     depthScore DFS(const vector<const Match *> &matches, const Match * curMatchIdx,
                    const map<const Match *, vector<const Match *>> &linkedMatches,
@@ -177,9 +177,9 @@ class Taxonomer {
                             int queryLength,
                             int queryLength2);
 
-    TaxID lowerRankClassification(const map<TaxID, int> & matches, TaxID speciesID, int queryLength);
+    TaxID lowerRankClassification(const unordered_map<TaxID, unsigned int> & taxCnt, TaxID speciesID, int queryLength);
 
-    void getSpeciesCladeCounts(const map<TaxID, int> & taxCnt,
+    void getSpeciesCladeCounts(const unordered_map<TaxID, unsigned int> & taxCnt,
                                unordered_map<TaxID, TaxonCounts> & cladeCnt,
                                TaxID spciesID);