From cd80a7f76baa2786583c45ef5a56d7a86dba85b9 Mon Sep 17 00:00:00 2001
From: Jaebeom Kim
Date: Wed, 6 Dec 2023 16:30:32 +0900
Subject: [PATCH 1/5] build macOS first

---
 azure-pipelines.yml | 52 ++++++++++++++++++++++-----------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index d19f87b6..87a962d2 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -7,6 +7,32 @@ variables:
     regression: 1

 jobs:
+  - job: build_macos_11
+    displayName: macOS 11
+    pool:
+      vmImage: 'macos-11'
+    steps:
+      - checkout: self
+        submodules: true
+      - script: |
+          rustup update
+          rustup target install x86_64-apple-darwin
+          rustup target install aarch64-apple-darwin
+        displayName: Install Rust Toolchain
+      - script: |
+          cd ${BUILD_SOURCESDIRECTORY}
+          ./util/build_osx.sh . build metabuli
+        displayName: Build Metabuli
+      - script: |
+          cd ${BUILD_SOURCESDIRECTORY}
+          ./util/Metabuli-regression/run_regression.sh ./build/metabuli examples
+        displayName: Run Regression Suite
+        condition: eq(variables['regression'], 1)
+      - task: PublishPipelineArtifact@0
+        inputs:
+          targetPath: $(Build.SourcesDirectory)/build/metabuli
+          artifactName: metabuli-darwin-universal
+
   - job: build_ubuntu_2004
     displayName: Ubuntu 2004
     pool:
@@ -121,32 +147,6 @@ jobs:
           targetPath: $(Build.SourcesDirectory)/build/src/metabuli
           artifactName: metabuli-linux-$(SIMD)

-  - job: build_macos_11
-    displayName: macOS 11
-    pool:
-      vmImage: 'macos-11'
-    steps:
-      - checkout: self
-        submodules: true
-      - script: |
-          rustup update
-          rustup target install x86_64-apple-darwin
-          rustup target install aarch64-apple-darwin
-        displayName: Install Rust Toolchain
-      - script: |
-          cd ${BUILD_SOURCESDIRECTORY}
-          ./util/build_osx.sh . build metabuli
-        displayName: Build Metabuli
-      - script: |
-          cd ${BUILD_SOURCESDIRECTORY}
-          ./util/Metabuli-regression/run_regression.sh ./build/metabuli examples
-        displayName: Run Regression Suite
-        condition: eq(variables['regression'], 1)
-      - task: PublishPipelineArtifact@0
-        inputs:
-          targetPath: $(Build.SourcesDirectory)/build/metabuli
-          artifactName: metabuli-darwin-universal
-
   - job: upload_artifacts
     displayName: Upload Artifacts
     condition: and(succeeded(), ne(variables['Build.Reason'], 'PullRequest'))

From afd72db6b045d024282e030a44eb58cda90c52ed Mon Sep 17 00:00:00 2001
From: Jaebeom Kim
Date: Wed, 6 Dec 2023 17:54:10 +0900
Subject: [PATCH 2/5] update README, undo the change in azure-pipelines.yml

---
 README.md           | 20 +++++++++++------
 azure-pipelines.yml | 52 ++++++++++++++++++++++-----------------------
 2 files changed, 40 insertions(+), 32 deletions(-)

diff --git a/README.md b/README.md
index 38f8bf09..49d3525f 100644
--- a/README.md
+++ b/README.md
@@ -49,22 +49,26 @@ The built binary can be found in `./build/src`.

 ## Pre-built databases
 You can download [pre-built databases](https://metabuli.steineggerlab.workers.dev/) using `databases` workflow.
 ```
+Usage:
+metabuli databases DB_NAME OUTDIR tmp
+
 # RefSeq Complete/Chromosome (115.6 GiB)
 # - Complete Genome or Chromosome level assemblies of virus and prokaryotes in RefSeq (2023-04-04) and human genome (GRCh38.p14)
-metabuli databases RefSeq DBDIR tmp
+metabuli databases RefSeq OUTDIR tmp

 # RefSeq Releases 217 (480.5 GiB)
 # - Viral and prokaryotic genomes of RefSeq release 217 and human genome (GRCh38.p14)
-metabuli databases RefSeq217 DBDIR tmp
+metabuli databases RefSeq217 OUTDIR tmp

 # GTDB 207 (81.2 GiB)
 # - Complete Genome or Chromosome level assemblies in GTDB207 (CheckM Completeness > 90, CheckM Contamination < 5) with GTDB taxonomy.
-metabuli databases GTDB207 DBDIR tmp
+metabuli databases GTDB207 OUTDIR tmp

 # RefSeq Virus (1.5 GiB)
 # - Viral RefSeq genomes and five SARS-CoV-2 variants (alpha, beta, delta, gamma, and omicron)
-metabuli databases RefSeq_virus DBDIR tmp
+metabuli databases RefSeq_virus OUTDIR tmp
 ```
+Downloaded files are stored in the `OUTDIR/DB_NAME` directory, which can be provided to the `classify` module as `DBDIR`.

 ## Classification

@@ -254,7 +258,11 @@ Classifying RNA-seq reads from a COVID-19 patient to identify the culprit varian
 The whole process must take less than 10 mins using a personal machine.

 #### 1. Download RefSeq Virus DB (1.5 GiB)
-`metabuli databases RefSeq_virus refseq_virus tmp`
+```
+metabuli databases RefSeq_virus OUTDIR tmp
+```
+`OUTDIR/refseq_virus` is the `DBDIR` in step 3.
+
 #### 2. Download an RNA-seq result (SRR14484345)
 Option 1. Download using SRA Toolkit
@@ -270,7 +278,7 @@ cat SRR14484345.fastq | paste - - - - - - - - | tee >(cut -f 1-4 | tr "\t" "\n"
 #### 3. Classify the reads using metabuli
 ```
- metabuli classify SRR14484345_1.fq SRR14484345_2.fq refseq_virus RESULT_DIR JOB_ID --max-ram RAM_SIZE
+ metabuli classify SRR14484345_1.fq SRR14484345_2.fq DBDIR RESULT_DIR JOB_ID --max-ram RAM_SIZE
 ```
 #### 4. Check RESULT_DIR/JOB_ID_report.tsv
 Find a section like the example below

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 87a962d2..d19f87b6 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -7,32 +7,6 @@ variables:
     regression: 1

 jobs:
-  - job: build_macos_11
-    displayName: macOS 11
-    pool:
-      vmImage: 'macos-11'
-    steps:
-      - checkout: self
-        submodules: true
-      - script: |
-          rustup update
-          rustup target install x86_64-apple-darwin
-          rustup target install aarch64-apple-darwin
-        displayName: Install Rust Toolchain
-      - script: |
-          cd ${BUILD_SOURCESDIRECTORY}
-          ./util/build_osx.sh . build metabuli
-        displayName: Build Metabuli
-      - script: |
-          cd ${BUILD_SOURCESDIRECTORY}
-          ./util/Metabuli-regression/run_regression.sh ./build/metabuli examples
-        displayName: Run Regression Suite
-        condition: eq(variables['regression'], 1)
-      - task: PublishPipelineArtifact@0
-        inputs:
-          targetPath: $(Build.SourcesDirectory)/build/metabuli
-          artifactName: metabuli-darwin-universal
-
   - job: build_ubuntu_2004
     displayName: Ubuntu 2004
     pool:
@@ -147,6 +121,32 @@ jobs:
           targetPath: $(Build.SourcesDirectory)/build/src/metabuli
           artifactName: metabuli-linux-$(SIMD)

+  - job: build_macos_11
+    displayName: macOS 11
+    pool:
+      vmImage: 'macos-11'
+    steps:
+      - checkout: self
+        submodules: true
+      - script: |
+          rustup update
+          rustup target install x86_64-apple-darwin
+          rustup target install aarch64-apple-darwin
+        displayName: Install Rust Toolchain
+      - script: |
+          cd ${BUILD_SOURCESDIRECTORY}
+          ./util/build_osx.sh . build metabuli
+        displayName: Build Metabuli
+      - script: |
+          cd ${BUILD_SOURCESDIRECTORY}
+          ./util/Metabuli-regression/run_regression.sh ./build/metabuli examples
+        displayName: Run Regression Suite
+        condition: eq(variables['regression'], 1)
+      - task: PublishPipelineArtifact@0
+        inputs:
+          targetPath: $(Build.SourcesDirectory)/build/metabuli
+          artifactName: metabuli-darwin-universal
+
 - job: upload_artifacts
   displayName: Upload Artifacts
   condition: and(succeeded(), ne(variables['Build.Reason'], 'PullRequest'))
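The next patch slims `MatchPath` so that each path stores only its first and last match instead of every match along it; after path construction, the only per-match consumer, `trimMatchPath`, touches nothing but the two endpoints. A minimal before/after sketch of the struct, with field names taken from the diff, `Match` left opaque, and everything not shown in the diff simplified:

```cpp
#include <vector>

struct Match;  // Metabuli's match record; opaque here

// Before PATCH 3/5: each path owns every match on it.
struct MatchPathOld {
    int start, end;        // coordinates on the query
    float score;
    int hammingDist;
    std::vector<const Match *> matches;  // O(path length) storage per path
};

// After PATCH 3/5: trimming only inspects the outermost matches,
// so two pointers replace the vector.
struct MatchPathNew {
    int start, end;
    float score;
    int hammingDist;
    const Match * startMatch;
    const Match * endMatch;
};
```

Dropping the vector should save one heap allocation per candidate path, and candidate paths are built per read frame, so this is on the hot path of classification.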
From b5dde073a96770faaa3e700c76dadaed8cf2dfa5 Mon Sep 17 00:00:00 2001
From: Jaebeom Kim
Date: Fri, 8 Dec 2023 02:30:25 +0900
Subject: [PATCH 3/5] don't store all matches of paths, just store the first and last

---
 README.md                  |   4 +-
 src/commons/Classifier.cpp |   2 +-
 src/commons/Taxonomer.cpp  | 106 +++++++------------------------------
 src/commons/Taxonomer.h    |  16 +++---
 src/metabuli.cpp           |  35 ------------
 5 files changed, 30 insertions(+), 133 deletions(-)

diff --git a/README.md b/README.md
index 49d3525f..2d8ebd2d 100644
--- a/README.md
+++ b/README.md
@@ -261,8 +261,6 @@ The whole process must take less than 10 mins using a personal machine.
 ```
 metabuli databases RefSeq_virus OUTDIR tmp
 ```
-`OUTDIR/refseq_virus` is the `DBDIR` in step 3.
-
 #### 2. Download an RNA-seq result (SRR14484345)
 Option 1. Download using SRA Toolkit
@@ -278,7 +276,7 @@ cat SRR14484345.fastq | paste - - - - - - - - | tee >(cut -f 1-4 | tr "\t" "\n"
 #### 3. Classify the reads using metabuli
 ```
- metabuli classify SRR14484345_1.fq SRR14484345_2.fq DBDIR RESULT_DIR JOB_ID --max-ram RAM_SIZE
+ metabuli classify SRR14484345_1.fq SRR14484345_2.fq OUTDIR/refseq_virus RESULT_DIR JOB_ID --max-ram RAM_SIZE
 ```
 #### 4. Check RESULT_DIR/JOB_ID_report.tsv
 Find a section like the example below

diff --git a/src/commons/Classifier.cpp b/src/commons/Classifier.cpp
index 70a6d59a..fe4a4b74 100644
--- a/src/commons/Classifier.cpp
+++ b/src/commons/Classifier.cpp
@@ -103,7 +103,7 @@ void Classifier::startClassify(const LocalParameters &par) {
         kmerMatcher->sortMatches(&matchBuffer);

         // Classify queries based on the matches.
-        // omp_set_num_threads(1);
+        // omp_set_num_threads(1);
         taxonomer->assignTaxonomy(matchBuffer.buffer, matchBuffer.startIndexOfReserve, queryList, par);
         processedSeqCnt += queryReadSplit[splitIdx].end - queryReadSplit[splitIdx].start;
         cout << "The number of processed sequences: " << processedSeqCnt << " (" << (double) processedSeqCnt / (double) numOfSeq << ")" << endl;

diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp
index 76e2830c..f139be2d 100644
--- a/src/commons/Taxonomer.cpp
+++ b/src/commons/Taxonomer.cpp
@@ -412,13 +412,11 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector<const Match *> & speciesMatch
         }
         matchPaths.clear();
     }
-
     // If there are no meaningful species
     if (species2score.empty()) {
         bestScore.score = 0;
         return bestScore;
     }
-
     vector<TaxID> maxSpecies;
     for (auto & spScore : species2score) {
         if (spScore.second > bestSpScore * 0.95) {
@@ -513,71 +511,23 @@ float Taxonomer::combineMatchPaths(vector<MatchPath> & matchPaths,
     return score / readLength;
 }

-bool Taxonomer::isMatchPathLinked(const MatchPath & matchPath1, const MatchPath & matchPath2) {
-    int overlappedLength = min(matchPath1.end, matchPath2.end) - max(matchPath1.start, matchPath2.start) + 1;
-    if (20 >= overlappedLength || overlappedLength >= 24) {
-        return false;
-    }
-    const Match * last;
-    const Match * first;
-    if (matchPath1.start < matchPath2.start) {
-        last = matchPath1.matches.back();
-        first = matchPath2.matches.front();
-    } else {
-        last = matchPath2.matches.back();
-        first = matchPath1.matches.front();
-    }
-    if (overlappedLength == 21) {
-        return isConsecutive(last, first);
-    } else {
-        return isConsecutive_diffFrame(last, first);
-    }
-    return false;
-}
-
 bool Taxonomer::isMatchPathOverlapped(const MatchPath & matchPath1,
                                       const MatchPath & matchPath2) {
     return !((matchPath1.end < matchPath2.start) || (matchPath2.end < matchPath1.start));
 }

-void Taxonomer::mergeMatchPaths(const MatchPath & source, MatchPath & target) {
-    if (source.start < target.start) {
-        target.start = source.start;
-        uint8_t lastEndHamming = GET_2_BITS(target.matches.front()->rightEndHamming);
-        target.hammingDist += source.hammingDist - (source.matches.back()->hamming - lastEndHamming);
-        target.score += source.score - source.matches.back()->getScore();
-        if (lastEndHamming == 0) {
-            target.score += 3.0f;
-        } else {
-            target.score += 2.0f - 0.5f * lastEndHamming;
-        }
-        target.matches.insert(target.matches.begin(), source.matches.begin(), source.matches.end() - 1);
-    } else {
-        target.end = source.end;
-        uint8_t lastEndHamming = GET_2_BITS(source.matches.front()->rightEndHamming >> 14);
-        target.hammingDist += source.hammingDist - (source.matches.front()->hamming - lastEndHamming);
-        target.score += source.score - source.matches.front()->getScore();
-        if (lastEndHamming == 0) {
-            target.score += 3.0f;
-        } else {
-            target.score += 2.0f - 0.5f * lastEndHamming;
-        }
-        target.matches.insert(target.matches.end(), source.matches.begin() + 1, source.matches.end());
-    }
-}
-
 void Taxonomer::trimMatchPath(MatchPath & path1, const MatchPath & path2, int overlapLength) {
     if (path1.start < path2.start) {
         path1.end = path2.start - 1;
         // uint8_t lastEndHamming = GET_2_BITS(path1.matches.back()->rightEndHamming);
-        path1.hammingDist = max(0, path1.hammingDist - path1.matches.back()->getRightPartHammingDist(overlapLength/3));
-        path1.score = path1.score - path1.matches.back()->getRightPartScore(overlapLength/3) - (overlapLength % 3);
+        path1.hammingDist = max(0, path1.hammingDist - path1.endMatch->getRightPartHammingDist(overlapLength/3));
+        path1.score = path1.score - path1.endMatch->getRightPartScore(overlapLength/3) - (overlapLength % 3);
     } else {
         path1.start = path2.end + 1;
         // uint8_t lastEndHamming = GET_2_BITS(path1.matches.front()->rightEndHamming >> 14);
-        path1.hammingDist = max(0, path1.hammingDist - path1.matches.front()->getLeftPartHammingDist(overlapLength/3));
-        path1.score = path1.score - path1.matches.front()->getLeftPartScore(overlapLength/3) - (overlapLength % 3);
+        path1.hammingDist = max(0, path1.hammingDist - path1.startMatch->getLeftPartHammingDist(overlapLength/3));
+        path1.score = path1.score - path1.startMatch->getLeftPartScore(overlapLength/3) - (overlapLength % 3);
     }
 }

@@ -649,34 +599,20 @@ void Taxonomer::remainConsecutiveMatches(const vector<const Match *> & curFrameM
             currPos = nextPos;
         }
     }
-
-    // Print linkedMatches
-//    if (par.printLog) {
-//        cout << "linkedMatches: " << endl;
-//        for (const auto &entry: linkedMatches) {
-//            cout << entry.first << ": ";
-//            for (auto &idx: entry.second) {
-//                cout << idx << " ";
-//            }
-//            cout << endl;
-//        }
-//    }

     // Iterate linkedMatches to get filteredMatches
-    //(ignore matches not enoughly consecutive)
+    // (ignore matches that are not consecutive enough)
     size_t MIN_DEPTH = minConsCnt - 1;
     if (taxonomy->IsAncestor(eukaryotaTaxId, genusId)) {
         MIN_DEPTH = minConsCntEuk - 1;
     }
     unordered_set<const Match *> used;
     unordered_map<const Match *, depthScore> idx2depthScore;
-    unordered_map<const Match *, const Match *> edges;
-
+
     for (const auto& entry : linkedMatches) {
         if (!used.count(entry.first)) {
             used.insert(entry.first);
             depthScore bestPath{};
-            const Match * bestNextMatch = nullptr;
             for (size_t j = 0; j < entry.second.size(); j++) {
                 used.insert(entry.second[j]);
                 depthScore curPath = DFS(curFrameMatches,
@@ -686,11 +622,9 @@ void Taxonomer::remainConsecutiveMatches(const vector<const Match *> & curFrameM
                                          MIN_DEPTH,
                                          used,
                                          idx2depthScore,
-                                         edges,
                                          entry.first->getScore(),
                                          entry.first->hamming);
                 if (curPath.score > bestPath.score && curPath.depth > MIN_DEPTH) {
-                    bestNextMatch = entry.second[j];
                     bestPath = curPath;
                 }
             }
@@ -698,14 +632,9 @@ void Taxonomer::remainConsecutiveMatches(const vector<const Match *> & curFrameM
             if (bestPath.depth > MIN_DEPTH) {
                 matchPaths.emplace_back(entry.first->qInfo.pos, // start coordinate on query
                                         entry.first->qInfo.pos + bestPath.depth * 3 + 20, // end coordinate on query
-                                        bestPath.score, bestPath.hammingDist);
-                const Match * curMatch = entry.first;
-                edges[curMatch] = bestNextMatch;
-                matchPaths.back().matches.push_back(curMatch);
-                while (edges.find(curMatch) != edges.end()) {
-                    matchPaths.back().matches.push_back(edges[curMatch]);
-                    curMatch = edges[curMatch];
-                }
+                                        bestPath.score, bestPath.hammingDist,
+                                        entry.first,
+                                        bestPath.endMatch);
             }
         }
     }

@@ -717,9 +646,9 @@ depthScore Taxonomer::DFS(const vector<const Match *> &matches,
                           size_t depth, size_t MIN_DEPTH,
                           unordered_set<const Match *> &used,
                           unordered_map<const Match *, depthScore> & match2depthScore,
-                          unordered_map<const Match *, const Match *> & edges, float score, int hammingDist) {
+                          float score, int hammingDist) {
     depth++;
-    depthScore bestDepthScore = depthScore(0, 0, 0);
+    depthScore bestDepthScore = depthScore{};
     depthScore returnDepthScore;
     depthScore curDepthScore;
     float recievedScore = score;
@@ -730,8 +659,8 @@ depthScore Taxonomer::DFS(const vector<const Match *> &matches,
         } else {
             score += 2.0f - 0.5f * lastEndHamming;
         }
-        match2depthScore[curMatch] = depthScore(1, score - recievedScore, lastEndHamming);
-        return depthScore(depth, score, hammingDist + lastEndHamming);
+        match2depthScore[curMatch] = depthScore(1, score - recievedScore, lastEndHamming, curMatch);
+        return depthScore(depth, score, hammingDist + lastEndHamming, curMatch);
     } else { // not a leaf node
         uint8_t lastEndHamming = (curMatch->rightEndHamming >> 14);
         if (lastEndHamming == 0) {
             score += 3.0f;
         } else {
             score += 2.0f - 0.5f * lastEndHamming;
         }
         for (auto & nextMatch : linkedMatches.at(curMatch)) {
             used.insert(nextMatch);
             if (match2depthScore.find(nextMatch) != match2depthScore.end()) {
                 returnDepthScore = match2depthScore[nextMatch];
                 curDepthScore = depthScore(returnDepthScore.depth + depth,
                                            returnDepthScore.score + score,
-                                           returnDepthScore.hammingDist + hammingDist + lastEndHamming);
+                                           returnDepthScore.hammingDist + hammingDist + lastEndHamming,
+                                           returnDepthScore.endMatch);
             } else {
-                curDepthScore = DFS(matches, nextMatch, linkedMatches, depth, MIN_DEPTH, used, match2depthScore, edges, score, hammingDist + lastEndHamming);
+                curDepthScore = DFS(matches, nextMatch, linkedMatches, depth, MIN_DEPTH, used, match2depthScore, score, hammingDist + lastEndHamming);
             }
             if (curDepthScore.score > bestDepthScore.score && curDepthScore.depth > MIN_DEPTH) {
                 bestDepthScore = curDepthScore;
-                edges[curMatch] = nextMatch;
             }
         }
         if (bestDepthScore.depth > MIN_DEPTH) {
             match2depthScore[curMatch] = depthScore(bestDepthScore.depth - depth + 1,
                                                     bestDepthScore.score - recievedScore,
-                                                    bestDepthScore.hammingDist - hammingDist);
+                                                    bestDepthScore.hammingDist - hammingDist,
+                                                    bestDepthScore.endMatch);
         }
     }
     return bestDepthScore;

diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h
index b94c2348..c3186d43 100644
--- a/src/commons/Taxonomer.h
+++ b/src/commons/Taxonomer.h
@@ -23,21 +23,25 @@ struct TaxonScore {
 };

 struct depthScore {
-    depthScore(size_t depth, float score, int hammingDist) : depth(depth), score(score), hammingDist(hammingDist) {}
-    depthScore() : depth(0), score(0.f), hammingDist(0) {}
+    depthScore(size_t depth, float score, int hammingDist, const Match * endMatch) :
+        depth(depth), score(score), hammingDist(hammingDist), endMatch(endMatch) {}
+    depthScore() : depth(0), score(0.f), hammingDist(0), endMatch(nullptr) {}
     size_t depth;
     float score;
     int hammingDist;
+    const Match * endMatch;
 };

 struct MatchPath {
-    MatchPath(int start, int end, float score, int hammingDist) : start(start), end(end), score(score), hammingDist(hammingDist) {}
-    MatchPath() : start(0), end(0), score(0.f), hammingDist(0) {}
+    MatchPath(int start, int end, float score, int hammingDist, const Match * startMatch, const Match * endMatch) :
+        start(start), end(end), score(score), hammingDist(hammingDist), startMatch(startMatch), endMatch(endMatch) {}
+    MatchPath() : start(0), end(0), score(0.f), hammingDist(0), startMatch(nullptr), endMatch(nullptr) {}
     int start;
     int end;
     float score;
    int hammingDist;
-    vector<const Match *> matches;
+    const Match * startMatch;
+    const Match * endMatch;
 };

@@ -114,7 +118,7 @@ class Taxonomer {
                    const map<const Match *, vector<const Match *>> &linkedMatches,
                    size_t depth, size_t MIN_DEPTH,
                    unordered_set<const Match *> &used,
                    unordered_map<const Match *, depthScore> &match2depthScore,
-                   unordered_map<const Match *, const Match *> & edges, float score, int hammingDist);
+                   float score, int hammingDist);

     static bool isConsecutive(const Match * match1, const Match * match2);

diff --git a/src/metabuli.cpp b/src/metabuli.cpp
index b7285d0d..2458d7c1 100644
--- a/src/metabuli.cpp
+++ b/src/metabuli.cpp
@@ -58,14 +58,6 @@ std::vector<Command> commands = {
          {"DB dir", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory},
          {"out dir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory},
          {"job ID", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}},
-        // {"filter", filter, &localPar.filter, COMMAND_MAIN,
-        //  "Filtering reads based on the classification result",
-        //  nullptr,
-        //  "Jaebeom Kim ",
-        //  " ",
-        //  CITATION_SPACEPHARER,
-        //  {{"READ FILE", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfile},
-        //   {"FILTER DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}}},
        {"grade", grade, &localPar.grade, COMMAND_EXPERT,
         "Grade the classification result (only for benchmarking)",
         nullptr,
         "Jaebeom Kim ",
         " ",
         CITATION_SPACEPHARER,
         {{"List of answer sheets (Query ID 2 tax ID)", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
          {"List of assembly accessions of reference sequences", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
          {"Taxonomy directory", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::directory}}},
-        // {"seqHeader2TaxId", seqHeader2TaxId, &localPar.seqHeader2TaxId, COMMAND_EXPERT,
-        //  "It extracts k-mers from query sequences, and compares them to the target database",
-        //  nullptr,
-        //  "Jaebeom Kim ",
-        //  " ",
-        //  CITATION_SPACEPHARER,
-        //  {{"read-classification", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
-        //   {"Mapping file (accession to tax ID)", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}}},
        {"add-to-library", addToLibrary, &localPar.addToLibrary, COMMAND_DATABASE_CREATION,
         "It bins sequences into files according to their species.",
         nullptr,
         "Jaebeom Kim ",
         " ",
         CITATION_SPACEPHARER,
         {{"List of absolute paths of files to be added. One path per line.", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
          {"NCBI style accession2taxid file. It should be consistent to tax dump files.", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
          {"DB directory", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}}},
-        // {"apply-threshold", applyThreshold, &localPar.applyThreshold, COMMAND_EXPERT,
-        //  "Assigning taxonomy label to query reads",
-        //  nullptr,
-        //  "Jaebeom Kim ",
-        //  " ",
-        //  CITATION_SPACEPHARER,
-        //  {{"Old Results", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
-        //   {"OUT DIR", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory},
-        //   {"JOB ID", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile},
-        //   {"TAXONOMY DIR", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}}},
        {"binning2report", binning2report, &localPar.binning2report, COMMAND_FORMAT_CONVERSION,
         "It generates Kraken style report file from binning results",
         nullptr,
         "Jaebeom Kim ",
         " ",
         CITATION_SPACEPHARER,
         {{"Binning Result", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
          {"OUT DIR", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory},
          {"JOB ID", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile},
          {"TAXONOMY DIR", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}}},
-        // {"filter-by-genus", filterByGenus, &localPar.filterByGenus, COMMAND_EXPERT,
-        //  "It filters out reads classified as a specific genus",
-        //  nullptr,
-        //  "Jaebeom Kim ",
-        //  " ",
-        //  CITATION_SPACEPHARER,
-        //  {{"Binning Result", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
-        //   {"Genus list", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile},
-        //   {"TAXONOMY DIR", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}}},
        {"mapping2taxon", mapping2taxon, &localPar.mapping2taxon, COMMAND_EXPERT,
         "It takes a mapping file (multiple targets for each read) and generates a read2taxon file (one target for each read)",
         nullptr,
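The patch above also drops the `edges` back-pointer map from the consecutive-match DFS: because `depthScore` now carries `endMatch`, the best terminal match propagates up the recursion, and the caller can build a `MatchPath` directly from `entry.first` and `bestPath.endMatch`. A reduced, self-contained sketch of that idea; the scoring term, the graph, and the omission of memoization are simplifications, not the real code in Taxonomer.cpp:

```cpp
#include <cstddef>
#include <map>
#include <vector>

struct Match { int pos; };

struct depthScore {
    std::size_t depth = 0;
    float score = 0.f;
    const Match *endMatch = nullptr;  // deepest match on the best path
};

depthScore dfs(const Match *cur,
               const std::map<const Match *, std::vector<const Match *>> &linked,
               std::size_t depth, float score) {
    ++depth;
    score += 1.0f;  // stand-in for the per-match score term
    auto it = linked.find(cur);
    if (it == linked.end()) {             // leaf: the path ends at cur
        return {depth, score, cur};
    }
    depthScore best{};
    for (const Match *next : it->second) {
        depthScore cand = dfs(next, linked, depth, score);
        if (cand.score > best.score) best = cand;  // endMatch rides along
    }
    return best;
}

int main() {
    Match a{0}, b{3}, c{6};
    std::map<const Match *, std::vector<const Match *>> linked{{&a, {&b}}, {&b, {&c}}};
    depthScore best = dfs(&a, linked, 0, 0.f);
    // best.depth == 3 and best.endMatch == &c: together with the root match,
    // this is enough to emplace a MatchPath(start, end, score, dist, &a, best.endMatch)
    // without ever materializing the intermediate matches.
    return best.endMatch == &c ? 0 : 1;
}
```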
From 79acbc8c6196d46aa1b5bb1765de351bba38cd5 Mon Sep 17 00:00:00 2001
From: Jaebeom Kim
Date: Fri, 8 Dec 2023 02:37:15 +0900
Subject: [PATCH 4/5] copy Matches of species

---
 src/commons/Taxonomer.cpp | 34 +++++++++++++++++-----------------
 src/commons/Taxonomer.h   |  6 +++---
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp
index f139be2d..cff5d5c1 100644
--- a/src/commons/Taxonomer.cpp
+++ b/src/commons/Taxonomer.cpp
@@ -97,7 +97,7 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery,
 //        }
 //    }
     // Get the best species for current query
-    vector<const Match *> speciesMatches;
+    vector<Match> speciesMatches;
     speciesMatches.reserve(end - offset + 1);
     TaxonScore speciesScore(0, 0, 0, 0, 0);
     if (par.seqMode == 2) {
@@ -177,26 +177,26 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery,
 //    }
 }

-void Taxonomer::filterRedundantMatches(vector<const Match *> & speciesMatches,
+void Taxonomer::filterRedundantMatches(vector<Match> & speciesMatches,
                                        map<TaxID, int> & taxCnt) {
     // Sort matches by the coordinate on the query
     sort(speciesMatches.begin(), speciesMatches.end(),
-         [](const Match * a, const Match * b) { return a->qInfo.pos < b->qInfo.pos; });
+         [](const Match & a, const Match & b) { return a.qInfo.pos < b.qInfo.pos; });

     // Remove redundant matches
     size_t matchNum = speciesMatches.size();
     for (size_t i = 0; i < matchNum;) {
-        size_t currQuotient = speciesMatches[i]->qInfo.pos / 3;
-        uint8_t minHamming = speciesMatches[i]->hamming;
-        const Match * minHammingMatch = speciesMatches[i];
-        TaxID minHammingTaxId = minHammingMatch->targetId;
-        while ((i < matchNum) && (currQuotient == speciesMatches[i]->qInfo.pos / 3)) {
-            if (speciesMatches[i]->hamming < minHamming) {
-                minHamming = speciesMatches[i]->hamming;
+        size_t currQuotient = speciesMatches[i].qInfo.pos / 3;
+        uint8_t minHamming = speciesMatches[i].hamming;
+        Match minHammingMatch = speciesMatches[i];
+        TaxID minHammingTaxId = minHammingMatch.targetId;
+        while ((i < matchNum) && (currQuotient == speciesMatches[i].qInfo.pos / 3)) {
+            if (speciesMatches[i].hamming < minHamming) {
+                minHamming = speciesMatches[i].hamming;
                 minHammingMatch = speciesMatches[i];
-                minHammingTaxId = minHammingMatch->targetId;
-            } else if (speciesMatches[i]->hamming == minHamming) {
-                minHammingTaxId = taxonomy->LCA(minHammingTaxId, speciesMatches[i]->targetId);
+                minHammingTaxId = minHammingMatch.targetId;
+            } else if (speciesMatches[i].hamming == minHamming) {
+                minHammingTaxId = taxonomy->LCA(minHammingTaxId, speciesMatches[i].targetId);
             }
             i++;
         }
@@ -268,7 +268,7 @@ TaxID Taxonomer::BFS(const unordered_map<TaxID, TaxonCounts> & cladeCnt, TaxID r
     }
 }

-TaxonScore Taxonomer::getBestSpeciesMatches(vector<const Match *> & speciesMatches,
+TaxonScore Taxonomer::getBestSpeciesMatches(vector<Match> & speciesMatches,
                                             const Match *matchList,
                                             size_t end,
                                             size_t offset,
@@ -358,7 +358,7 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector<Match> & speciesMatch
                                               - speciesMatchRange[bestScore.taxId].first + 1);
     for (size_t j = speciesMatchRange[bestScore.taxId].first; j < speciesMatchRange[bestScore.taxId].second; j++) {
-        speciesMatches.push_back(& matchList[j]);
+        speciesMatches.push_back(matchList[j]);
     }
     bestScore.coverage = coveredLength / queryLength;
     bestScore.hammingDist = hammingDist;
@@ -366,7 +366,7 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector<Match> & speciesMatch
     return bestScore;
 }

-TaxonScore Taxonomer::getBestSpeciesMatches(vector<const Match *> & speciesMatches,
+TaxonScore Taxonomer::getBestSpeciesMatches(vector<Match> & speciesMatches,
                                             const Match *matchList,
                                             size_t end,
                                             size_t offset,
@@ -455,7 +455,7 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector<Match> & speciesMatch
                                               - speciesMatchRange[bestScore.taxId].first + 1);
     for (size_t i = speciesMatchRange[bestScore.taxId].first; i < speciesMatchRange[bestScore.taxId].second; i++) {
-        speciesMatches.push_back(&matchList[i]);
+        speciesMatches.push_back(matchList[i]);
     }
     bestScore.coverage = coveredLength / (readLength1 + readLength2);
     bestScore.hammingDist = hammingDist;

diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h
index c3186d43..a9bc0486 100644
--- a/src/commons/Taxonomer.h
+++ b/src/commons/Taxonomer.h
@@ -111,7 +111,7 @@ class Taxonomer {

     void trimMatchPath(MatchPath & path1, const MatchPath & path2, int overlapLength);

-    void filterRedundantMatches(vector<const Match *> & speciesMatches,
+    void filterRedundantMatches(vector<Match> & speciesMatches,
                                 map<TaxID, int> & taxCnt);

     depthScore DFS(const vector<const Match *> &matches,
                    const Match * curMatchIdx,
@@ -130,10 +130,10 @@ class Taxonomer {
     TaxonScore getBestGenusMatches(vector<Match> &matchesForMajorityLCA, const Match *matchList, size_t end,
                                    size_t offset, int readLength1, int readLength2);

-    TaxonScore getBestSpeciesMatches(vector<const Match *> &speciesMatches, const Match *matchList, size_t end,
+    TaxonScore getBestSpeciesMatches(vector<Match> &speciesMatches, const Match *matchList, size_t end,
                                      size_t offset, int queryLength);

-    TaxonScore getBestSpeciesMatches(vector<const Match *> &speciesMatches, const Match *matchList, size_t end,
+    TaxonScore getBestSpeciesMatches(vector<Match> &speciesMatches, const Match *matchList, size_t end,
                                      size_t offset, int readLength1, int readLength2);

 //    TaxonScore getBestGenusMatches_spaced(vector<Match> &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset,
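The patch above switches `speciesMatches` from pointers into the shared match buffer to `Match` copies, so the sort in `filterRedundantMatches` rearranges values directly instead of chasing pointers on every comparison. A self-contained sketch of the filter being refactored, under simplifying assumptions: `Match` is reduced to the three fields used here, `lca()` stands in for `taxonomy->LCA()`, and the final per-taxon count increment is inferred, since the hunk ends before it:

```cpp
#include <algorithm>
#include <cstdint>
#include <map>
#include <vector>

using TaxID = int;

// Reduced Match: only the fields the filter reads.
struct Match {
    struct { uint32_t pos; } qInfo;  // position on the query
    uint8_t hamming;                 // Hamming distance of the k-mer match
    TaxID targetId;
};

// Stand-in for taxonomy->LCA(); the real one walks the taxonomy tree.
static TaxID lca(TaxID a, TaxID b) { return a == b ? a : 1; }

// Per codon position (pos / 3), keep only the minimum-Hamming match;
// ties collapse to the LCA of the tied taxa.
void filterRedundantMatches(std::vector<Match> & speciesMatches,
                            std::map<TaxID, int> & taxCnt) {
    std::sort(speciesMatches.begin(), speciesMatches.end(),
              [](const Match & a, const Match & b) { return a.qInfo.pos < b.qInfo.pos; });
    const size_t matchNum = speciesMatches.size();
    for (size_t i = 0; i < matchNum;) {
        const size_t currQuotient = speciesMatches[i].qInfo.pos / 3;
        uint8_t minHamming = speciesMatches[i].hamming;
        TaxID minHammingTaxId = speciesMatches[i].targetId;
        while (i < matchNum && currQuotient == speciesMatches[i].qInfo.pos / 3) {
            if (speciesMatches[i].hamming < minHamming) {
                minHamming = speciesMatches[i].hamming;
                minHammingTaxId = speciesMatches[i].targetId;
            } else if (speciesMatches[i].hamming == minHamming) {
                minHammingTaxId = lca(minHammingTaxId, speciesMatches[i].targetId);
            }
            ++i;
        }
        ++taxCnt[minHammingTaxId];  // inferred: count the surviving taxon
    }
}
```

The next patch swaps the `map` for an `unordered_map`, which fits since the counts are only aggregated and never needed in key order.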
From 79e743d5994a4ea2db46f3706ba9a032a7224528 Mon Sep 17 00:00:00 2001
From: Jaebeom Kim
Date: Fri, 8 Dec 2023 13:22:00 +0900
Subject: [PATCH 5/5] remove unnecessary copies

---
 src/commons/KmerMatcher.cpp  | 12 ++-----
 src/commons/LocalUtil.cpp    |  6 ++--
 src/commons/QueryIndexer.cpp |  7 ++--
 src/commons/Taxonomer.cpp    | 70 +++++++++++++-----------------------
 src/commons/Taxonomer.h      |  6 ++--
 5 files changed, 37 insertions(+), 64 deletions(-)

diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp
index 9c87d527..7627f0aa 100644
--- a/src/commons/KmerMatcher.cpp
+++ b/src/commons/KmerMatcher.cpp
@@ -331,11 +331,6 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI
                 }
                 for (int k = 0; k < currMatchNum; k++) {
                     idx = selectedMatches[k];
-                    // Check if candidateKmerInfos[idx].sequenceID is valid
-                    // if (taxId2genusId.find(candidateKmerInfos[idx].sequenceID) == taxId2genusId.end() ||
-                    //     taxId2speciesId.find(candidateKmerInfos[idx].sequenceID) == taxId2speciesId.end()) {
-                    //     cout << "Error: " << candidateKmerInfos[idx].sequenceID << " is not found in the taxonomy database." << endl;
-                    // }
                     matches[matchCnt] = {queryKmerList[j].info,
                                          candidateKmerInfos[idx].sequenceID,
                                          taxId2speciesId[candidateKmerInfos[idx].sequenceID],
@@ -474,10 +469,6 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI
     free(splitCheckList);
     queryKmerNum = 0;

-#ifdef OPENMP
-    omp_set_num_threads(threads);
-#endif
-
     totalMatchCnt += matchBuffer->startIndexOfReserve;
     return 1;
 }
@@ -520,8 +511,9 @@ void KmerMatcher::compareDna(uint64_t query,
     }

     // Select target k-mers that passed hamming criteria
+    uint8_t maxHamming = min(minHammingSum * 2, 7);
     for (size_t h = 0; h < size; h++) {
-        if (hammingSums[h] <= min(minHammingSum * 2, 7)) {
+        if (hammingSums[h] <= maxHamming) {
             selectedMatches.push_back(h);
             selectedHammingSum.push_back(hammingSums[h]);
             if (frame < 3) { // Frame of query k-mer

diff --git a/src/commons/LocalUtil.cpp b/src/commons/LocalUtil.cpp
index d981f238..6b87319b 100644
--- a/src/commons/LocalUtil.cpp
+++ b/src/commons/LocalUtil.cpp
@@ -35,11 +35,11 @@ void LocalUtil::splitQueryFile(std::vector<SequenceBlock> & sequences, const std

 int LocalUtil::getMaxCoveredLength(int queryLength) {
     if (queryLength % 3 == 2) {
-        return queryLength - 2; // 2
+        return queryLength - 2;
     } else if (queryLength % 3 == 1) {
-        return queryLength - 4; // 4
+        return queryLength - 4;
     } else {
-        return queryLength - 3; // 3
+        return queryLength - 3;
     }
 }

diff --git a/src/commons/QueryIndexer.cpp b/src/commons/QueryIndexer.cpp
index 29191f8e..7846a969 100644
--- a/src/commons/QueryIndexer.cpp
+++ b/src/commons/QueryIndexer.cpp
@@ -14,6 +14,7 @@ QueryIndexer::QueryIndexer(const LocalParameters & par) {
     maxRam = par.ramUsage;
     threads = par.threads;
     bytesPerKmer = sizeof(QueryKmer) + matchPerKmer * sizeof(Match);
+    // std::cout << "bytesPerKmer: " << bytesPerKmer << "\n";

     readNum_1 = 0;
     readNum_2 = 0;
     spaceNum = par.spaceMask.length() - kmerLength;
@@ -25,6 +26,7 @@ QueryIndexer::QueryIndexer(const LocalParameters & par) {

 void QueryIndexer::setAvailableRam() {
     availableRam = ((size_t) maxRam * (size_t) 1024 * 1024 * 1024)
                    - ((size_t) 128 * 1024 * 1024 * (size_t) threads);
+    // std::cout << "availableRam: " << availableRam << "\n";
 }

 void QueryIndexer::indexQueryFile() {
@@ -40,7 +42,6 @@ void QueryIndexer::indexQueryFile() {
             totalReadLength += kseq->entry.sequence.l;
             size_t currentKmerCnt = LocalUtil::getQueryKmerNumber(kseq->entry.sequence.l, spaceNum);
             kmerCnt += currentKmerCnt;
-            // std::cout << "currentKmerCnt: " << kmerCnt << "\n";
             if (bytesPerKmer * kmerCnt + ((size_t) 200 * seqCnt) > availableRam) {
                 querySplits.emplace_back(start, readNum_1 - 1, kmerCnt - currentKmerCnt);
@@ -72,7 +73,6 @@ void QueryIndexer::indexQueryFile() {
                     seqCnt_1++;
                     totalReadLength += kseq_1->entry.sequence.l;
                     currentKmerCnt = LocalUtil::getQueryKmerNumber(kseq_1->entry.sequence.l, spaceNum);
-                    kmerCnt += currentKmerCnt;
                 } else {
                     end = true;
                 }
@@ -82,7 +82,7 @@ void QueryIndexer::indexQueryFile() {
                     seqCnt_2++;
                     totalReadLength += kseq_2->entry.sequence.l;
                     currentKmerCnt += LocalUtil::getQueryKmerNumber(kseq_2->entry.sequence.l, spaceNum);
-                    kmerCnt += currentKmerCnt;
+
                 } else {
                     end = true;
                 }
@@ -92,6 +92,7 @@ void QueryIndexer::indexQueryFile() {
                     EXIT(EXIT_FAILURE);
                 }

+                kmerCnt += currentKmerCnt;
                 if (bytesPerKmer * kmerCnt + ((size_t) 200 * seqCnt_1) > availableRam) {
                     querySplits.emplace_back(start, readNum_1 - 1, kmerCnt - currentKmerCnt);
                     kmerCnt = currentKmerCnt;

diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp
index cff5d5c1..63fb5731 100644
--- a/src/commons/Taxonomer.cpp
+++ b/src/commons/Taxonomer.cpp
@@ -89,13 +89,6 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery,
                                 const Match *matchList,
                                 vector<Query> & queryList,
                                 const LocalParameters &par) {
-
-//    if (true) {
-//        cout << "# " << currentQuery << " " << queryList[currentQuery].name << endl;
-//        for (size_t i = offset; i < end + 1; i++) {
-//            cout << matchList[i].targetId << " " << matchList[i].qInfo.frame << " " << matchList[i].qInfo.pos << " " << int(matchList[i].hamming) << " " << int(matchList[i].redundancy) << endl;
-//        }
-//    }
     // Get the best species for current query
     vector<Match> speciesMatches;
     speciesMatches.reserve(end - offset + 1);
@@ -109,14 +102,6 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery,
                                                queryList[currentQuery].queryLength);
     }

-//    if (true) {
-//        cout << "# " << currentQuery << " " << queryList[currentQuery].name << " filtered\n";
-//        for (size_t i = 0; i < genusMatches.size(); i++) {
-//            cout << genusMatches[i].targetId << " " << genusMatches[i].qInfo.frame << " " << genusMatches[i].qInfo.pos << " " << int(genusMatches[i].hamming) << " " << int(genusMatches[i].redundancy) << endl;
-//        }
-//        cout << "Genus score: " << genusScore.score << "\n";
-//    }
-
     // If there is no proper species for current query, it is un-classified.
     if (speciesScore.score == 0 || speciesScore.score < par.minScore) {
         queryList[currentQuery].isClassified = false;
@@ -138,8 +123,12 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery,
         return;
     }

-    // Filter redundant matches
-    filterRedundantMatches(speciesMatches, queryList[currentQuery].taxCnt);
+    // Filter redundant matches
+    unordered_map<TaxID, int> taxCnt;
+    filterRedundantMatches(speciesMatches, taxCnt);
+    for (auto & tax : taxCnt) {
+        queryList[currentQuery].taxCnt[tax.first] = tax.second;
+    }

     // If score is not enough, classify to the parent of the selected species
     if (speciesScore.score < par.minSpScore) {
@@ -153,7 +142,7 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery,
     }

     // Lower rank classification
-    TaxID result = lowerRankClassification(queryList[currentQuery].taxCnt,
+    TaxID result = lowerRankClassification(taxCnt,
                                            speciesScore.taxId,
                                            queryList[currentQuery].queryLength + queryList[currentQuery].queryLength2);

@@ -164,21 +153,10 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery,
     queryList[currentQuery].coverage = speciesScore.coverage;
     queryList[currentQuery].hammingDist = speciesScore.hammingDist;
     queryList[currentQuery].newSpecies = false;
-//    if (par.printLog) {
-//        cout << "# " << currentQuery << endl;
-//        for (size_t i = 0; i < genusMatches.size(); i++) {
-//            cout << i << " " << genusMatches[i].qInfo.pos << " " <<
-//            genusMatches[i].targetId << " " << int(genusMatches[i].hamming) << endl;
-//        }
-//        cout << "Score: " << speciesScore.score << " " << selectedSpecies << " "
-//             << taxonomy->getString(taxonomy->taxonNode(selectedSpecies)->rankIdx)
-//
-//             << endl;
-//    }
 }

 void Taxonomer::filterRedundantMatches(vector<Match> & speciesMatches,
-                                       map<TaxID, int> & taxCnt) {
+                                       unordered_map<TaxID, int> & taxCnt) {
     // Sort matches by the coordinate on the query
     sort(speciesMatches.begin(), speciesMatches.end(),
          [](const Match & a, const Match & b) { return a.qInfo.pos < b.qInfo.pos; });
@@ -204,7 +182,7 @@ void Taxonomer::filterRedundantMatches(vector<Match> & speciesMatches,
     }
 }

-TaxID Taxonomer::lowerRankClassification(const map<TaxID, int> & taxCnt, TaxID spTaxId, int queryLength) {
+TaxID Taxonomer::lowerRankClassification(const unordered_map<TaxID, int> & taxCnt, TaxID spTaxId, int queryLength) {
     unsigned int maxCnt = (queryLength - 1)/denominator + 1;
     unordered_map<TaxID, TaxonCounts> cladeCnt;
     getSpeciesCladeCounts(taxCnt, cladeCnt, spTaxId);
@@ -225,7 +203,7 @@ TaxID Taxonomer::lowerRankClassification(const unordered_map<TaxID, int> & taxCnt, TaxID s
     }
 }

-void Taxonomer::getSpeciesCladeCounts(const map<TaxID, int> &taxCnt,
+void Taxonomer::getSpeciesCladeCounts(const unordered_map<TaxID, int> &taxCnt,
                                       unordered_map<TaxID, TaxonCounts> & cladeCount,
                                       TaxID speciesTaxID) {
     for (auto it = taxCnt.begin(); it != taxCnt.end(); ++it) {
@@ -475,9 +453,11 @@ float Taxonomer::combineMatchPaths(vector<MatchPath> & matchPaths,
     // 1. Add the matchPath with the highest score to combinedMatchPaths
     // 2. Add the matchPath with the highest score that is not overlapped with the matchPath in combinedMatchPaths
     // 3. Repeat 2 until no matchPath can be added
+    float score = 0;
     for (size_t i = 0; i < matchPaths.size(); i++) {
         if (combinedMatchPaths.empty()) {
             combinedMatchPaths.push_back(matchPaths[i]);
+            score += matchPaths[i].score;
         } else {
             bool isOverlapped = false;
             for (size_t j = 0; j < combinedMatchPaths.size(); j++) {
@@ -486,27 +466,26 @@ float Taxonomer::combineMatchPaths(vector<MatchPath> & matchPaths,
                                             - max(matchPaths[i].start, combinedMatchPaths[j].start) + 1;
                     if (overlappedLength < 24) {
                         trimMatchPath(matchPaths[i], combinedMatchPaths[j], overlappedLength);
+                        continue;
                     } else {
                         isOverlapped = true;
                         break;
                     }
-                    // Current path trimmed too much
-                    if (matchPaths[i].end - matchPaths[i].start + 1 < 24) {
+                    if (matchPaths[i].end - matchPaths[i].start + 1 < 24) { // Current path trimmed too much
                         isOverlapped = true;
                         break;
-                    }
+                    }
                 }
             }
             if (!isOverlapped) {
                 combinedMatchPaths.push_back(matchPaths[i]);
+                score += matchPaths[i].score;
             }
         }
-    }
-    // Calculate the score of combinedMatchPaths
-    float score = 0;
-    for (auto & matchPath : combinedMatchPaths) {
-        score += matchPath.score;
+        // if (score / readLength >= 1) {
+        //     break;
+        // }
     }
     return score / readLength;
 }
@@ -520,12 +499,10 @@ bool Taxonomer::isMatchPathOverlapped(const MatchPath & matchPath1,
 void Taxonomer::trimMatchPath(MatchPath & path1, const MatchPath & path2, int overlapLength) {
     if (path1.start < path2.start) {
         path1.end = path2.start - 1;
-        // uint8_t lastEndHamming = GET_2_BITS(path1.matches.back()->rightEndHamming);
         path1.hammingDist = max(0, path1.hammingDist - path1.endMatch->getRightPartHammingDist(overlapLength/3));
         path1.score = path1.score - path1.endMatch->getRightPartScore(overlapLength/3) - (overlapLength % 3);
     } else {
         path1.start = path2.end + 1;
-        // uint8_t lastEndHamming = GET_2_BITS(path1.matches.front()->rightEndHamming >> 14);
         path1.hammingDist = max(0, path1.hammingDist - path1.startMatch->getLeftPartHammingDist(overlapLength/3));
         path1.score = path1.score - path1.startMatch->getLeftPartScore(overlapLength/3) - (overlapLength % 3);
     }
 }
@@ -558,7 +535,7 @@ void Taxonomer::remainConsecutiveMatches(const vector<const Match *> & curFrameM
                 // Compare curPosMatches and nextPosMatches
                 for (auto &curPosMatch: curPosMatches) {
                     for (auto &nextPosMatch: nextPosMatches) {
-                        if (curPosMatch->isConsecutive_DNA(nextPosMatch)) {
+                        if (isConsecutive(curPosMatch, nextPosMatch)){
                             linkedMatches[curPosMatch].push_back(nextPosMatch);
                         }
                     }
@@ -586,9 +563,12 @@ void Taxonomer::remainConsecutiveMatches(const vector<const Match *> & curFrameM
                 // Compare curPosMatches and nextPosMatches
                 for (auto &curPosMatch: curPosMatches) {
                     for (auto &nextPosMatch: nextPosMatches) {
-                        if (nextPosMatch->isConsecutive_DNA(curPosMatch)) {
+                        if (isConsecutive(nextPosMatch, curPosMatch)){
                             linkedMatches[curPosMatch].push_back(nextPosMatch);
                         }
+                        // if (nextPosMatch->isConsecutive_DNA(curPosMatch)) {
+                        //     linkedMatches[curPosMatch].push_back(nextPosMatch);
+                        // }
                     }
                 }
@@ -1052,7 +1032,7 @@ TaxonScore Taxonomer::scoreSpecies(const vector<Match> &matches,
 bool Taxonomer::isConsecutive(const Match * match1, const Match * match2) {
     // match1 87654321 -> 08765432
     // match2 98765432 -> 08765432
-    return (match1->rightEndHamming >> 2) == (match2->rightEndHamming & 0x3FFF);
+    return (match1->dnaEncoding >> 3) == (match2->dnaEncoding & 0x1FFFFF);
 }

 bool Taxonomer::isConsecutive_diffFrame(const Match * match1, const Match * match2) {

diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h
index a9bc0486..0279fe38 100644
--- a/src/commons/Taxonomer.h
+++ b/src/commons/Taxonomer.h
@@ -112,7 +112,7 @@ class Taxonomer {
     void trimMatchPath(MatchPath & path1, const MatchPath & path2, int overlapLength);

     void filterRedundantMatches(vector<Match> & speciesMatches,
-                                map<TaxID, int> & taxCnt);
+                                unordered_map<TaxID, int> & taxCnt);

     depthScore DFS(const vector<const Match *> &matches,
                    const Match * curMatchIdx,
                    const map<const Match *, vector<const Match *>> &linkedMatches,
@@ -177,9 +177,9 @@ class Taxonomer {
                             int queryLength,
                             int queryLength2);

-    TaxID lowerRankClassification(const map<TaxID, int> & matches, TaxID speciesID, int queryLength);
+    TaxID lowerRankClassification(const unordered_map<TaxID, int> & taxCnt, TaxID speciesID, int queryLength);

-    void getSpeciesCladeCounts(const map<TaxID, int> & taxCnt,
+    void getSpeciesCladeCounts(const unordered_map<TaxID, int> & taxCnt,
                                unordered_map<TaxID, TaxonCounts> & cladeCnt,
                                TaxID spciesID);
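The new `isConsecutive` in the last Taxonomer.cpp hunk compares bit-packed k-mer encodings directly. The packing of `dnaEncoding` is not spelled out in the patch; the shift and mask suggest eight 3-bit codes, one per k-mer position, with the first position in the low bits, so that dropping the oldest position of one match and the newest position of the other leaves the same 7-position overlap. The previous version applied the same idea to the 2-bit-per-position `rightEndHamming` field (shift by 2, mask `0x3FFF` = 14 bits); the new one shifts by 3 and masks `0x1FFFFF` = 21 bits. A small self-checking sketch under that packing assumption:

```cpp
#include <cassert>
#include <cstdint>

// Two matches are consecutive when enc1 without its first (oldest) position
// equals enc2 without its last (newest) position: a 7-position overlap.
bool isConsecutive(uint32_t enc1, uint32_t enc2) {
    // enc1: positions 8..1 -> drop position 1 (low 3 bits)
    // enc2: positions 9..2 -> drop position 9 (keep low 21 bits)
    return (enc1 >> 3) == (enc2 & 0x1FFFFF);
}

int main() {
    // Build two overlapping encodings from 3-bit codes c1..c9.
    uint32_t codes[9] = {1, 2, 3, 4, 5, 6, 7, 0, 5};
    uint32_t enc1 = 0, enc2 = 0;
    for (int i = 7; i >= 0; --i) enc1 = (enc1 << 3) | codes[i];  // c8..c1, c1 in low bits
    for (int i = 8; i >= 1; --i) enc2 = (enc2 << 3) | codes[i];  // c9..c2, c2 in low bits
    assert(isConsecutive(enc1, enc2));
    return 0;
}
```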