From 8a7783b662c34609717f425c0c5645915ded0e78 Mon Sep 17 00:00:00 2001 From: JaebeomKim0731 Date: Fri, 11 Aug 2023 15:20:11 +0900 Subject: [PATCH 01/65] clean codes for parameters --- src/commons/LocalParameters.cpp | 105 +++++++++++++++----------------- src/commons/LocalParameters.h | 2 - src/workflow/classify.cpp | 1 - 3 files changed, 48 insertions(+), 60 deletions(-) diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index 5a7c7d5c..47b2b689 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -9,28 +9,28 @@ LocalParameters::LocalParameters() : "NCBI: 10239 [Default]\nCUSTOM: Check names.dmp file ", typeid(int), (void *) &virusTaxId, - "[^[1-9]\\d*$]"), + "^[0-9]+$"), BACTERIA_TAX_ID(BACTERIA_TAX_ID_ID, "--bacteria-taxid", "Taxonomy ID of bacteria taxon", "NCBI: 2 [Default]\nCUSTOM: Check names.dmp file ", typeid(int), (void *) &bacteriaTaxId, - "[^[1-9]\\d*$]"), + "^[0-9]+$"), ARCHAEA_TAX_ID(ARCHAEA_TAX_ID_ID, "--archaea-taxid", "Taxonomy ID of archaea taxon", "NCBI: 2157 [Default]\nCUSTOM: Check names.dmp file ", typeid(int), (void *) &archaeaTaxId, - "[^[1-9]\\d*$]"), + "^[0-9]+$"), EUKARYOTA_TAX_ID(EUKARYOTA_TAX_ID_ID, "--eukaryota-taxid", "Taxonomy ID of eukaryota taxon", "NCBI: 2759 [Default]\nCUSTOM: Check names.dmp file ", typeid(int), (void *) &eukaryotaTaxId, - "[^[1-9]\\d*$]"), + "^[0-9]+$"), SEQ_MODE(SEQ_MODE_ID, "--seq-mode", "Sequencing type", @@ -38,13 +38,6 @@ LocalParameters::LocalParameters() : typeid(int), (void *) &seqMode, "[1-3]"), - MEMORY_MODE(MEMORY_MODE_ID, - "--memory-mode", - "Keeping k-mer matches in the RAM or writing into a file", - "Writing: 1 \nRAM: 2", - typeid(int), - (void *) &memoryMode, - "[1-2]"), REDUCED_AA(REDUCED_AA_ID, "--reduced-aa", "Using reduced 15 alphabets to encode amino acids. 
It increases sensitivity", @@ -73,14 +66,14 @@ LocalParameters::LocalParameters() : "A mask should contain at least eight '1's, and '0' means skip.", typeid(std::string), (void *) &spaceMask, - ""), + "^.*$"), MIN_COVERED_POS(MIN_COVERED_POS_ID, "--min-covered-pos", "Minimum number of covered positions of a range", "Minimum number of covered positions of a range", typeid(int), (void *) &minCoveredPos, - ""), + "^[0-9]+$"), HAMMING_MARGIN(HAMMING_MARGIN_ID, "--hamming-margin", "If a query k-mer has multiple matches, the matches with hamming distance lower than sum of \n" @@ -103,35 +96,63 @@ LocalParameters::LocalParameters() : "Path to prodigal training information files", typeid(std::string), (void *) &tinfoPath, - ""), + "^.*$"), RAM_USAGE(RAM_USAGE_ID, "--max-ram", "RAM usage in GiB", "RAM usage in GiB", typeid(int), (void *) &ramUsage, - "^[1-9]{1}[0-9]*$"), + "^[0-9]+$"), PRINT_LOG(PRINT_LOG_ID, "--print-log", "Print logs to debug", "Print logs to debug", typeid(int), (void *) &printLog, - ""), + "^[0-9]+$"), + MAX_GAP(MAX_GAP_ID, + "--max-gap", + "Maximum gap between two consecutive k-mers (used only with spaced k-mer)", + "Maximum gap between two consecutive k-mers (used only with spaced k-mer)", + typeid(int), + (void *) &maxGap, + "^[0-9]+$"), + MIN_CONS_CNT(MIN_CONS_CNT_ID, + "--min-cons-cnt", + "Minimum number of consecutive metamer matches to be used for prokaryote/virus classification", + "Minimum number of consecutive metamer matches to be used for prokaryote/virus classification", + typeid(int), + (void *) &minConsCnt, + "^[0-9]+$"), + MIN_CONS_CNT_EUK(MIN_CONS_CNT_EUK_ID, + "--min-cons-cnt-euk", + "Minimum number of consecutive metamer matches to be used for eukaryote classification", + "Minimum number of consecutive metamer matches to be used for eukaryote classification", + typeid(int), + (void *) &minConsCntEuk, + "^[0-9]+$"), + MATCH_PER_KMER(MATCH_PER_KMER_ID, + "--match-per-kmer", + "Number of matches per query k-mer", + "Number of matches per query k-mer. 
Larger values assign more memory for storing k-mer matches.", + typeid(int), + (void *) &matchPerKmer, + "^[0-9]+$"), LIBRARY_PATH(LIBRARY_PATH_ID, "--library-path", "Path to library where the FASTA files are stored", "Path to library where the FASTA files are stored", typeid(std::string), (void *) &libraryPath, - ""), + "^.*$"), TAXONOMY_PATH(TAXONOMY_PATH_ID, "--taxonomy-path", "Directory where the taxonomy dump files are stored", "Directory where the taxonomy dump files are stored", typeid(std::string), (void *) &taxonomyPath, - ""), + "^.*$"), IS_ASSEMBLY(IS_ASSEMBLY_ID, "--assembly", "Input is an assembly", @@ -139,97 +160,69 @@ LocalParameters::LocalParameters() : typeid(bool), (void *) &assembly, ""), - MAX_GAP(MAX_GAP_ID, - "--max-gap", - "Maximum gap between two consecutive k-mers (used only with spaced k-mer)", - "Maximum gap between two consecutive k-mers (used only with spaced k-mer)", - typeid(int), - (void *) &maxGap, - ""), - MIN_CONS_CNT(MIN_CONS_CNT_ID, - "--min-cons-cnt", - "Minimum number of consecutive metamer matches to be used for prokaryote/virus classification", - "Minimum number of consecutive metamer matches to be used for prokaryote/virus classification", - typeid(int), - (void *) &minConsCnt, - ""), - MIN_CONS_CNT_EUK(MIN_CONS_CNT_EUK_ID, - "--min-cons-cnt-euk", - "Minimum number of consecutive metamer matches to be used for eukaryote classification", - "Minimum number of consecutive metamer matches to be used for eukaryote classification", - typeid(int), - (void *) &minConsCntEuk, - ""), SPLIT_NUM(SPLIT_NUM_ID, "--split-num", "A database is divided to N splits (offsets). During classification, unnecessary splits are skipped", "A database is divided to N splits (offsets). During classification, unnecessary splits are skipped", typeid(int), (void *) &splitNum, - ""), + "^[0-9]+$"), BUFFER_SIZE(BUFFER_SIZE_ID, "--buffer-size", "Buffer size (the number of k-mers)", "Buffer size (the number of k-mers)", typeid(size_t), (void *) &bufferSize, - ""), + "^[0-9]+$"), TEST_RANK(TEST_RANK_ID, "--test-rank", ".", "csv of ranks to be tested", typeid(std::string), (void *) &testRank, - ""), + "^.*$"), TEST_TYPE(TEST_TYPE_ID, "--test-type", ".", "Test Type", typeid(std::string), (void *) &testType, - ""), + "^.*$"), READID_COL(READID_COL_ID, "--readid-col", "Column number of accession in classification result", "Column number of accession in classification result", typeid(int), (void *) &readIdCol, - ""), + "^[0-9]+$"), TAXID_COL(TAXID_COL_ID, "--taxid-col", "Column number of taxonomy ID in classification result", "Column number of taxonomy ID in classification result", typeid(int), (void *) &taxidCol, - ""), + "^[0-9]+$"), SCORE_COL(SCORE_COL_ID, "--score-col", "Column number of score in classification result", "Column number of score in classification result", typeid(int), (void *) &scoreCol, - ""), + "^[0-9]+$"), COVERAGE_COL(COVERAGE_COL_ID, "--coverage-col", "Column number of coverage in classification result", "Column number of coverage in classification result", typeid(int), (void *) &coverageCol, - ""), - MATCH_PER_KMER(MATCH_PER_KMER_ID, - "--match-per-kmer", - "Number of matches per query k-mer", - "Number of matches per query k-mer. 
Larger values assign more memory for storing k-mer matches.", - typeid(int), - (void *) &matchPerKmer, - ""), + "^[0-9]+$"), PRINT_COLUMNS(PRINT_COLUMNS_ID, "--print-columns", "CSV of column numbers to be printed", "CSV of column numbers to be printed", typeid(std::string), (void *) &printColumns, - "") + "^.*$") { //add_to_library @@ -247,12 +240,10 @@ LocalParameters::LocalParameters() : classify.push_back(&PARAM_THREADS); classify.push_back(&SEQ_MODE); classify.push_back(&VIRUS_TAX_ID); -// classify.push_back(&MEMORY_MODE); classify.push_back(&REDUCED_AA); classify.push_back(&MIN_SCORE); classify.push_back(&MIN_COVERAGE); classify.push_back(&SPACED); -// classify.push_back(&MIN_CONSECUTIVE); classify.push_back(&HAMMING_MARGIN); classify.push_back(&MIN_SP_SCORE); classify.push_back(&PARAM_V); diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index abde1c8f..4169ab25 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -40,7 +40,6 @@ class LocalParameters : public Parameters { // Classify PARAMETER(SEQ_MODE) - PARAMETER(MEMORY_MODE) PARAMETER(REDUCED_AA) PARAMETER(MIN_SCORE) PARAMETER(MIN_COVERAGE) @@ -81,7 +80,6 @@ class LocalParameters : public Parameters { // Classify int seqMode; - int memoryMode; int reducedAA; float minScore; std::string spaceMask; diff --git a/src/workflow/classify.cpp b/src/workflow/classify.cpp index 69786781..e88a4aa4 100644 --- a/src/workflow/classify.cpp +++ b/src/workflow/classify.cpp @@ -6,7 +6,6 @@ #include "FileUtil.h" void setClassifyDefaults(LocalParameters & par){ - par.virusTaxId = 10239;// Taxonomy ID of virus taxon in NCBI par.seqMode = 2; par.memoryMode = 1; par.reducedAA = 0; From ccb05116edaab67fef70d3d4224506fb4a163a91 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim <68528165+jaebeom-kim@users.noreply.github.com> Date: Fri, 1 Sep 2023 13:31:29 +0900 Subject: [PATCH 02/65] Update README.md --- README.md | 103 +++++++++++++++++++++++++++++------------------------- 1 file changed, 55 insertions(+), 48 deletions(-) diff --git a/README.md b/README.md index 65b00984..6765aa17 100644 --- a/README.md +++ b/README.md @@ -82,7 +82,7 @@ metabuli classify --seq-mode 1 read.fna dbdir outdir jobid --reduced-aa : 0. Use 20 alphabets or 1. Use 15 alphabets to encode amino acids. Give the same value used for DB creation. --spacing-mask : Binary patterend mask for spaced k-mer. The same mask must be used for DB creation and classification. A mask should contain at least eight '1's, and '0' means skip. - * --min-score and --min-sp-score for precision mode are optimized only for short reads. + * Values of --min-score and --min-sp-score for precision mode are optimized only for short reads. * We don't recommend using them for long reads. ``` @@ -135,79 +135,86 @@ We tested it with a MacBook Air (2020, M1, 8 GiB), where we classified about 1.5 ## Custom database To build a custom database, you need three things: -1. **FASTA files** : Each sequence of your FASTA files must be separated by '>accession.version' like '>CP001849.1' -2. **accession2taxid** : Mapping from acession to taxonomy identifier. Sequences whose accessions are not listed in this file will be skipped. -3. **NCBI-style taxonomy dump** : 'names.dmp' , 'nodes.dmp', and 'merged.dmp' are required. Sequences whose taxid are not included here will be skipped. +1. **FASTA files** : Each sequence of your FASTA files must be separated by '>accession.version' like '>CP001849.1'. +2. **accession2taxid** : Mapping from accession to taxonomy ID. 
The sequences whose accessions are not listed here will be skipped.
3. **NCBI-style taxonomy dump** : 'names.dmp', 'nodes.dmp', and 'merged.dmp' are required. The sequences whose taxonomy IDs are not included here will be skipped.

The steps for building a database with NCBI or GTDB taxonomy are described below.

### To build a database with NCBI taxonomy
#### 1. Prepare taxonomy and accession2taxid
* accession2taxid can be downloaded from https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/
* Taxonomy dump files can be downloaded from https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/

#### 2. Add to library
```
metabuli add-to-library <FASTA list> <accession2taxid> <DBDIR>
- FASTA list: A file containing absolute paths of each FASTA file.
- accession2taxid: A path to NCBI-style accession2taxid.
- DBDIR: Sequences will be stored in 'DBDIR/library'.
```
It groups your sequences into separate files according to their species.
Accessions that are not included in the `<accession2taxid>` will be skipped and listed in `unmapped.txt`.

#### 3. Build
```
metabuli build <DBDIR> <FASTA list> <accession2taxid> [options]
- DBDIR: The same DBDIR from the previous step.
- FASTA list: A file containing absolute paths of the FASTA files in DBDIR/library
- accession2taxid : A path to NCBI-style accession2taxid.

  * Options
   --threads : The number of CPU-cores used (all by default)
-  --tinfo-path : Path to prodigal training information files. (DBDIR/prodigal by default)
   --taxonomy-path: Directory where the taxonomy dump files are stored. (DBDIR/taxonomy by default)
   --reduced-aa : 0. Use 20 alphabets or 1. Use 15 alphabets to encode amino acids.
   --spacing-mask : Binary patterned mask for spaced k-mer. The same mask must be used for DB creation and classification. A mask should contain at least eight '1's, and '0' means skip.
```
This will generate **diffIdx**, **info**, **split**, and **taxID_list** and some other files. You can delete '\*\_diffIdx' and '\*\_info' if generated.

### To build a database with GTDB taxonomy
#### 1. Prepare GTDB taxonomy and accession2taxid
*Requirements*: You need assembly FASTA files whose file name (or path) includes the assembly accession.
If you downloaded assemblies using `ncbi-genome-download`, you probably don't need to worry about it.
The regular expression of assembly accessions is `(GC[AF]_[0-9].[0-9])`.

```
# 1.
In the 'util' directory
./prepare_gtdb_taxonomy.sh <DBDIR>
- DBDIR : Result files are stored in 'DBDIR/taxonomy'.
```
This will generate taxonomy dump files and `assacc_to_taxid.tsv` with other files.

```
# 2.
metabuli add-to-library <FASTA list> <accession2taxid> <DBDIR> --assembly true
- FASTA list : A file containing absolute paths of each assembly file.
  Each path must include a corresponding assembly accession.
- accession2taxid : 'assacc_to_taxid.tsv' from the previous step
- DBDIR : The same DBDIR from the previous step.
```
This will add your FASTA files to DBDIR/library according to their species taxonomy ID and generate 'my.accession2taxid'.

#### 2. Build
```
metabuli build <DBDIR> <FASTA list> <accession2taxid> [options]
- DBDIR: The same DBDIR from the previous step.
- FASTA list: A file containing absolute paths of the FASTA files in DBDIR/library
- accession2taxid : A path to NCBI-style accession2taxid.

  * Options
   --threads : The number of CPU-cores used (all by default)
   --taxonomy-path: Directory where the taxonomy dump files are stored. (DBDIR/taxonomy by default)
   --reduced-aa : 0. Use 20 alphabets or 1. Use 15 alphabets to encode amino acids.
   --spacing-mask : Binary mask for spaced metamer. The same mask must be used for DB creation and classification. A mask should contain at least eight '1's, and '0' means skip.
```
This will generate **diffIdx**, **info**, **split**, and **taxID_list** and some other files. You can delete '\*\_diffIdx' and '\*\_info' if generated.

## Example
```
Classifying RNA-seq reads from a COVID-19 patient to identify the culprit variant.
```
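Commit 06 below shows how the `--spacing-mask` option is consumed: the old `Classifier` constructor walks `par.spaceMask`, subtracts '0' (48) from each character, and records the positions of the '1's as unmasked k-mer positions. The following standalone sketch illustrates that convention only; the function name and the demo mask are hypothetical, not Metabuli code.

```
#include <iostream>
#include <string>
#include <vector>

// Collect the indices of '1' positions from a binary mask string.
// '1' keeps a position; '0' means skip, as described in the README above.
std::vector<int> unmaskedPositions(const std::string &spaceMask) {
    std::vector<int> positions;
    for (size_t i = 0; i < spaceMask.size(); i++) {
        if (spaceMask[i] == '1') {
            positions.push_back((int) i);
        }
    }
    return positions;
}

int main() {
    const std::string mask = "11101101011"; // contains eight '1's, the README's minimum
    for (int p : unmaskedPositions(mask)) {
        std::cout << p << ' ';
    }
    std::cout << '\n'; // prints: 0 1 2 4 5 7 9 10
    return 0;
}
```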
From d77f6b18ca39b5abefd7206a668a5d019c86bebc Mon Sep 17 00:00:00 2001 From: JaebeomKim0731 Date: Fri, 11 Aug 2023 15:53:37 +0900 Subject: [PATCH 03/65] support fna.gz files for add-to-library module --- src/workflow/add_to_library.cpp | 114 +++++++++++--------------------- 1 file changed, 37 insertions(+), 77 deletions(-) diff --git a/src/workflow/add_to_library.cpp b/src/workflow/add_to_library.cpp index bf4d6fa9..0d533eec 100644 --- a/src/workflow/add_to_library.cpp +++ b/src/workflow/add_to_library.cpp @@ -5,7 +5,6 @@ #include "KSeqWrapper.h" #include #include "IndexCreator.h" -#include #include "FileUtil.h" using namespace std; @@ -28,9 +27,8 @@ int addToLibrary(int argc, const char **argv, const Command &command){ if (par.taxonomyPath == "DBDIR/taxonomy/") par.taxonomyPath = dbDir + "/taxonomy/"; if (par.libraryPath == "DBDIR/library/") par.libraryPath = dbDir + "/library/"; -// string libraryPath = dbDir + "/library"; // If the library directory does not exist, create it - if (FileUtil::directoryExists(par.libraryPath.c_str()) == false) { + if (!FileUtil::directoryExists(par.libraryPath.c_str())) { FileUtil::makeDir(par.libraryPath.c_str()); } @@ -40,7 +38,6 @@ int addToLibrary(int argc, const char **argv, const Command &command){ string merged = par.taxonomyPath + "/merged.dmp"; NcbiTaxonomy ncbiTaxonomy(names, nodes, merged); - // Load file names ifstream fileListFile; fileListFile.open(fileList); @@ -72,38 +69,23 @@ int addToLibrary(int argc, const char **argv, const Command &command){ } cout << "done" << endl; - vector sequences; - vector unmapped; - // Process each file - size_t numberOfFiles = fileNames.size(); - for (size_t i = 0; i < numberOfFiles; ++i) { - sequences.clear(); - string fileName = fileNames[i]; - - // Getting start and end position of each sequence - IndexCreator::getSeqSegmentsWithHead(sequences, fileName.c_str()); - - // Mmap the file - struct MmapedData seqFile = mmapData(fileName.c_str()); - kseq_buffer_t buffer; - kseq_t *seq; - for (size_t j = 0; j < sequences.size(); ++j) { - buffer = {const_cast(&seqFile.data[sequences[j].start]), - static_cast(sequences[j].length)}; - seq = kseq_init(&buffer); - kseq_read(seq); + // Process each file + vector unmapped; + for (size_t i = 0; i < fileNames.size(); ++i) { + KSeqWrapper* kseq = KSeqFactory(fileNames[i].c_str()); + while (kseq->ReadEntry()) { + const KSeqWrapper::KSeqEntry & e = kseq->entry; // Extract accession and Remove the version number - string accession = string(seq->name.s); + string accession = string(e.name.s); size_t pos = accession.find('.'); if (pos != string::npos) { accession = accession.substr(0, pos); } // Skip if accession is not in the mapping file if (acc2taxid.find(accession) == acc2taxid.end()) { - cout << "During processing " << fileName << ", accession " << accession << + cout << "During processing " << fileNames[i] << ", accession " << accession << " is not found in the mapping file. It is skipped." << endl; - kseq_destroy(seq); unmapped.push_back(accession); continue; } @@ -111,27 +93,26 @@ int addToLibrary(int argc, const char **argv, const Command &command){ // Get species taxID int speciesTaxID = ncbiTaxonomy.getTaxIdAtRank(acc2taxid[accession], "species"); + // Skip if species taxID is not found if (speciesTaxID == 0) { - cout << "During processing " << fileName << ", accession " << accession << + cout << "During processing " << fileNames[i] << ", accession " << accession << " is not matched to any species. It is skipped." 
<< endl; - kseq_destroy(seq); continue; } - // Write to file + // Write each sequence to file with species taxID as file name FILE *file = fopen((dbDir + "/library/" + to_string(speciesTaxID) + ".fna").c_str(), "a"); - fprintf(file, ">%s %s\n", seq->name.s, seq->comment.s); - fprintf(file, "%s\n", seq->seq.s); + fprintf(file, ">%s %s\n", e.name.s, e.comment.s); + fprintf(file, "%s\n", e.sequence.s); fclose(file); - - kseq_destroy(seq); } - munmap(seqFile.data, seqFile.fileSize + 1); + delete kseq; } + // Write unmapped accession to file FILE *file = fopen((dbDir + "/unmapped.txt").c_str(), "w"); - for (size_t i = 0; i < unmapped.size(); ++i) { - fprintf(file, "%s\n", unmapped[i].c_str()); + for (const auto & i : unmapped) { + fprintf(file, "%s\n", i.c_str()); } fclose(file); } @@ -156,33 +137,20 @@ int addToLibrary(int argc, const char **argv, const Command &command){ cerr << "Cannot open the mapping from assembly accession to tax ID" << endl; } - vector sequences; + // Process each file vector unmapped; regex regex1("(GC[AF]_[0-9]*\\.[0-9]*)"); - // Process each file - size_t numberOfFiles = fileNames.size(); - for (size_t i = 0; i < numberOfFiles; ++i) { - sequences.clear(); - string fileName = fileNames[i]; - - // Getting start and end position of each sequence - IndexCreator::getSeqSegmentsWithHead(sequences, fileName.c_str()); - - // Mmap the file - struct MmapedData seqFile = mmapData(fileName.c_str()); - kseq_buffer_t buffer; - kseq_t *seq; - + for (size_t i = 0; i < fileNames.size(); ++i) { // Get assembly accession from file name using regex and remove the version number smatch match; - regex_search(fileName, match, regex1); + regex_search(fileNames[i], match, regex1); string assemblyID = match[0]; size_t pos = assemblyID.find('.'); if (pos != string::npos) { assemblyID = assemblyID.substr(0, pos); } // Skip if current assembly accession is not in the mapping file if (assembly2taxid.find(assemblyID) == assembly2taxid.end()) { - cout << "During processing " << fileName << ", accession " << assemblyID << + cout << "During processing " << fileNames[i] << ", accession " << assemblyID << " is not found in the mapping file. It is skipped." << endl; unmapped.push_back(assemblyID); continue; @@ -191,35 +159,27 @@ int addToLibrary(int argc, const char **argv, const Command &command){ // Get species taxID int speciesTaxID = ncbiTaxonomy.getTaxIdAtRank(assembly2taxid[assemblyID], "species"); if (speciesTaxID == 0) { - cout << "During processing " << fileName << ", accession " << assemblyID << + cout << "During processing " << fileNames[i] << ", accession " << assemblyID << " is not matched to any species. It is skipped." 
<< endl; continue; } - for (size_t j = 0; j < sequences.size(); ++j) { - buffer = {const_cast(&seqFile.data[sequences[j].start]), - static_cast(sequences[j].length)}; - seq = kseq_init(&buffer); - kseq_read(seq); - - // Extract accession - string accession = string(seq->name.s); - acc2taxid[accession] = assembly2taxid[assemblyID]; - + KSeqWrapper* kseq = KSeqFactory(fileNames[i].c_str()); + while (kseq->ReadEntry()){ + const KSeqWrapper::KSeqEntry & e = kseq->entry; // Write to file -// FILE *file = fopen((dbDir + "/library/" + to_string(speciesTaxID) + ".fna").c_str(), "a"); -// fprintf(file, ">%s %s\n", seq->name.s, seq->comment.s); -// fprintf(file, "%s\n", seq->seq.s); -// fclose(file); - - kseq_destroy(seq); + FILE *file = fopen((dbDir + "/library/" + to_string(speciesTaxID) + ".fna").c_str(), "a"); + fprintf(file, ">%s %s\n", e.name.s, e.comment.s); + fprintf(file, "%s\n", e.sequence.s); + fclose(file); } - munmap(seqFile.data, seqFile.fileSize + 1); + delete kseq; } + // Write unmapped accession to file FILE *file = fopen((dbDir + "/unmapped.txt").c_str(), "w"); - for (size_t i = 0; i < unmapped.size(); ++i) { - fprintf(file, "%s\n", unmapped[i].c_str()); + for (const auto & i : unmapped) { + fprintf(file, "%s\n", i.c_str()); } fclose(file); @@ -227,12 +187,12 @@ int addToLibrary(int argc, const char **argv, const Command &command){ cout << "Write mapping from accession to taxonomy ID" << endl; file = fopen((dbDir + "/my.accession2taxid").c_str(), "w"); fprintf(file, "accession\taccession.version\ttaxid\tgi"); - for (auto it = acc2taxid.begin(); it != acc2taxid.end(); ++it) { + for (auto & it : acc2taxid) { // Get accession without a version number - string accession = it->first; + string accession = it.first; size_t pos = accession.find('.'); if (pos != string::npos) { accession = accession.substr(0, pos);} - fprintf(file, "\n%s\t%s\t%d\t0", accession.c_str(), it->first.c_str(), it->second); + fprintf(file, "\n%s\t%s\t%d\t0", accession.c_str(), it.first.c_str(), it.second); } fclose(file); } From 34194627bcf18e2a9064d0b4cd517a7e14bf57bf Mon Sep 17 00:00:00 2001 From: JaebeomKim0731 Date: Fri, 11 Aug 2023 15:58:05 +0900 Subject: [PATCH 04/65] some code clean --- src/commons/IndexCreator.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp index 37a9a29e..be7c0f77 100644 --- a/src/commons/IndexCreator.cpp +++ b/src/commons/IndexCreator.cpp @@ -63,12 +63,6 @@ void IndexCreator::createIndex(const LocalParameters &par) { makeBlocksForParallelProcessing(); cout << "Made blocks for each thread" << endl; - // Train Prodigal for each species -// time_t prodigalStart = time(nullptr); -// trainProdigal(); -// time_t prodigalEnd = time(nullptr); -// cout << "Prodigal training time: " << prodigalEnd - prodigalStart << " seconds" << endl; - // Write taxonomy id list string taxidListFileName = dbDir + "/taxID_list"; FILE * taxidListFile = fopen(taxidListFileName.c_str(), "w"); From 4aef8ae79916c43ac05095417b454ab794efc187 Mon Sep 17 00:00:00 2001 From: JaebeomKim0731 Date: Tue, 15 Aug 2023 16:37:09 +0900 Subject: [PATCH 05/65] Assign a proper mode to each command --- src/metabuli.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/metabuli.cpp b/src/metabuli.cpp index e972c35f..f41040d1 100644 --- a/src/metabuli.cpp +++ b/src/metabuli.cpp @@ -48,7 +48,7 @@ std::vector commands = { " ", CITATION_SPACEPHARER, {{"Directory where the DB will be generated", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, 
&DbValidator::directory}}}, - {"updateDB", build, &localPar.build, COMMAND_MAIN, + {"updateDB", build, &localPar.build, COMMAND_DATABASE_CREATION, "Update database based on the list of FASTA files.", NULL, "Jaebeom Kim ", @@ -57,7 +57,7 @@ std::vector commands = { {{"DB directory to be updated", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::empty}, {"A list of FASTA files", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}, {"Mapping file (accession to tax ID)", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}, - {"classify", classify, &localPar.classify, COMMAND_MAIN, + {"classify", classify, &localPar.classify, COMMAND_TAXONOMY, "Assigning taxonomy label to query reads", NULL, "Jaebeom Kim ", @@ -85,7 +85,7 @@ std::vector commands = { CITATION_SPACEPHARER, {{"read-classification", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}, {"Mapping file (accession to tax ID)", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}, - {"add-to-library", addToLibrary, &localPar.addToLibrary, COMMAND_MAIN, + {"add-to-library", addToLibrary, &localPar.addToLibrary, COMMAND_DATABASE_CREATION, "It bins sequences into distinct files according to their species referring their accession number.\n " "It requires a mapping file (accession to tax ID) and NCBI style tax dump files in a taxonomy directory.", NULL, From 4bb97449474dbd1db28d6fffb541b5f1305c0e7b Mon Sep 17 00:00:00 2001 From: JaebeomKim0731 Date: Fri, 18 Aug 2023 11:21:36 +0900 Subject: [PATCH 06/65] Classifier --> QueryIndexer, KmerExtractor, KmerMatcher, Taxonomer, Reporter --- src/LocalCommandDeclarations.h | 7 +- src/commons/CMakeLists.txt | 17 +- src/commons/Classifier.cpp | 2132 +---------------- src/commons/Classifier.h | 356 +-- src/commons/IndexCreator.h | 8 +- src/commons/KmerExtractor.cpp | 216 ++ src/commons/KmerExtractor.h | 56 + src/commons/KmerMatcher.cpp | 466 ++++ src/commons/KmerMatcher.h | 196 ++ src/commons/LocalUtil.cpp | 40 + src/commons/LocalUtil.h | 23 + src/commons/Mmap.h | 4 - src/commons/QueryFilter.cpp | 52 + src/commons/QueryFilter.h | 23 + src/commons/QueryIndexer.cpp | 102 + src/commons/QueryIndexer.h | 67 + src/commons/ReducedClassifier.cpp | 11 - ...ducedClassifier.h => ReducedKmerMatcher.h} | 25 +- src/commons/Reporter.cpp | 94 + src/commons/Reporter.h | 42 + src/commons/Taxonomer.cpp | 1164 +++++++++ src/commons/Taxonomer.h | 133 + src/commons/common.h | 26 + src/metabuli.cpp | 37 +- src/workflow/CMakeLists.txt | 1 + src/workflow/classify.cpp | 11 +- src/workflow/filter.cpp | 57 + 27 files changed, 2891 insertions(+), 2475 deletions(-) create mode 100644 src/commons/KmerExtractor.cpp create mode 100644 src/commons/KmerExtractor.h create mode 100644 src/commons/KmerMatcher.cpp create mode 100644 src/commons/KmerMatcher.h create mode 100644 src/commons/LocalUtil.cpp create mode 100644 src/commons/LocalUtil.h create mode 100644 src/commons/QueryFilter.cpp create mode 100644 src/commons/QueryFilter.h create mode 100644 src/commons/QueryIndexer.cpp create mode 100644 src/commons/QueryIndexer.h delete mode 100644 src/commons/ReducedClassifier.cpp rename src/commons/{ReducedClassifier.h => ReducedKmerMatcher.h} (80%) create mode 100644 src/commons/Reporter.cpp create mode 100644 src/commons/Reporter.h create mode 100644 src/commons/Taxonomer.cpp create mode 100644 src/commons/Taxonomer.h create mode 100644 src/workflow/filter.cpp diff --git a/src/LocalCommandDeclarations.h b/src/LocalCommandDeclarations.h index 
332b5ecc..000aa648 100644 --- a/src/LocalCommandDeclarations.h +++ b/src/LocalCommandDeclarations.h @@ -1,15 +1,11 @@ -// -// Created by KJB on 25/09/2020. -// - #ifndef ADCLASSIFIER2_LOCALCOMMANDDECLARATIONS_H #define ADCLASSIFIER2_LOCALCOMMANDDECLARATIONS_H #include "Command.h" -//extern int download_databases(int argc, const char **argv, const Command& command); extern int build(int argc, const char **argv, const Command& command); extern int updataDB(int argc, const char **argv, const Command& command); extern int classify(int argc, const char **argv, const Command& command); +extern int filter(int argc, const char **argv, const Command& command); extern int grade(int argc, const char **argv, const Command& command); extern int seqHeader2TaxId(int argc, const char **argv, const Command& command); extern int addToLibrary(int argc, const char **argv, const Command& command); @@ -17,4 +13,5 @@ extern int applyThreshold(int argc, const char **argv, const Command& command); extern int binning2report(int argc, const char **argv, const Command& command); extern int filterByGenus(int argc, const char **argv, const Command& command); extern int databaseReport(int argc, const char **argv, const Command& command); + #endif //ADCLASSIFIER2_LOCALCOMMANDDECLARATIONS_H diff --git a/src/commons/CMakeLists.txt b/src/commons/CMakeLists.txt index 995f8a2b..39d5498c 100644 --- a/src/commons/CMakeLists.txt +++ b/src/commons/CMakeLists.txt @@ -18,8 +18,21 @@ set(commons_source_files commons/LocalParameters.h commons/ProdigalWrapper.h commons/ProdigalWrapper.cpp - commons/ReducedClassifier.cpp - commons/ReducedClassifier.h commons/Match.h commons/common.cpp + commons/QueryFilter.h + commons/QueryFilter.cpp + commons/LocalUtil.h + commons/LocalUtil.cpp + commons/QueryIndexer.h + commons/QueryIndexer.cpp + commons/KmerMatcher.h + commons/KmerMatcher.cpp + commons/ReducedKmerMatcher.h + commons/KmerExtractor.h + commons/KmerExtractor.cpp + commons/Taxonomer.h + commons/Taxonomer.cpp + commons/Reporter.h + commons/Reporter.cpp PARENT_SCOPE) diff --git a/src/commons/Classifier.cpp b/src/commons/Classifier.cpp index 810aa7e6..eebbc63f 100644 --- a/src/commons/Classifier.cpp +++ b/src/commons/Classifier.cpp @@ -1,199 +1,46 @@ #include "Classifier.h" #include "LocalParameters.h" -//#include "krona_prelude.html.h" #include "taxonomyreport.cpp" -#include -Classifier::Classifier(LocalParameters & par) : maskMode(par.maskMode), maskProb(par.maskProb) { +Classifier::Classifier(LocalParameters & par) { // Load parameters - if (par.seqMode == 2){ - queryPath_1 = par.filenames[0]; - queryPath_2 = par.filenames[1]; - dbDir = par.filenames[2]; - outDir = par.filenames[3]; - jobId = par.filenames[4]; - cout << "Query file 1: " << queryPath_1 << endl; - cout << "Query file 2: " << queryPath_2 << endl; - cout << "Database directory: " << dbDir << endl; - cout << "Output directory: " << outDir << endl; - cout << "Job ID: " << jobId << endl; - } else { - queryPath_1 = par.filenames[0]; - dbDir = par.filenames[1]; - outDir = par.filenames[2]; - jobId = par.filenames[3]; - cout << "Query file: " << queryPath_1 << endl; - cout << "Database directory: " << dbDir << endl; - cout << "Output directory: " << outDir << endl; - cout << "Job ID: " << jobId << endl; - } - if (par.taxonomyPath == "DBDIR/taxonomy/") par.taxonomyPath = dbDir + "/taxonomy/"; - - MARKER = 16777215; - MARKER = ~ MARKER; - bitsForCodon = 3; - numOfSplit = 0; - minCoveredPos = par.minCoveredPos; - minSpScore = par.minSpScore; - verbosity = par.verbosity; - maxGap = 
par.maxGap; - - // Mask for spaced k-mer - size_t maskLen = par.spaceMask.length(); - mask = new uint32_t[maskLen]; - spaceNum = 0; - spaceNum_int = 0; - for(size_t i = 0, j = 0; i < maskLen; i++){ - mask[i] = par.spaceMask[i] - 48; - spaceNum += (mask[i] == 0); - spaceNum_int += (mask[i] == 0); - if(mask[i]==1){ - unmaskedPos[j] = (int) i; - j++; - } - } - - // Hamming Dist. margin - hammingMargin = (uint8_t) par.hammingMargin; + dbDir = par.filenames[1 + (par.seqMode == 2)]; + matchPerKmer = par.matchPerKmer; // Taxonomy - const string names = par.taxonomyPath + "/names.dmp"; - const string nodes = par.taxonomyPath + "/nodes.dmp"; - const string merged = par.taxonomyPath + "/merged.dmp"; - taxonomy = new NcbiTaxonomy(names, nodes, merged); - - // Taxonomy ID list - // Load the taxonomical ID list - FILE * taxIdFile; - if((taxIdFile = fopen((dbDir + "/taxID_list").c_str(),"r")) == NULL){ - cout<<"Cannot open the taxID list file."<taxonNode(taxId); - if (taxId == taxon->taxId) { - TaxID speciesTaxID = taxonomy->getTaxIdAtRank(taxId, "species"); - TaxID genusTaxID = taxonomy->getTaxIdAtRank(taxId, "genus"); - while (taxon->taxId != speciesTaxID) { - taxId2speciesId[taxon->taxId] = speciesTaxID; - taxId2genusId[taxon->taxId] = genusTaxID; - taxon = taxonomy->taxonNode(taxon->parentTaxId); - } - taxId2speciesId[speciesTaxID] = speciesTaxID; - taxId2genusId[speciesTaxID] = genusTaxID; - } else { - TaxID speciesTaxID = taxonomy->getTaxIdAtRank(taxId, "species"); - TaxID genusTaxID = taxonomy->getTaxIdAtRank(taxId, "genus"); - while (taxon->taxId != speciesTaxID) { - taxId2speciesId[taxon->taxId] = speciesTaxID; - taxId2genusId[taxon->taxId] = genusTaxID; - taxon = taxonomy->taxonNode(taxon->parentTaxId); - } - taxId2speciesId[speciesTaxID] = speciesTaxID; - taxId2genusId[speciesTaxID] = genusTaxID; - taxId2speciesId[taxId] = speciesTaxID; - taxId2genusId[taxId] = genusTaxID; - } + if (par.taxonomyPath == "DBDIR/taxonomy/") par.taxonomyPath = dbDir + "/taxonomy/"; + taxonomy = new NcbiTaxonomy(par.taxonomyPath + "/names.dmp", + par.taxonomyPath + "/nodes.dmp", + par.taxonomyPath + "/merged.dmp"); + + // Agents + queryIndexer = new QueryIndexer(par); + kmerExtractor = new KmerExtractor(par); + if (par.reducedAA) { + kmerMatcher = new ReducedKmerMatcher(par, taxonomy); + } else { + kmerMatcher = new KmerMatcher(par, taxonomy); } - fclose(taxIdFile); - - subMat = new NucleotideMatrix(par.scoringMatrixFile.values.nucleotide().c_str(), 1.0, 0.0); - probMatrix = new ProbabilityMatrix(*(subMat)); -// localIndexBufferSize = 16 * 1024 * 1024; -// localMatchBufferSize = 2 * 1024 * 1024; + taxonomer = new Taxonomer(par, taxonomy); + reporter = new Reporter(par, taxonomy); } Classifier::~Classifier() { - delete[] mask; - delete subMat; - delete probMatrix; delete taxonomy; -} - -static inline bool compareForLinearSearch(const QueryKmer &a, const QueryKmer &b) { - if (a.ADkmer < b.ADkmer) { - return true; - } else if (a.ADkmer == b.ADkmer) { - return (a.info.sequenceID < b.info.sequenceID); - } - return false; + delete queryIndexer; + delete kmerExtractor; + delete kmerMatcher; + delete taxonomer; + delete reporter; } void Classifier::startClassify(const LocalParameters &par) { - // Calculate maximum number of k-mers for each iteration. 
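// Illustrative arithmetic for the RAM budget computed below (the numbers are
// hypothetical, not defaults): each query k-mer costs
// sizeof(QueryKmer) + matchPerKmer * sizeof(Match) bytes, and every thread
// sets aside 134217728 bytes (128 MiB) of working memory. With --max-ram 32
// and 16 threads, the usable pool is 32 GiB - 16 * 128 MiB = 30 GiB, and
// reads are packed into one split until c * kmerCnt + 200 * seqCnt (about
// 200 bytes of bookkeeping per read) would exceed that pool.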
- size_t matchPerKmer = par.matchPerKmer; - size_t c = sizeof(QueryKmer) + matchPerKmer * sizeof(Match); - size_t ram_threads = ((size_t) par.ramUsage * (size_t) 1024 * 1024 * 1024) - - ((size_t) 134217728 * (size_t) par.threads); - - - // Load query file cout << "Indexing query file ..."; - vector sequences_read1; - vector sequences_read2; - size_t numOfSeq = 0; - size_t start = 0; - size_t kmerCnt = 0; - size_t currentKmerCnt = 0; - size_t seqCnt = 0; - vector splitKmerCnt; - vector> queryReadSplit; - size_t totalReadLength = 0; - if (par.seqMode == 1 || par.seqMode == 3) { - splitQueryFile(sequences_read1, queryPath_1); - - // Make query read splits - numOfSeq = sequences_read1.size(); - for (size_t i = 0; i < numOfSeq; i++) { - currentKmerCnt = getQueryKmerNumber(sequences_read1[i].seqLength); - kmerCnt += currentKmerCnt; - seqCnt++; - if (c * kmerCnt + ((size_t) 200 * seqCnt) > ram_threads) { - splitKmerCnt.push_back(kmerCnt - currentKmerCnt); - queryReadSplit.emplace_back(start, i); - kmerCnt = currentKmerCnt; - start = i; - seqCnt = 1; - } - totalReadLength += sequences_read1[i].seqLength; - } - queryReadSplit.emplace_back(start, numOfSeq); - splitKmerCnt.push_back(kmerCnt); - } else { - splitQueryFile(sequences_read1, queryPath_1); - splitQueryFile(sequences_read2, queryPath_2); - - // Check if the number of reads in the two files are equal - if (sequences_read1.size() != sequences_read2.size()) { - Debug(Debug::ERROR) << "The number of reads in the two files are not equal." << "\n"; - EXIT(EXIT_FAILURE); - } - - // Make query read splits - numOfSeq = sequences_read1.size(); - for (size_t i = 0; i < numOfSeq; i++) { - totalReadLength += sequences_read1[i].seqLength + sequences_read2[i].seqLength; - currentKmerCnt = getQueryKmerNumber(sequences_read1[i].seqLength) + - getQueryKmerNumber(sequences_read2[i].seqLength); - kmerCnt += currentKmerCnt; - seqCnt ++; - if (c * kmerCnt + ((size_t) 200 * seqCnt) > ram_threads) { - splitKmerCnt.push_back(kmerCnt - currentKmerCnt); - queryReadSplit.emplace_back(start, i); - kmerCnt = currentKmerCnt; - start = i; - seqCnt = 1; - } - } - queryReadSplit.emplace_back(start, numOfSeq); - splitKmerCnt.push_back(kmerCnt); - } - + queryIndexer->indexQueryFile(); + size_t numOfSeq = queryIndexer->getReadNum_1(); + size_t totalReadLength = queryIndexer->getTotalReadLength(); + const vector & queryReadSplit = queryIndexer->getQuerySplits(); cout << "Done" << endl; cout << "Total number of sequences: " << numOfSeq << endl; cout << "Total read length: " << totalReadLength << "nt" << endl; @@ -206,63 +53,50 @@ void Classifier::startClassify(const LocalParameters &par) { size_t totalMatchCnt = 0; size_t processedSeqCnt = 0; - ofstream readClassificationFile; - readClassificationFile.open(outDir + "/" + jobId + "_classifications.tsv"); + reporter->openReadClassificationFile(); #ifdef OPENMP omp_set_num_threads(par.threads); #endif // Extract k-mers from query sequences and compare them to target k-mer DB - double vm, rss; KSeqWrapper* kseq1 = KSeqFactory(par.filenames[0].c_str()); KSeqWrapper* kseq2 = nullptr; if (par.seqMode == 2) { kseq2 = KSeqFactory(par.filenames[1].c_str()); } +// while (true) { +// bool success = false; +// while (!success) { +// +// } +// if (complete) { +// break; +// } +// } for (size_t splitIdx = 0; splitIdx < queryReadSplit.size(); splitIdx++) { // Allocate memory for query list queryList.clear(); - queryList.resize(queryReadSplit[splitIdx].second - queryReadSplit[splitIdx].first); + queryList.resize(queryReadSplit[splitIdx].end - 
queryReadSplit[splitIdx].start); // Allocate memory for query k-mer list and match list - kmerBuffer.reallocateMemory(splitKmerCnt[splitIdx]); - if (splitKmerCnt.size() == 1) { - size_t remain = ram_threads - splitKmerCnt[splitIdx] * sizeof(QueryKmer) - numOfSeq * 200; - matchPerKmer = remain / (sizeof(Match) * splitKmerCnt[splitIdx]); - matchBuffer.reallocateMemory(splitKmerCnt[splitIdx] * matchPerKmer); + kmerBuffer.reallocateMemory(queryReadSplit[splitIdx].kmerCnt); + if (queryReadSplit.size() == 1) { + size_t remain = queryIndexer->getAvailableRam() - queryReadSplit[splitIdx].kmerCnt * sizeof(QueryKmer) - numOfSeq * 200; + matchBuffer.reallocateMemory(remain / sizeof(Match)); } else { - matchBuffer.reallocateMemory(splitKmerCnt[splitIdx] * matchPerKmer); + matchBuffer.reallocateMemory(queryReadSplit[splitIdx].kmerCnt * matchPerKmer); } - // Initialize query k-mer buffer and match buffer kmerBuffer.startIndexOfReserve = 0; matchBuffer.startIndexOfReserve = 0; // Extract query k-mer - time_t beforeKmerExtraction = time(nullptr); - cout << "Extracting query metamers ... " << endl; - if (par.seqMode == 1 || par.seqMode == 3) { // Single-end short-read sequence or long-read sequence - fillQueryKmerBufferParallel(kseq1, - kmerBuffer, - queryList, - queryReadSplit[splitIdx], - par); - } else if (par.seqMode == 2) { - fillQueryKmerBufferParallel_paired(kseq1, - kseq2, - kmerBuffer, - queryList, - queryReadSplit[splitIdx], - par); - } - + kmerExtractor->extractQueryKmers(kmerBuffer, + queryList, + queryReadSplit[splitIdx], + par, + kseq1, + kseq2); numOfTatalQueryKmerCnt += kmerBuffer.startIndexOfReserve; - cout << "Time spent for metamer extraction: " << double(time(nullptr) - beforeKmerExtraction) << endl; - - // Sort query k-mer - time_t beforeQueryKmerSort = time(nullptr); - cout << "Sorting query metamer list ..." << endl; - SORT_PARALLEL(kmerBuffer.buffer, kmerBuffer.buffer + kmerBuffer.startIndexOfReserve, compareForLinearSearch); - cout << "Time spent for sorting query metamer list: " << double(time(nullptr) - beforeQueryKmerSort) << endl; //#ifdef OPENMP // if (par.printLog == 1) { @@ -271,26 +105,9 @@ void Classifier::startClassify(const LocalParameters &par) { // omp_set_num_threads(par.threads); // } //#endif - // Search matches between query and target k-mers - linearSearchParallel(kmerBuffer.buffer, kmerBuffer.startIndexOfReserve, matchBuffer, par); - -#ifdef OPENMP - omp_set_num_threads(par.threads); -#endif - // Sort matches - time_t beforeSortMatches = time(nullptr); - totalMatchCnt += matchBuffer.startIndexOfReserve; - cout << "Sorting matches ..." 
<< endl; - SORT_PARALLEL(matchBuffer.buffer, matchBuffer.buffer + matchBuffer.startIndexOfReserve, - sortMatch()); - cout << "Time spent for sorting matches: " << double(time(nullptr) - beforeSortMatches) << endl; -// for (size_t i = 0; i < matchBuffer.startIndexOfReserve; i++) { -// cout << matchBuffer.buffer[i].queryId << " " << matchBuffer.buffer[i].splitIdx << " " << -// matchBuffer.buffer[i].targetSplitIdx << " " << matchBuffer.buffer[i].targetId << " " << -// genusTaxIdList[matchBuffer.buffer[i].targetId] << " " << speciesTaxIdList[matchBuffer.buffer[i].targetId] << " " -// << matchBuffer.buffer[i].position << " " << (int) matchBuffer.buffer[i].hamming << " " << taxIdList[matchBuffer.buffer[i].targetId] << endl; -// } + // Search matches between query and target k-mers + kmerMatcher->matchKmers(&kmerBuffer, &matchBuffer); //#ifdef OPENMP @@ -302,25 +119,27 @@ void Classifier::startClassify(const LocalParameters &par) { //#endif // Classify queries based on the matches - time_t beforeAnalyze = time(nullptr); - cout << "Analyzing matches ..." << endl; - fromMatchToClassification(matchBuffer.buffer, matchBuffer.startIndexOfReserve, queryList, par); - cout << "Time spent for analyzing: " << double(time(nullptr) - beforeAnalyze) << endl; - processedSeqCnt += queryReadSplit[splitIdx].second - queryReadSplit[splitIdx].first; + taxonomer->assignTaxonomy(matchBuffer.buffer, matchBuffer.startIndexOfReserve, queryList, par); + processedSeqCnt += queryReadSplit[splitIdx].end - queryReadSplit[splitIdx].start; cout << "The number of processed sequences: " << processedSeqCnt << " (" << (double) processedSeqCnt / (double) numOfSeq << ")" << endl; +// for (size_t i = 0; i < matchBuffer.startIndexOfReserve; i++) { +// cout << matchBuffer.buffer[i].queryId << " " << matchBuffer.buffer[i].splitIdx << " " << +// matchBuffer.buffer[i].targetSplitIdx << " " << matchBuffer.buffer[i].targetId << " " << +// genusTaxIdList[matchBuffer.buffer[i].targetId] << " " << speciesTaxIdList[matchBuffer.buffer[i].targetId] << " " +// << matchBuffer.buffer[i].position << " " << (int) matchBuffer.buffer[i].hamming << " " << taxIdList[matchBuffer.buffer[i].targetId] << endl; +// } + // Write classification results - writeReadClassification(queryList, - (int) (queryReadSplit[splitIdx].second - queryReadSplit[splitIdx].first), - readClassificationFile); + reporter->writeReadClassification(queryList); } cout << "Number of query k-mers: " << numOfTatalQueryKmerCnt << endl; cout << "The number of matches: " << totalMatchCnt << endl; - readClassificationFile.close(); + reporter->closeReadClassificationFile(); // Write report files - writeReportFile(outDir, numOfSeq, taxCounts); + reporter->writeReportFile(numOfSeq, taxonomer->getTaxCounts()); // Memory deallocation free(matchBuffer.buffer); @@ -328,1830 +147,3 @@ void Classifier::startClassify(const LocalParameters &par) { delete kseq2; } - -void Classifier::fillQueryKmerBufferParallel(KSeqWrapper* kseq1, - QueryKmerBuffer &kmerBuffer, - vector & queryList, - const pair & currentSplit, - const LocalParameters &par) { - size_t queryNum = currentSplit.second - currentSplit.first; - size_t processedQueryNum = 0; - - // Array to store reads of thread number - vector reads1(par.threads); - - while (processedQueryNum < queryNum) { - size_t currentQueryNum = min(queryNum - processedQueryNum, (size_t) par.threads); - size_t count = 0; - while (count < currentQueryNum) { - // Read query - kseq1->ReadEntry(); - const KSeqWrapper::KSeqEntry & e1 = kseq1->entry; - - // Get k-mer count - int 
kmerCnt = getQueryKmerNumber((int) e1.sequence.l); - - // Query Info - queryList[processedQueryNum].queryLength = getMaxCoveredLength((int) e1.sequence.l); - queryList[processedQueryNum].name = string(e1.name.s); - queryList[processedQueryNum].kmerCnt = (int) (kmerCnt); - - // Store reads - reads1[count] = string(kseq1->entry.sequence.s); - - processedQueryNum ++; - count ++; - } -#pragma omp parallel default(none), shared(par, kmerBuffer, cout, processedQueryNum, queryList, currentQueryNum, currentSplit, count, reads1) - { - SeqIterator seqIterator(par); - size_t posToWrite; -#pragma omp for schedule(dynamic, 1) - for (size_t i = 0; i < currentQueryNum; i ++) { - size_t queryIdx = processedQueryNum - currentQueryNum + i; - // Get k-mer count - auto kmerCnt = getQueryKmerNumber(reads1[i].length()); - - // Ignore short read - if (kmerCnt < 1) { continue; } - - // Get masked sequence - char *maskedSeq1 = nullptr; - if (maskMode) { - maskedSeq1 = new char[reads1[i].length() + 1]; - SeqIterator::maskLowComplexityRegions(reads1[i].c_str(),maskedSeq1, *probMatrix, maskProb, subMat); - } else { - maskedSeq1 = const_cast(reads1[i].c_str()); - } - - posToWrite = kmerBuffer.reserveMemory(kmerCnt); - - // Process Read 1 - seqIterator.sixFrameTranslation(maskedSeq1, (int) reads1[i].length()); - seqIterator.fillQueryKmerBuffer(maskedSeq1, (int) reads1[i].length(), kmerBuffer, posToWrite, - (uint32_t) queryIdx); - - if (maskMode) { - delete[] maskedSeq1; - } - } - } - } -} - - - -int Classifier::getMaxCoveredLength(int queryLength) { - if (queryLength % 3 == 2) { - return queryLength - 2; // 2 - } else if (queryLength % 3 == 1) { - return queryLength - 4; // 4 - } else { - return queryLength - 3; // 3 - } -} - -template -T Classifier::getQueryKmerNumber(T queryLength) { - return (getMaxCoveredLength(queryLength) / 3 - kmerLength - spaceNum_int + 1) * 6; -} - -void Classifier::fillQueryKmerBufferParallel_paired(KSeqWrapper* kseq1, - KSeqWrapper* kseq2, - QueryKmerBuffer &kmerBuffer, - vector & queryList, - const pair & currentSplit, - const LocalParameters &par) { - size_t queryNum = currentSplit.second - currentSplit.first; - size_t processedQueryNum = 0; - - // Array to store reads of thread number - vector reads1(par.threads); - vector reads2(par.threads); - - while (processedQueryNum < queryNum) { - size_t currentQueryNum = min(queryNum - processedQueryNum, (size_t) par.threads); - size_t count = 0; - - // Fill reads in sequential - while (count < currentQueryNum) { - // Read query - kseq1->ReadEntry(); - kseq2->ReadEntry(); - const KSeqWrapper::KSeqEntry & e1 = kseq1->entry; - const KSeqWrapper::KSeqEntry & e2 = kseq2->entry; - - // Get k-mer count - int kmerCnt = getQueryKmerNumber((int) e1.sequence.l); - int kmerCnt2 = getQueryKmerNumber((int) e2.sequence.l); - - // Query Info - queryList[processedQueryNum].queryLength = getMaxCoveredLength((int) e1.sequence.l); - queryList[processedQueryNum].queryLength2 = getMaxCoveredLength((int) e2.sequence.l); - queryList[processedQueryNum].name = string(e1.name.s); - queryList[processedQueryNum].kmerCnt = (int) (kmerCnt + kmerCnt2); - - // Store reads - reads1[count] = string(kseq1->entry.sequence.s); - reads2[count] = string(kseq2->entry.sequence.s); - - processedQueryNum ++; - count ++; - } - - // Process reads in parallel -#pragma omp parallel default(none), shared(par, kmerBuffer, cout, processedQueryNum, queryList, currentQueryNum, currentSplit, count, reads1, reads2) - { - SeqIterator seqIterator(par); - SeqIterator seqIterator2(par); - size_t 
posToWrite; -#pragma omp for schedule(dynamic, 1) - for (size_t i = 0; i < currentQueryNum; i ++) { - size_t queryIdx = processedQueryNum - currentQueryNum + i; - // Get k-mer count - auto kmerCnt = getQueryKmerNumber(reads1[i].length()); - auto kmerCnt2 = getQueryKmerNumber(reads2[i].length()); - - // Ignore short read - if (kmerCnt2 < 1 || kmerCnt < 1) { continue; } - - // Get masked sequence - char *maskedSeq1 = nullptr; - char *maskedSeq2 = nullptr; - if (maskMode) { - maskedSeq1 = new char[reads1[i].length() + 1]; - maskedSeq2 = new char[reads2[i].length() + 1]; - SeqIterator::maskLowComplexityRegions(reads1[i].c_str(),maskedSeq1, *probMatrix, maskProb, subMat); - SeqIterator::maskLowComplexityRegions(reads2[i].c_str(),maskedSeq2, *probMatrix, maskProb, subMat); - } else { - maskedSeq1 = const_cast(reads1[i].c_str()); - maskedSeq2 = const_cast(reads2[i].c_str()); - } - - posToWrite = kmerBuffer.reserveMemory(kmerCnt + kmerCnt2); - - // Process Read 1 - seqIterator.sixFrameTranslation(maskedSeq1, (int) reads1[i].length()); - seqIterator.fillQueryKmerBuffer(maskedSeq1, (int) reads1[i].length(), kmerBuffer, posToWrite, - (uint32_t) queryIdx); - - // Process Read 2 - seqIterator2.sixFrameTranslation(maskedSeq2, (int) reads2[i].length()); - seqIterator2.fillQueryKmerBuffer(maskedSeq2, (int) reads2[i].length(), kmerBuffer, posToWrite, - (uint32_t) queryIdx, queryList[queryIdx].queryLength); - - if (maskMode) { - delete[] maskedSeq1; - delete[] maskedSeq2; - } - } - } - } -} - -void Classifier::linearSearchParallel(QueryKmer *queryKmerList, size_t &queryKmerCnt, - Buffer &matchBuffer, const LocalParameters &par) { - int threadNum = par.threads; - string targetDiffIdxFileName = dbDir + "/diffIdx"; - string targetInfoFileName = dbDir + "/info"; - string diffIdxSplitFileName = dbDir + "/split";; - - struct stat diffIdxFileSt{}; - stat(targetDiffIdxFileName.c_str(), &diffIdxFileSt); - size_t numOfDiffIdx = diffIdxFileSt.st_size / sizeof(uint16_t); - - struct MmapedData diffIdxSplits = mmapData(diffIdxSplitFileName.c_str(), 3); - - cout << "Comparing query and reference metamers..." << endl; - - // Find the first index of garbage query k-mer (UINT64_MAX) and discard from there - for (size_t checkN = queryKmerCnt - 1; checkN > 0; checkN--) { - if (queryKmerList[checkN].ADkmer != UINT64_MAX) { - queryKmerCnt = checkN + 1; - break; - } - } - - // Filter out meaningless target splits - size_t numOfDiffIdxSplits = diffIdxSplits.fileSize / sizeof(DiffIdxSplit); - size_t numOfDiffIdxSplits_use = numOfDiffIdxSplits; - for (size_t i = 1; i < numOfDiffIdxSplits; i++) { - if (diffIdxSplits.data[i].ADkmer == 0 || diffIdxSplits.data[i].ADkmer == UINT64_MAX) { - diffIdxSplits.data[i] = {UINT64_MAX, UINT64_MAX, UINT64_MAX}; - numOfDiffIdxSplits_use--; - } - } - - // Divide query k-mer list into blocks for multi threading. 
- // Each split has start and end points of query list + proper offset point of target k-mer list - vector querySplits; - uint64_t queryAA; - vector targetSplitIdxs; - if (threadNum == 1) { //Single thread - querySplits.emplace_back(0, queryKmerCnt - 1, queryKmerCnt, diffIdxSplits.data[0]); - } else if (threadNum == 2) { //Two threads - size_t splitWidth = queryKmerCnt / 2; - querySplits.emplace_back(0, splitWidth - 1, splitWidth, diffIdxSplits.data[0]); - for (size_t tSplitCnt = 0; tSplitCnt < numOfDiffIdxSplits_use; tSplitCnt++) { - queryAA = AminoAcidPart(queryKmerList[splitWidth].ADkmer); - if (queryAA <= AminoAcidPart(diffIdxSplits.data[tSplitCnt].ADkmer)) { - tSplitCnt = tSplitCnt - (tSplitCnt != 0); - querySplits.emplace_back(splitWidth, queryKmerCnt - 1, queryKmerCnt - splitWidth, - diffIdxSplits.data[tSplitCnt]); - break; - } - } - } else { //More than two threads - // Devide query k-mers into blocks - size_t splitWidth = queryKmerCnt / (threadNum - 1); - querySplits.emplace_back(0, splitWidth - 1, splitWidth, diffIdxSplits.data[0]); - for (int i = 1; i < threadNum; i++) { - queryAA = AminoAcidPart(queryKmerList[splitWidth * i].ADkmer); - bool needLastTargetBlock = true; - for (size_t j = 0; j < numOfDiffIdxSplits_use; j++) { - if (queryAA <= AminoAcidPart(diffIdxSplits.data[j].ADkmer)) { - j = j - (j != 0); - if (i != threadNum - 1) { - querySplits.emplace_back(splitWidth * i, splitWidth * (i + 1) - 1, splitWidth, - diffIdxSplits.data[j]); - } else { - querySplits.emplace_back(splitWidth * i, queryKmerCnt - 1, queryKmerCnt - splitWidth * i, - diffIdxSplits.data[j]); - } - targetSplitIdxs.emplace_back(j); - needLastTargetBlock = false; - break; - } - } - if (needLastTargetBlock) { - if (i != threadNum - 1) { // If it is not the last split - querySplits.emplace_back(splitWidth * i, splitWidth * (i + 1) - 1, splitWidth, - diffIdxSplits.data[numOfDiffIdxSplits_use - 2]); - targetSplitIdxs.emplace_back(numOfDiffIdxSplits_use - 2); - } else { - querySplits.emplace_back(splitWidth * i, queryKmerCnt - 1, queryKmerCnt - splitWidth * i, - diffIdxSplits.data[numOfDiffIdxSplits_use - 2]); - targetSplitIdxs.emplace_back(numOfDiffIdxSplits_use - 2); - } - } - } - } - - bool *splitCheckList = (bool *) malloc(sizeof(bool) * threadNum); - fill_n(splitCheckList, threadNum, false); - int completedSplitCnt = 0; - - time_t beforeSearch = time(nullptr); - - while (completedSplitCnt < threadNum) { - bool hasOverflow = false; -#pragma omp parallel default(none), shared(completedSplitCnt, splitCheckList, hasOverflow, \ -querySplits, queryKmerList, matchBuffer, cout, par, targetDiffIdxFileName, numOfDiffIdx, targetInfoFileName, targetSplitIdxs) - { - // FILE - FILE * diffIdxFp = fopen(targetDiffIdxFileName.c_str(), "rb"); - FILE * kmerInfoFp = fopen(targetInfoFileName.c_str(), "rb"); - - // Target K-mer buffer - uint16_t * diffIdxBuffer = (uint16_t *) malloc(sizeof(uint16_t) * (BufferSize + 1)); // size = 32 Mb - TargetKmerInfo * kmerInfoBuffer = (TargetKmerInfo *) malloc(sizeof(TargetKmerInfo) * (BufferSize+1)); // 64 Mb - size_t kmerInfoBufferIdx = 0; - size_t diffIdxBufferIdx = 0; - - //query variables - uint64_t currentQuery = UINT64_MAX; - uint64_t currentQueryAA = UINT64_MAX; - QueryKmerInfo currentQueryInfo; - - //target variables - size_t diffIdxPos = 0; - vector candidateTargetKmers; //vector for candidate target k-mer, some of which are selected after based on hamming distance - vector candidateKmerInfos; - uint64_t currentTargetKmer; - - //Match buffer for each thread - int localBufferSize = 
2'000'000; // 32 Mb - auto *matches = new Match[localBufferSize]; // 16 * 2'000'000 = 32 Mb - int matchCnt = 0; - - // For debug - SeqIterator seqIterator(par); - - //vectors for selected target k-mers - vector selectedHammingSum; - vector selectedMatches; - vector selectedHammings; - size_t posToWrite; - - int currMatchNum; - size_t idx; -#pragma omp for schedule(dynamic, 1) - for (size_t i = 0; i < querySplits.size(); i++) { - if (hasOverflow || splitCheckList[i]) { - continue; - } - - currentTargetKmer = querySplits[i].diffIdxSplit.ADkmer; - diffIdxBufferIdx = querySplits[i].diffIdxSplit.diffIdxOffset; - kmerInfoBufferIdx = querySplits[i].diffIdxSplit.infoIdxOffset - - (querySplits[i].diffIdxSplit.ADkmer != 0); - diffIdxPos = querySplits[i].diffIdxSplit.diffIdxOffset; - - fseek(kmerInfoFp, 4 * (long)(kmerInfoBufferIdx), SEEK_SET); - loadBuffer(kmerInfoFp, kmerInfoBuffer, kmerInfoBufferIdx, BufferSize); - fseek(diffIdxFp, 2 * (long) (diffIdxBufferIdx), SEEK_SET); - loadBuffer(diffIdxFp, diffIdxBuffer, diffIdxBufferIdx, BufferSize); - - if (i == 0) { - currentTargetKmer = getNextTargetKmer(currentTargetKmer, diffIdxBuffer, - diffIdxBufferIdx, diffIdxPos); - } - currentQuery = UINT64_MAX; - currentQueryAA = UINT64_MAX; - - size_t lastMovedQueryIdx = 0; - for (size_t j = querySplits[i].start; j < querySplits[i].end + 1; j++) { - querySplits[i].start++; - - // Reuse the comparison data if queries are exactly identical - if (currentQuery == queryKmerList[j].ADkmer - && (currentQueryInfo.frame/3 == queryKmerList[j].info.frame/3)) { - currMatchNum = selectedMatches.size(); - // If local buffer is full, copy them to the shared buffer. - if (matchCnt + currMatchNum > localBufferSize) { - // Check if the shared buffer is full. - posToWrite = matchBuffer.reserveMemory(matchCnt); - if (posToWrite + matchCnt >= matchBuffer.bufferSize) { - hasOverflow = true; - querySplits[i].start = lastMovedQueryIdx + 1; - __sync_fetch_and_sub(& matchBuffer.startIndexOfReserve, matchCnt); - break; - } else { // not full -> copy matches to the shared buffer - moveMatches(matchBuffer.buffer + posToWrite, matches, matchCnt); - lastMovedQueryIdx = j; - } - } - for (int k = 0; k < currMatchNum; k++) { - idx = selectedMatches[k]; - matches[matchCnt] = {queryKmerList[j].info, - candidateKmerInfos[idx].sequenceID, - taxId2genusId[candidateKmerInfos[idx].sequenceID], - taxId2speciesId[candidateKmerInfos[idx].sequenceID], - selectedHammings[k], - selectedHammingSum[k], - (bool) candidateKmerInfos[idx].redundancy}; - matchCnt++; - } - continue; - } - selectedMatches.clear(); - selectedHammingSum.clear(); - selectedHammings.clear(); - - // Reuse the candidate target k-mers to compare in DNA level if queries are the same at amino acid level but not at DNA level - if (currentQueryAA == AminoAcidPart(queryKmerList[j].ADkmer)) { - compareDna(queryKmerList[j].ADkmer, candidateTargetKmers, selectedMatches, - selectedHammingSum, selectedHammings,queryKmerList[j].info.frame); - currMatchNum = selectedMatches.size(); - - // If local buffer is full, copy them to the shared buffer. - if (matchCnt + currMatchNum > localBufferSize) { - // Check if the shared buffer is full. 
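// reserveMemory() claims matchCnt slots by atomically advancing
// startIndexOfReserve. If the claimed region would run past bufferSize,
// this thread sets hasOverflow, rewinds its split to the last query whose
// matches were already copied out (lastMovedQueryIdx + 1), and returns the
// unused reservation via __sync_fetch_and_sub so the remaining queries can
// be retried in the next pass of the enclosing while loop.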
- posToWrite = matchBuffer.reserveMemory(matchCnt); - if (posToWrite + matchCnt >= matchBuffer.bufferSize) { - hasOverflow = true; - querySplits[i].start = lastMovedQueryIdx + 1; - __sync_fetch_and_sub(& matchBuffer.startIndexOfReserve, matchCnt); - break; - } else { // not full -> copy matches to the shared buffer - moveMatches(matchBuffer.buffer + posToWrite, matches, matchCnt); - lastMovedQueryIdx = j; - } - } - for (int k = 0; k < currMatchNum; k++) { - idx = selectedMatches[k]; - matches[matchCnt] = {queryKmerList[j].info, - candidateKmerInfos[idx].sequenceID, - taxId2genusId[candidateKmerInfos[idx].sequenceID], - taxId2speciesId[candidateKmerInfos[idx].sequenceID], - selectedHammings[k], - selectedHammingSum[k], - (bool) candidateKmerInfos[idx].redundancy}; - matchCnt++; - } - currentQuery = queryKmerList[j].ADkmer; - currentQueryAA = AminoAcidPart(currentQuery); - currentQueryInfo = queryKmerList[j].info; - continue; - } - candidateTargetKmers.clear(); - candidateKmerInfos.clear(); - - // Get next query, and start to find - currentQuery = queryKmerList[j].ADkmer; - currentQueryAA = AminoAcidPart(currentQuery); - currentQueryInfo = queryKmerList[j].info; - - // Skip target k-mers that are not matched in amino acid level - while (diffIdxPos != numOfDiffIdx - && (currentQueryAA > AminoAcidPart(currentTargetKmer))) { - if (unlikely(BufferSize < diffIdxBufferIdx + 7)){ - loadBuffer(diffIdxFp, diffIdxBuffer, diffIdxBufferIdx, BufferSize, ((int)(BufferSize - diffIdxBufferIdx)) * -1 ); - } - currentTargetKmer = getNextTargetKmer(currentTargetKmer, diffIdxBuffer, - diffIdxBufferIdx, diffIdxPos); - kmerInfoBufferIdx ++; - } - - if (currentQueryAA != AminoAcidPart(currentTargetKmer)) // Move to next query k-mer if there isn't any match. - continue; - - // Load target k-mers that are matched in amino acid level - while (diffIdxPos != numOfDiffIdx && - currentQueryAA == AminoAcidPart(currentTargetKmer)) { - candidateTargetKmers.push_back(currentTargetKmer); - candidateKmerInfos.push_back(getKmerInfo(BufferSize, kmerInfoFp, kmerInfoBuffer, kmerInfoBufferIdx)); - // Print the target k-mer -// if (par.printLog == 1) { -// cout << queryKmerList[j].info.sequenceID << "\t" << queryKmerList[j].info.pos << "\t" -// << (int) queryKmerList[j].info.frame << endl; -// cout << "Query k-mer: "; -// print_binary64(64, currentQuery); -// cout << "\t"; -// seqIterator.printKmerInDNAsequence(currentQuery); -// cout << endl; -// cout << "Target k-mer: "; -// print_binary64(64, currentTargetKmer); -// cout << "\t"; -// seqIterator.printKmerInDNAsequence(currentTargetKmer); -// cout << "\t" << kmerInfoBuffer[kmerInfoBufferIdx].sequenceID -// << "\t" << taxId2speciesId[kmerInfoBuffer[kmerInfoBufferIdx].sequenceID] << endl; -// cout << (int) getHammingDistanceSum(currentQuery, currentTargetKmer) << "\t"; -// print_binary16(16, getHammings(currentQuery, currentTargetKmer)); cout << endl; -// } - - if (unlikely(BufferSize < diffIdxBufferIdx + 7)){ - loadBuffer(diffIdxFp, diffIdxBuffer, diffIdxBufferIdx, - BufferSize, ((int)(BufferSize - diffIdxBufferIdx)) * -1 ); - } - - currentTargetKmer = getNextTargetKmer(currentTargetKmer, diffIdxBuffer, - diffIdxBufferIdx, diffIdxPos); - kmerInfoBufferIdx ++; - } - - // Compare the current query and the loaded target k-mers and select - compareDna(currentQuery, candidateTargetKmers, selectedMatches, selectedHammingSum, - selectedHammings, queryKmerList[j].info.frame); - - // If local buffer is full, copy them to the shared buffer. 
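-                // Matches are accumulated in the thread-local 'matches' array and
-                // flushed to the shared buffer in bulk (a single memcpy inside
-                // moveMatches), so threads contend on the shared cursor once per
-                // flush instead of once per match.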
-                currMatchNum = selectedMatches.size();
-                if (matchCnt + currMatchNum > localBufferSize) {
-                    // Check if the shared buffer is full.
-                    posToWrite = matchBuffer.reserveMemory(matchCnt);
-                    if (posToWrite + matchCnt >= matchBuffer.bufferSize) { // full -> write matches to file first
-                        hasOverflow = true;
-                        querySplits[i].start = lastMovedQueryIdx + 1;
-                        __sync_fetch_and_sub(&matchBuffer.startIndexOfReserve, matchCnt);
-                        break;
-                    } else { // not full -> copy matches to the shared buffer
-                        moveMatches(matchBuffer.buffer + posToWrite, matches, matchCnt);
-                        lastMovedQueryIdx = j;
-                    }
-                }
-
-                for (int k = 0; k < currMatchNum; k++) {
-                    idx = selectedMatches[k];
-                    matches[matchCnt] = {queryKmerList[j].info,
-                                         candidateKmerInfos[idx].sequenceID,
-                                         taxId2genusId[candidateKmerInfos[idx].sequenceID],
-                                         taxId2speciesId[candidateKmerInfos[idx].sequenceID],
-                                         selectedHammings[k],
-                                         selectedHammingSum[k],
-                                         (bool) candidateKmerInfos[idx].redundancy};
-                    matchCnt++;
-                }
-            } // End of one split
-
-            // Move matches in the local buffer to the shared buffer
-            posToWrite = matchBuffer.reserveMemory(matchCnt);
-            if (posToWrite + matchCnt >= matchBuffer.bufferSize) {
-                hasOverflow = true;
-                querySplits[i].start = lastMovedQueryIdx + 1;
-                __sync_fetch_and_sub(&matchBuffer.startIndexOfReserve, matchCnt);
-            } else {
-                moveMatches(matchBuffer.buffer + posToWrite, matches, matchCnt);
-            }
-
-            // Check whether the current split is completed or not
-            if (querySplits[i].start - 1 == querySplits[i].end) {
-                splitCheckList[i] = true;
-                __sync_fetch_and_add(&completedSplitCnt, 1);
-            }
-        } // End of omp for (Iterating for splits)
-        delete[] matches;
-        fclose(diffIdxFp);
-        fclose(kmerInfoFp);
-        free(diffIdxBuffer);
-        free(kmerInfoBuffer);
-    } // End of omp parallel
-    if (hasOverflow) {
-        cout << "overflow!!!" << endl;
-        break;
-    }
-    } // end of while (completedSplitCnt < threadNum)
-    cout << "Time spent for the comparison: " << double(time(nullptr) - beforeSearch) << endl;
-    munmap(diffIdxSplits.data, diffIdxSplits.fileSize + 1);
-    free(splitCheckList);
-    queryKmerCnt = 0;
-}
-
-void Classifier::moveMatches(Match *dest, Match *src, int &matchNum) {
-    memcpy(dest, src, sizeof(Match) * matchNum);
-    matchNum = 0;
-}
-
-// It compares query k-mers to target k-mers.
-// Among the matches of a query, those within 'hammingMargin' of the smallest Hamming distance are selected.
-void Classifier::compareDna(uint64_t query, vector<uint64_t> &targetKmersToCompare,
-                            vector<size_t> &selectedMatches, vector<uint8_t> &selectedHammingSum,
-                            vector<uint16_t> &selectedHammings, uint8_t frame) {
-
-    size_t size = targetKmersToCompare.size();
-    auto *hammingSums = new uint8_t[size + 1];
-    uint8_t currentHammingSum;
-    uint8_t minHammingSum = UINT8_MAX;
-
-    // Calculate the Hamming distance of each candidate
-    for (size_t i = 0; i < size; i++) {
-        currentHammingSum = getHammingDistanceSum(query, targetKmersToCompare[i]);
-        if (currentHammingSum < minHammingSum) {
-            minHammingSum = currentHammingSum;
-        }
-        hammingSums[i] = currentHammingSum;
-    }
-
-    // Select target k-mers that pass the Hamming criterion
-    for (size_t h = 0; h < size; h++) {
-        if (hammingSums[h] <= minHammingSum + hammingMargin) {
-            selectedMatches.push_back(h);
-            selectedHammingSum.push_back(hammingSums[h]);
-            if (frame < 3) {
-                selectedHammings.push_back(getHammings(query, targetKmersToCompare[h]));
-            } else {
-                selectedHammings.push_back(getHammings_reverse(query, targetKmersToCompare[h]));
-            }
-        }
-    }
-    delete[] hammingSums;
-}
-
-// It analyses the result of the linear search.
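-// The match list is sorted by query sequence ID (see sortMatch in Classifier.h),
-// so one linear pass can cut it into per-query blocks that are classified
-// independently. A minimal sketch of the grouping idea, with plain ints standing
-// in for the real types:
-//
-//     std::vector<std::pair<size_t, size_t>> blocks;  // [start, end) per query
-//     for (size_t s = 0, e = 0; s < n; s = e) {
-//         while (e < n && id[e] == id[s]) ++e;        // extend current block
-//         blocks.emplace_back(s, e);
-//     }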
-void Classifier::fromMatchToClassification(const Match *matchList,
-                                           size_t numOfMatches,
-                                           vector<Query> & queryList,
-                                           const LocalParameters &par) {
-
-    // Divide matches into blocks for multithreading
-    size_t seqNum = queryList.size();
-    MatchBlock *matchBlocks = new MatchBlock[seqNum];
-    size_t matchIdx = 0;
-    size_t blockIdx = 0;
-    uint32_t currentQuery;
-    while (matchIdx < numOfMatches) {
-        currentQuery = matchList[matchIdx].qInfo.sequenceID;
-        matchBlocks[blockIdx].id = currentQuery;
-        matchBlocks[blockIdx].start = matchIdx;
-        while ((matchIdx < numOfMatches) && (currentQuery == matchList[matchIdx].qInfo.sequenceID)) ++matchIdx;
-        matchBlocks[blockIdx].end = matchIdx - 1;
-        blockIdx++;
-    }
-
-    // Process each block
-#pragma omp parallel default(none), shared(cout, matchBlocks, matchList, seqNum, queryList, blockIdx, par)
-    {
-#pragma omp for schedule(dynamic, 1)
-        for (size_t i = 0; i < blockIdx; ++i) {
-            chooseBestTaxon(matchBlocks[i].id,
-                            matchBlocks[i].start,
-                            matchBlocks[i].end,
-                            matchList,
-                            queryList,
-                            par);
-        }
-    }
-
-    for (size_t i = 0; i < seqNum; i++) {
-        ++taxCounts[queryList[i].classification];
-    }
-    delete[] matchBlocks;
-}
-
-
-void Classifier::chooseBestTaxon(uint32_t currentQuery,
-                                 size_t offset,
-                                 size_t end,
-                                 const Match *matchList,
-                                 vector<Query> & queryList,
-                                 const LocalParameters &par) {
-    TaxID selectedTaxon;
-//    if (par.printLog) {
-//        cout << "# " << currentQuery << " " << queryList[currentQuery].name << endl;
-//        for (size_t i = offset; i < end + 1; i++) {
-//            cout << taxId2genusId[matchList[i].targetId] << " " << taxId2speciesId[matchList[i].targetId] <<
-//            " " << matchList[i].targetId << " " << matchList[i].qInfo.frame << " ";
-//            print_binary16(16, matchList[i].rightEndHamming);
-//            cout << " " << matchList[i].qInfo.pos << " " << int(matchList[i].hamming) << " " << int(matchList[i].redundancy) << endl;
-//        }
-//    }
-
-    // Get the best genus for the current query
-    vector<Match> genusMatches;
-    genusMatches.reserve(end - offset + 1);
-
-    TaxonScore genusScore(0, 0, 0, 0);
-    if (par.seqMode == 2) {
-        if (par.spaceMask != "11111111"){
-            genusScore = getBestGenusMatches_spaced(genusMatches, matchList, end, offset,
-                                                    queryList[currentQuery].queryLength,
-                                                    queryList[currentQuery].queryLength2);
-        } else {
-            genusScore = getBestGenusMatches(genusMatches, matchList, end, offset,
-                                             queryList[currentQuery].queryLength,
-                                             queryList[currentQuery].queryLength2, par);
-        }
-    } else {
-        if (par.spaceMask != "11111111") {
-            genusScore = getBestGenusMatches_spaced(genusMatches, matchList, end, offset,
-                                                    queryList[currentQuery].queryLength);
-        } else {
-            genusScore = getBestGenusMatches(genusMatches, matchList, end, offset,
-                                             queryList[currentQuery].queryLength, par);
-        }
-    }
-
-//    if (par.printLog) {
-//        cout << "# " << currentQuery << " " << queryList[currentQuery].name << " filtered\n";
-//        for (size_t i = 0; i < genusMatches.size(); i++) {
-//            cout << taxId2genusId[genusMatches[i].targetId] << " " << taxId2speciesId[genusMatches[i].targetId] <<
-//            " " << genusMatches[i].targetId << " " << genusMatches[i].qInfo.frame << " ";
-//            print_binary16(16, genusMatches[i].rightEndHamming);
-//            cout << " " << genusMatches[i].qInfo.pos << " " << int(genusMatches[i].hamming) << " " << int(genusMatches[i].redundancy) << endl;
-//        }
-//        cout << "Genus score: " << genusScore.score << "\n";
-//    }
-
-    // If there is no proper genus for the current query, it is unclassified.
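-    // The score tested below combines coverage and Hamming penalties: each
-    // covered amino-acid position adds 0 to hammingSum when it matches exactly
-    // and 1 + 0.5 * h for a Hamming distance h of 1..3 (see scoreGenus), giving
-    //
-    //     score = (coveredLength - hammingSum) / queryLength
-    //
-    // As an illustration with made-up numbers: a 150-bp read with 120 covered
-    // bases and hammingSum 4.5 scores (120 - 4.5) / 150 = 0.77.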
- if (genusScore.score == 0 || genusScore.coverage < par.minCoverage || genusScore.score < par.minScore) { - queryList[currentQuery].isClassified = false; - queryList[currentQuery].classification = 0; - queryList[currentQuery].score = genusScore.score; - queryList[currentQuery].coverage = genusScore.coverage; - queryList[currentQuery].hammingDist = genusScore.hammingDist; - queryList[currentQuery].newSpecies = false; - return; - } - - // If there are two or more good genus level candidates, find the LCA. - if (genusScore.taxId == 0) { - vector genusList; - genusList.reserve(genusMatches.size()); - for (auto & genusMatch : genusMatches) { - genusList.push_back(genusMatch.genusId); - } - selectedTaxon = taxonomy->LCA(genusList)->taxId; - queryList[currentQuery].isClassified = true; - queryList[currentQuery].classification = selectedTaxon; - queryList[currentQuery].score = genusScore.score; - queryList[currentQuery].coverage = genusScore.coverage; - queryList[currentQuery].hammingDist = genusScore.hammingDist; - for (auto & genusMatch : genusMatches) { - queryList[currentQuery].taxCnt[genusMatch.targetId]++; - } - return; - } - - // Choose the species with the highest coverage. - TaxID selectedSpecies; - TaxonScore speciesScore; - vector species; - unordered_map> speciesMatchRange; - if (par.seqMode == 2) { - speciesScore = chooseSpecies(genusMatches, - queryList[currentQuery].queryLength, - queryList[currentQuery].queryLength2, - species, - speciesMatchRange); - } else { - speciesScore = chooseSpecies(genusMatches, - queryList[currentQuery].queryLength, - species, - speciesMatchRange); - } - - - // Classify to LCA if more than one species are selected - if (species.size() > 1) { - queryList[currentQuery].isClassified = true; - queryList[currentQuery].classification = taxonomy->LCA(species)->taxId; - queryList[currentQuery].score = genusScore.score; - queryList[currentQuery].coverage = genusScore.coverage; - queryList[currentQuery].hammingDist = genusScore.hammingDist; - for (auto & genusMatch : genusMatches) { - queryList[currentQuery].taxCnt[genusMatch.targetId]++; - } - return; - } - - // If score is not enough, classify to the parent of the selected species - if (speciesScore.score < minSpScore) { - queryList[currentQuery].isClassified = true; - queryList[currentQuery].classification = taxonomy->taxonNode( - taxonomy->getTaxIdAtRank(species[0], "species"))->parentTaxId; - queryList[currentQuery].score = genusScore.score; - queryList[currentQuery].coverage = genusScore.coverage; - queryList[currentQuery].hammingDist = genusScore.hammingDist; - for (auto & genusMatch : genusMatches) { - if(genusMatch.speciesId == species[0]){ - queryList[currentQuery].taxCnt[genusMatch.targetId]++; - } - } - return; - } - - // Sort matches by the position of the query sequence - selectedSpecies = species[0]; -// sort(genusMatches.begin() + speciesMatchRange[selectedSpecies].first, -// genusMatches.begin() + speciesMatchRange[selectedSpecies].second, -// [](const Match & a, const Match & b) { -// if (a.qInfo.position / 3 == b.qInfo.position / 3) -// return a.hamming < b.hamming; -// else -// return a.qInfo.position / 3 < b.qInfo.position / 3; -// }); - - sort(genusMatches.begin() + speciesMatchRange[selectedSpecies].first, - genusMatches.begin() + speciesMatchRange[selectedSpecies].second, - [](const Match & a, const Match & b) { return a.qInfo.pos > b.qInfo.pos; }); - - - TaxID result = lowerRankClassification(genusMatches, speciesMatchRange[selectedSpecies], selectedSpecies); - - // Record matches of 
selected species
-    for (size_t i = speciesMatchRange[selectedSpecies].first; i < speciesMatchRange[selectedSpecies].second; i++) {
-        queryList[currentQuery].taxCnt[genusMatches[i].targetId]++;
-    }
-
-
-    // Store classification results
-    queryList[currentQuery].isClassified = true;
-    queryList[currentQuery].classification = result;
-    queryList[currentQuery].score = speciesScore.score;
-    queryList[currentQuery].coverage = speciesScore.coverage;
-    queryList[currentQuery].hammingDist = speciesScore.hammingDist;
-    queryList[currentQuery].newSpecies = false;
-//    if (par.printLog) {
-//        cout << "# " << currentQuery << endl;
-//        for (size_t i = 0; i < genusMatches.size(); i++) {
-//            cout << i << " " << genusMatches[i].qInfo.pos << " " <<
-//            genusMatches[i].targetId << " " << int(genusMatches[i].hamming) << endl;
-//        }
-//        cout << "Score: " << speciesScore.score << " " << selectedSpecies << " "
-//             << taxonomy->getString(taxonomy->taxonNode(selectedSpecies)->rankIdx)
-//
-//             << endl;
-//    }
-}
-
-TaxID Classifier::lowerRankClassification(vector<Match> &matches, pair<int, int> &matchRange, TaxID spTaxId) {
-    int i = matchRange.second - 1;
-    unordered_map<TaxID, unsigned int> taxCnt;
-
-    while ( i >= matchRange.first ) {
-        size_t currQuotient = matches[i].qInfo.pos / 3;
-        uint8_t minHamming = matches[i].hamming;
-        Match * minHammingMatch = & matches[i];
-        TaxID minHammingTaxId = minHammingMatch->targetId;
-        i--;
-        while ( (i >= matchRange.first) && (currQuotient == matches[i].qInfo.pos / 3) ) {
-            if (matches[i].hamming < minHamming) {
-                minHamming = matches[i].hamming;
-                minHammingMatch = & matches[i];
-                minHammingTaxId = minHammingMatch->targetId;
-            } else if (matches[i].hamming == minHamming) {
-                minHammingTaxId = taxonomy->LCA(minHammingTaxId, matches[i].targetId);
-                minHammingMatch->redundancy = true;
-                matches[i].redundancy = true;
-            }
-            i--;
-        }
-        taxCnt[minHammingTaxId]++;
-    }
-
-    unordered_map<TaxID, TaxonCounts> cladeCnt;
-    getSpeciesCladeCounts(taxCnt, cladeCnt, spTaxId);
-
-    return BFS(cladeCnt, spTaxId);
-}
-
-void Classifier::getSpeciesCladeCounts(const unordered_map<TaxID, unsigned int> &taxCnt,
-                                       unordered_map<TaxID, TaxonCounts> & cladeCount,
-                                       TaxID speciesTaxID) {
-    for (auto it = taxCnt.begin(); it != taxCnt.end(); ++it) {
-//        cladeCount[it->first].taxCount = it->second;
-//        cladeCount[it->first].cladeCount += it->second;
-        TaxonNode const * taxon = taxonomy->taxonNode(it->first);
-        cladeCount[taxon->taxId].taxCount = it->second;
-        cladeCount[taxon->taxId].cladeCount += it->second;
-        while (taxon->taxId != speciesTaxID) {
-            if (find(cladeCount[taxon->parentTaxId].children.begin(),
-                     cladeCount[taxon->parentTaxId].children.end(),
-                     taxon->taxId) == cladeCount[taxon->parentTaxId].children.end()) {
-                cladeCount[taxon->parentTaxId].children.push_back(taxon->taxId);
-            }
-            cladeCount[taxon->parentTaxId].cladeCount += it->second;
-            taxon = taxonomy->taxonNode(taxon->parentTaxId);
-        }
-    }
-}
-
-TaxID Classifier::BFS(const unordered_map<TaxID, TaxonCounts> & cladeCnt, TaxID root) {
-    if (cladeCnt.at(root).children.empty()) { // root is a leaf
-        return root;
-    }
-    unsigned int maxCnt = 3;
-    unsigned int currentCnt;
-    vector<TaxID> bestChildren;
-    for (auto it = cladeCnt.at(root).children.begin(); it != cladeCnt.at(root).children.end(); it++) {
-        currentCnt = cladeCnt.at(*it).cladeCount;
-        if (currentCnt > maxCnt) {
-            bestChildren.clear();
-            bestChildren.push_back(*it);
-            maxCnt = currentCnt;
-        } else if (currentCnt == maxCnt) {
-            bestChildren.push_back(*it);
-        }
-    }
-    if (bestChildren.size() == 1) {
-        return BFS(cladeCnt, bestChildren[0]);
-    } else {
-        return root;
-    }
-}
-
-TaxonScore 
Classifier::getBestGenusMatches(vector &genusMatches, const Match *matchList, size_t end, - size_t offset, int readLength1, int readLength2, const LocalParameters & par) { - TaxID currentGenus; - TaxID currentSpecies; - - vector filteredMatches; - vector> matchesForEachGenus; - vector genusScores; - TaxonScore bestScore; - size_t i = offset; - uint8_t curFrame; - vector curFrameMatches; - while (i < end + 1) { -// currentGenus = taxId2genusId[matchList[i].targetId]; - currentGenus = matchList[i].genusId; - // For current genus - while ((i < end + 1) && currentGenus == matchList[i].genusId) { -// currentSpecies = taxId2speciesId[matchList[i].targetId]; - currentSpecies = matchList[i].speciesId; -// if (par.printLog) { -// cout << currentGenus << " " << currentSpecies << endl; -// } - // For current species - while ((i < end + 1) && currentSpecies == matchList[i].speciesId) { - curFrame = matchList[i].qInfo.frame; - curFrameMatches.clear(); - - // For current frame - while ((i < end + 1) && currentSpecies == matchList[i].speciesId - && curFrame == matchList[i].qInfo.frame) { - curFrameMatches.push_back(&matchList[i]); - i ++; - } - if (curFrameMatches.size() > 1) { - remainConsecutiveMatches(curFrameMatches, filteredMatches, currentGenus, par); - } - } - } - - // Construct a match combination using filtered matches of current genus - // so that it can best cover the query, and score the combination - if (!filteredMatches.empty()) { - matchesForEachGenus.push_back(filteredMatches); - genusScores.push_back(scoreGenus(filteredMatches, readLength1, readLength2)); - } - filteredMatches.clear(); - } - - // If there are no meaningful genus - if (genusScores.empty()) { - bestScore.score = 0; - return bestScore; - } - - TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), - [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); - - vector maxIdx; - for (size_t g = 0; g < genusScores.size(); g++) { - if (genusScores[g].score > maxScore.score * 0.95f) { - maxIdx.push_back(g); - } - } - bestScore = maxScore; - - for (unsigned long g : maxIdx) { - for (const Match * m : matchesForEachGenus[g]) { - genusMatches.push_back(*m); - } - } - - - - // More than one genus - if (maxIdx.size() > 1) { - bestScore.taxId = 0; - return bestScore; - } - - return bestScore; - - //Three cases - //1. one genus - //2. more than one genus - //4. 
no genus -} - - - -void Classifier::remainConsecutiveMatches(vector & curFrameMatches, - vector & filteredMatches, - TaxID genusId, - const LocalParameters & par) { - size_t i = 0; - size_t end = curFrameMatches.size(); - vector> curPosMatches; // - vector> nextPosMatches; - map> linkedMatches; // - - size_t currPos = curFrameMatches[0]->qInfo.pos; - while ( i < end && curFrameMatches[i]->qInfo.pos == currPos) { - curPosMatches.emplace_back(curFrameMatches[i], i); - i++; - } - while (i < end) { - uint32_t nextPos = curFrameMatches[i]->qInfo.pos; - while (i < end && nextPos == curFrameMatches[i]->qInfo.pos) { - nextPosMatches.emplace_back(curFrameMatches[i], i); - ++ i; - } - // Check if current position and next position are consecutive - if (currPos + 3 == nextPos) { - // Compare curPosMatches and nextPosMatches - for (auto &curPosMatch: curPosMatches) { - for (auto &nextPosMatch: nextPosMatches) { - if (isConsecutive(curPosMatch.first, nextPosMatch.first)) { - linkedMatches[curPosMatch.second].push_back(nextPosMatch.second); - } - } - } - - } - // Update curPosMatches and nextPosMatches - curPosMatches = nextPosMatches; - nextPosMatches.clear(); - currPos = nextPos; - } - // Print linkedMatches -// if (par.printLog) { -// cout << "linkedMatches: " << endl; -// for (const auto &entry: linkedMatches) { -// cout << entry.first << ": "; -// for (auto &idx: entry.second) { -// cout << idx << " "; -// } -// cout << endl; -// } -// } - - // Iterate linkedMatches to get filteredMatches - int MIN_DEPTH = par.minConsCnt - 1; - if (taxonomy->IsAncestor(par.eukaryotaTaxId, genusId)) { - MIN_DEPTH = par.minConsCntEuk - 1; - } - unordered_set used; - vector filteredMatchIdx; - unordered_map idx2depth; - for (const auto& entry : linkedMatches) { - if (!used.count(entry.first)) { - used.insert(entry.first); - vector curMatches; - DFS(entry.first, linkedMatches, filteredMatchIdx, 0, MIN_DEPTH, used, idx2depth); - } - } - -// if (par.printLog) { -// cout << "filteredMatchIdx: "; -// for (auto &idx: filteredMatchIdx) { -// cout << idx << " "; -// } -// cout << endl; -// } - - for (auto &idx: filteredMatchIdx) { - filteredMatches.push_back(curFrameMatches[idx]); - } -} - - -size_t Classifier::DFS(size_t curMatchIdx, const map> & linkedMatches, - vector& filteredMatches, size_t depth, size_t MIN_DEPTH, unordered_set& used, - unordered_map & idx2depth) { - depth++; - size_t maxDepth = 0; - size_t returnDepth = 0; - if (linkedMatches.find(curMatchIdx) == linkedMatches.end()) { //|| linkedMatches.at(curMatchIdx).empty()) { - // reached a leaf node - idx2depth[curMatchIdx] = depth; - if (depth > MIN_DEPTH) { - filteredMatches.push_back(curMatchIdx); - } - return depth; - } else { // not a leaf node - for (auto &nextMatchIdx: linkedMatches.at(curMatchIdx)) { - used.insert(nextMatchIdx); - if (idx2depth.find(nextMatchIdx) != idx2depth.end()) { - returnDepth = idx2depth[nextMatchIdx]; - maxDepth = max(maxDepth, returnDepth); - continue; - } - returnDepth = DFS(nextMatchIdx, linkedMatches, filteredMatches, depth, MIN_DEPTH, used, idx2depth); - maxDepth = max(maxDepth, returnDepth); - } - if (maxDepth > MIN_DEPTH) { - filteredMatches.push_back(curMatchIdx); - idx2depth[curMatchIdx] = maxDepth; - } - } - return maxDepth; -} - -TaxonScore Classifier::getBestGenusMatches_spaced(vector &genusMatches, const Match *matchList, size_t end, - size_t offset, int readLength1, int readLength2) { - TaxID currentGenus; - TaxID currentSpecies; - - vector tempMatchContainer; - vector filteredMatches; - vector> matchesForEachGenus; - 
vector conservedWithinGenus; - vector genusScores; - TaxonScore bestScore; - size_t i = offset; - bool lastIn; - while (i + 1 < end + 1) { - currentGenus = matchList[i].genusId; - // For current genus - while ((i + 1 < end + 1) && currentGenus == matchList[i].genusId) { -// currentSpecies = taxId2speciesId[matchList[i].targetId]; - currentSpecies = matchList[i].speciesId; - // For current species - // Filter un-consecutive matches (probably random matches) - lastIn = false; - int distance = 0; - int diffPosCntOfCurrRange = 1; - int dnaDist = 0; - - // For the same species - while ((i + 1 < end + 1) && currentSpecies == matchList[i + 1].speciesId) { - distance = matchList[i+1].qInfo.pos / 3 - matchList[i].qInfo.pos / 3; - dnaDist = matchList[i+1].qInfo.pos - matchList[i].qInfo.pos; - if (distance == 0) { // At the same position - tempMatchContainer.push_back(matchList + i); - } else if (dnaDist < (8 + spaceNum_int + maxGap) * 3) { // Overlapping - lastIn = true; - tempMatchContainer.push_back(matchList + i); - diffPosCntOfCurrRange ++; - } else { // Not consecutive --> End range - if (lastIn){ - tempMatchContainer.push_back(matchList + i); - if (diffPosCntOfCurrRange >= minCoveredPos) { - filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), - tempMatchContainer.end()); - } - } - lastIn = false; - // Initialize range info - tempMatchContainer.clear(); - diffPosCntOfCurrRange = 1; - } - i++; - } - - // Met next species - if (lastIn) { - tempMatchContainer.push_back(matchList + i); - if (diffPosCntOfCurrRange >= minCoveredPos) { - filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), - tempMatchContainer.end()); - } - } - tempMatchContainer.clear(); - i++; - } - - // Construct a match combination using filtered matches of current genus - // so that it can best cover the query, and score the combination - if (!filteredMatches.empty()) { - genusScores.push_back(scoreGenus(filteredMatches, readLength1, readLength2)); - } - filteredMatches.clear(); - } - - // If there are no meaningful genus - if (genusScores.empty()) { - bestScore.score = 0; - return bestScore; - } - - TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), - [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); - - vector maxIdx; - for (size_t g = 0; g < genusScores.size(); g++) { - if (genusScores[g].score > maxScore.score * 0.95f) { - maxIdx.push_back(g); - } - } - bestScore = maxScore; - - for (unsigned long g : maxIdx) { - for (const Match * m : matchesForEachGenus[g]) { - genusMatches.push_back(*m); - } - } - - // More than one genus - if (maxIdx.size() > 1) { - bestScore.taxId = 0; - return bestScore; - } - return bestScore; - - //Three cases - //1. one genus - //2. more than one genus - //4. 
no genus -} - -TaxonScore Classifier::getBestGenusMatches(vector &genusMatches, const Match *matchList, size_t end, - size_t offset, int queryLength, const LocalParameters & par) { - TaxID currentGenus; - TaxID currentSpecies; - - vector filteredMatches; - vector> matchesForEachGenus; - vector genusScores; - TaxonScore bestScore; - size_t i = offset; - uint8_t curFrame; - vector curFrameMatches; - while (i < end + 1) { - currentGenus = matchList[i].genusId; - // For current genus - while ((i < end + 1) && currentGenus == matchList[i].genusId) { - currentSpecies = matchList[i].speciesId; - - // For current species - while ((i < end + 1) && currentSpecies == matchList[i].speciesId) { - curFrame = matchList[i].qInfo.frame; - curFrameMatches.clear(); - - // For current frame - while ((i < end + 1) && currentSpecies == matchList[i].speciesId - && curFrame == matchList[i].qInfo.frame) { - curFrameMatches.push_back(&matchList[i]); - i ++; - } - if (curFrameMatches.size() > 1) { - remainConsecutiveMatches(curFrameMatches, filteredMatches, currentGenus, par); - } - } - } - - // Construct a match combination using filtered matches of current genus - // so that it can best cover the query, and score the combination - - if (!filteredMatches.empty()) { - matchesForEachGenus.push_back(filteredMatches); - genusScores.push_back(scoreGenus(filteredMatches, queryLength)); - } - filteredMatches.clear(); - } - - // If there are no meaningful genus - if (genusScores.empty()) { - bestScore.score = 0; - return bestScore; - } - - TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), - [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); - - vector maxIdx; - for (size_t g = 0; g < genusScores.size(); g++) { - if (genusScores[g].score > maxScore.score * 0.95f) { - maxIdx.push_back(g); - } - } - - bestScore = maxScore; - - for (unsigned long g : maxIdx) { - for (const Match * m : matchesForEachGenus[g]) { - genusMatches.push_back(*m); - } - } - - // More than one genus - if (maxIdx.size() > 1) { - bestScore.taxId = 0; - return bestScore; - } - return bestScore; - - //Three cases - //1. one genus - //2. more than one genus - //4. 
no genus -} - -TaxonScore Classifier::getBestGenusMatches_spaced(vector &genusMatches, const Match *matchList, size_t end, - size_t offset, int readLength) { - TaxID currentGenus; - TaxID currentSpecies; - - vector tempMatchContainer; - vector filteredMatches; - vector> matchesForEachGenus; - vector conservedWithinGenus; - vector genusScores; - TaxonScore bestScore; - size_t i = offset; - bool lastIn; - size_t speciesMatchCnt; - while (i + 1 < end + 1) { - currentGenus = matchList[i].genusId; - // For current genus - while ((i + 1 < end + 1) && currentGenus == matchList[i].genusId) { - currentSpecies = matchList[i].speciesId; - // For current species - // Filter un-consecutive matches (probably random matches) - lastIn = false; - int distance = 0; - int diffPosCntOfCurrRange = 1; - int dnaDist = 0; - - // For the same species - while ((i + 1 < end + 1) && currentSpecies == matchList[i + 1].speciesId) { - distance = matchList[i + 1].qInfo.pos / 3 - matchList[i].qInfo.pos / 3; - dnaDist = matchList[i + 1].qInfo.pos - matchList[i].qInfo.pos; - if (distance == 0) { // At the same position - tempMatchContainer.push_back(matchList + i); - } else if (dnaDist < (8 + spaceNum_int + maxGap) * 3) { // Overlapping - lastIn = true; - tempMatchContainer.push_back(matchList + i); - diffPosCntOfCurrRange++; - } else { // Not consecutive --> End range - if (lastIn) { - tempMatchContainer.push_back(matchList + i); - if (diffPosCntOfCurrRange >= minCoveredPos) { - filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), - tempMatchContainer.end()); - } - } - lastIn = false; - // Initialize range info - tempMatchContainer.clear(); - diffPosCntOfCurrRange = 1; - } - i++; - } - - // Met next species - if (lastIn) { - tempMatchContainer.push_back(matchList + i); - if (diffPosCntOfCurrRange >= minCoveredPos) { - filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), - tempMatchContainer.end()); - } - } - tempMatchContainer.clear(); - i++; - } - - // Construct a match combination using filtered matches of current genus - // so that it can best cover the query, and score the combination - if (!filteredMatches.empty()) { - genusScores.push_back(scoreGenus(filteredMatches, readLength)); - } - filteredMatches.clear(); - } - - // If there are no meaningful genus - if (genusScores.empty()) { - bestScore.score = 0; - return bestScore; - } - - TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), - [](const TaxonScore &a, const TaxonScore &b) { return a.score < b.score; }); - - vector maxIdx; - for (size_t g = 0; g < genusScores.size(); g++) { - if (genusScores[g].score > maxScore.score * 0.95f) { - maxIdx.push_back(g); - } - } - bestScore = maxScore; - - for (unsigned long g: maxIdx) { - genusMatches.insert(genusMatches.end(), - matchesForEachGenus[g].begin(), - matchesForEachGenus[g].end()); - } - - // More than one genus - if (maxIdx.size() > 1) { - bestScore.taxId = 0; - return bestScore; - } - return bestScore; - - //Three cases - //1. one genus - //2. more than one genus - //4. 
no genus -} - -TaxonScore Classifier::scoreGenus(vector &filteredMatches, - int queryLength) { - // Calculate Hamming distance & covered length - int coveredPosCnt = 0; - uint16_t currHammings; - int aminoAcidNum = (int) queryLength / 3; - int currPos; - size_t matchNum = filteredMatches.size(); - size_t f = 0; - - // Get the largest hamming distance at each position of query - auto *hammingsAtEachPos = new signed char[aminoAcidNum + 1]; - memset(hammingsAtEachPos, -1, (aminoAcidNum + 1)); - while (f < matchNum) { - currPos = filteredMatches[f]->qInfo.pos / 3; - currHammings = filteredMatches[f]->rightEndHamming; - if (GET_2_BITS(currHammings) > hammingsAtEachPos[currPos + unmaskedPos[0]]) - hammingsAtEachPos[currPos + unmaskedPos[0]] = GET_2_BITS(currHammings); - if (GET_2_BITS(currHammings >> 2) > hammingsAtEachPos[currPos + unmaskedPos[1]]) - hammingsAtEachPos[currPos + unmaskedPos[1]] = GET_2_BITS(currHammings >> 2); - if (GET_2_BITS(currHammings >> 4) > hammingsAtEachPos[currPos + unmaskedPos[2]]) - hammingsAtEachPos[currPos + unmaskedPos[2]] = GET_2_BITS(currHammings >> 4); - if (GET_2_BITS(currHammings >> 6) > hammingsAtEachPos[currPos + unmaskedPos[3]]) - hammingsAtEachPos[currPos + unmaskedPos[3]] = GET_2_BITS(currHammings >> 6); - if (GET_2_BITS(currHammings >> 8) > hammingsAtEachPos[currPos + unmaskedPos[4]]) - hammingsAtEachPos[currPos + unmaskedPos[4]] = GET_2_BITS(currHammings >> 8); - if (GET_2_BITS(currHammings >> 10) > hammingsAtEachPos[currPos + unmaskedPos[5]]) - hammingsAtEachPos[currPos + unmaskedPos[5]] = GET_2_BITS(currHammings >> 10); - if (GET_2_BITS(currHammings >> 12) > hammingsAtEachPos[currPos + unmaskedPos[6]]) - hammingsAtEachPos[currPos + unmaskedPos[6]] = GET_2_BITS(currHammings >> 12); - if (GET_2_BITS(currHammings >> 14) > hammingsAtEachPos[currPos + unmaskedPos[7]]) - hammingsAtEachPos[currPos + unmaskedPos[7]] = GET_2_BITS(currHammings >> 14); - f++; - } - - // Sum up hamming distances and count the number of position covered by the matches. - float hammingSum = 0; - for (int h = 0; h < aminoAcidNum; h++) { - if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist. - coveredPosCnt++; - } else if (hammingsAtEachPos[h] != -1) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. 
respectively - hammingSum += 1.0f + (0.5f * hammingsAtEachPos[h]); - coveredPosCnt++; - } - } - delete[] hammingsAtEachPos; - - // Score current genus - int coveredLength = coveredPosCnt * 3; - if (coveredLength > queryLength) coveredLength = queryLength; - float score = ((float) coveredLength - hammingSum) / (float) queryLength; - float coverage = (float) (coveredLength) / (float) (queryLength); - - return {filteredMatches[0]->genusId, score, coverage, (int) hammingSum}; -} - -TaxonScore Classifier::scoreGenus(vector &filteredMatches, - int readLength1, - int readLength2) { - - // Calculate Hamming distance & covered length - uint16_t currHammings; - int aminoAcidNum_total = ((int) readLength1 / 3) + ((int) readLength2 / 3); - int aminoAcidNum_read1 = ((int) readLength1 / 3); - int currPos; - size_t matchNum = filteredMatches.size(); - size_t f = 0; - - // Get the largest hamming distance at each position of query - auto *hammingsAtEachPos = new signed char[aminoAcidNum_total + 3]; - memset(hammingsAtEachPos, -1, (aminoAcidNum_total + 3)); - while (f < matchNum) { - currPos = (int) filteredMatches[f]->qInfo.pos / 3; - currHammings = filteredMatches[f]->rightEndHamming; - if (GET_2_BITS(currHammings) > hammingsAtEachPos[currPos + unmaskedPos[0]]) - hammingsAtEachPos[currPos + unmaskedPos[0]] = GET_2_BITS(currHammings); - if (GET_2_BITS(currHammings >> 2) > hammingsAtEachPos[currPos + unmaskedPos[1]]) - hammingsAtEachPos[currPos + unmaskedPos[1]] = GET_2_BITS(currHammings >> 2); - if (GET_2_BITS(currHammings >> 4) > hammingsAtEachPos[currPos + unmaskedPos[2]]) - hammingsAtEachPos[currPos + unmaskedPos[2]] = GET_2_BITS(currHammings >> 4); - if (GET_2_BITS(currHammings >> 6) > hammingsAtEachPos[currPos + unmaskedPos[3]]) - hammingsAtEachPos[currPos + unmaskedPos[3]] = GET_2_BITS(currHammings >> 6); - if (GET_2_BITS(currHammings >> 8) > hammingsAtEachPos[currPos + unmaskedPos[4]]) - hammingsAtEachPos[currPos + unmaskedPos[4]] = GET_2_BITS(currHammings >> 8); - if (GET_2_BITS(currHammings >> 10) > hammingsAtEachPos[currPos + unmaskedPos[5]]) - hammingsAtEachPos[currPos + unmaskedPos[5]] = GET_2_BITS(currHammings >> 10); - if (GET_2_BITS(currHammings >> 12) > hammingsAtEachPos[currPos + unmaskedPos[6]]) - hammingsAtEachPos[currPos + unmaskedPos[6]] = GET_2_BITS(currHammings >> 12); - if (GET_2_BITS(currHammings >> 14) > hammingsAtEachPos[currPos + unmaskedPos[7]]) - hammingsAtEachPos[currPos + unmaskedPos[7]] = GET_2_BITS(currHammings >> 14); - f++; - } - - // Sum up hamming distances and count the number of position covered by the matches. - float hammingSum = 0; - int coveredPosCnt_read1 = 0; - int coveredPosCnt_read2 = 0; - for (int h = 0; h < aminoAcidNum_total; h++) { - // Read 1 - if (h < aminoAcidNum_read1) { - if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist. - coveredPosCnt_read1++; - } else if (hammingsAtEachPos[h] != -1) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively - hammingSum += 1.0f + (0.5f * (float) hammingsAtEachPos[h]); - coveredPosCnt_read1++; - } - } - // Read 2 - else { - if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist. - coveredPosCnt_read2++; - } else if (hammingsAtEachPos[h] != -1) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. 
respectively - hammingSum += 1.0f + (0.5f * (float) hammingsAtEachPos[h]); - coveredPosCnt_read2++; - } - } - } - delete[] hammingsAtEachPos; - - // Score current genus - int coveredLength_read1 = coveredPosCnt_read1 * 3; - int coveredLength_read2 = coveredPosCnt_read2 * 3; - if (coveredLength_read1 > readLength1) coveredLength_read1 = readLength1; - if (coveredLength_read2 > readLength2) coveredLength_read2 = readLength2; - float score = - ((float) (coveredLength_read1 + coveredLength_read2) - hammingSum) / (float) (readLength1 + readLength2); - float coverage = (float) (coveredLength_read1 + coveredLength_read2) / (float) (readLength1 + readLength2); - -// matchesForEachGenus.push_back(move(filteredMatches)); - return {filteredMatches[0]->genusId, score, coverage, (int) hammingSum}; -} - -TaxonScore Classifier::chooseSpecies(const vector &matches, - int queryLength, - vector &species, - unordered_map> & speciesMatchRange) { - // Score each species - std::unordered_map speciesScores; - size_t i = 0; - TaxID currentSpeices; - size_t numOfMatch = matches.size(); - size_t speciesBegin, speciesEnd; - while (i < numOfMatch) { - currentSpeices = matches[i].speciesId; - speciesBegin = i; - while ((i < numOfMatch) && currentSpeices == matches[i].speciesId) { - i++; - } - speciesEnd = i; - speciesScores[currentSpeices] = scoreSpecies(matches, speciesBegin, speciesEnd, queryLength); - speciesMatchRange[currentSpeices] = {(int) speciesBegin, (int) speciesEnd}; - speciesScores[currentSpeices].taxId = currentSpeices; - } - - // Get the best species - TaxonScore bestScore; - for (auto & sp : speciesScores) { - if (sp.second.score > bestScore.score) { - species.clear(); - species.push_back(sp.first); - bestScore = sp.second; - } else if (sp.second.coverage == bestScore.coverage) { - species.push_back(sp.first); - } - } - return bestScore; -} - -TaxonScore Classifier::chooseSpecies(const vector &matches, - int read1Length, - int read2Length, - vector &species, - unordered_map> & speciesMatchRange) { - // Score each species - std::unordered_map speciesScores; - - - size_t i = 0; - TaxID currentSpeices; - size_t numOfMatch = matches.size(); - size_t speciesBegin, speciesEnd; - while (i < numOfMatch) { - currentSpeices = matches[i].speciesId; - speciesBegin = i; - while ((i < numOfMatch) && currentSpeices == matches[i].speciesId) { - i++; - } - speciesEnd = i; - speciesScores[currentSpeices] = scoreSpecies(matches, speciesBegin, speciesEnd, read1Length, read2Length); - speciesMatchRange[currentSpeices] = {(int) speciesBegin, (int) speciesEnd}; - speciesScores[currentSpeices].taxId = currentSpeices; - } - - // Get the best species - TaxonScore bestScore; - for (auto & sp : speciesScores) { - if (sp.second.score > bestScore.score) { - species.clear(); - species.push_back(sp.first); - bestScore = sp.second; - } else if (sp.second.coverage == bestScore.coverage) { - species.push_back(sp.first); - } - } - return bestScore; -} - -TaxonScore Classifier::scoreSpecies(const vector &matches, - size_t begin, - size_t end, - int queryLength) { - - // Get the largest hamming distance at each position of query - int aminoAcidNum = queryLength / 3; - auto *hammingsAtEachPos = new signed char[aminoAcidNum + 1]; - memset(hammingsAtEachPos, -1, (aminoAcidNum + 1)); - int currPos; - size_t walker = begin; - uint16_t currHammings; - while (walker < end) { - currPos = matches[walker].qInfo.pos / 3; - currHammings = matches[walker].rightEndHamming; - if (GET_2_BITS(currHammings) > hammingsAtEachPos[currPos + unmaskedPos[0]]) - 
hammingsAtEachPos[currPos + unmaskedPos[0]] = GET_2_BITS(currHammings); - if (GET_2_BITS(currHammings >> 2) > hammingsAtEachPos[currPos + unmaskedPos[1]]) - hammingsAtEachPos[currPos + unmaskedPos[1]] = GET_2_BITS(currHammings >> 2); - if (GET_2_BITS(currHammings >> 4) > hammingsAtEachPos[currPos + unmaskedPos[2]]) - hammingsAtEachPos[currPos + unmaskedPos[2]] = GET_2_BITS(currHammings >> 4); - if (GET_2_BITS(currHammings >> 6) > hammingsAtEachPos[currPos + unmaskedPos[3]]) - hammingsAtEachPos[currPos + unmaskedPos[3]] = GET_2_BITS(currHammings >> 6); - if (GET_2_BITS(currHammings >> 8) > hammingsAtEachPos[currPos + unmaskedPos[4]]) - hammingsAtEachPos[currPos + unmaskedPos[4]] = GET_2_BITS(currHammings >> 8); - if (GET_2_BITS(currHammings >> 10) > hammingsAtEachPos[currPos + unmaskedPos[5]]) - hammingsAtEachPos[currPos + unmaskedPos[5]] = GET_2_BITS(currHammings >> 10); - if (GET_2_BITS(currHammings >> 12) > hammingsAtEachPos[currPos + unmaskedPos[6]]) - hammingsAtEachPos[currPos + unmaskedPos[6]] = GET_2_BITS(currHammings >> 12); - if (GET_2_BITS(currHammings >> 14) > hammingsAtEachPos[currPos + unmaskedPos[7]]) - hammingsAtEachPos[currPos + unmaskedPos[7]] = GET_2_BITS(currHammings >> 14); - walker++; - } - - // Sum up hamming distances and count the number of position covered by the matches. - float hammingSum = 0; - int hammingDist = 0; - int coveredPosCnt = 0; - for (int h = 0; h < aminoAcidNum; h++) { - if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist. - coveredPosCnt++; - } else if (hammingsAtEachPos[h] != -1) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively - hammingSum += 1.0f + (0.5f * (float) hammingsAtEachPos[h]); - hammingDist += hammingsAtEachPos[h]; - coveredPosCnt++; - } - } - delete[] hammingsAtEachPos; - // Score - int coveredLength = coveredPosCnt * 3; - if (coveredLength >= queryLength) coveredLength = queryLength; - - float score = ((float)coveredLength - hammingSum) / (float) queryLength; - float coverage = (float) coveredLength / (float) (queryLength); - - return {0, score, coverage, hammingDist}; -} - -TaxonScore Classifier::scoreSpecies(const vector &matches, - size_t begin, - size_t end, - int queryLength, - int queryLength2) { - - // Get the smallest hamming distance at each position of query - int aminoAcidNum_total = queryLength / 3 + queryLength2 / 3; - int aminoAcidNum_read1 = queryLength / 3; - auto *hammingsAtEachPos = new signed char[aminoAcidNum_total + 3]; - memset(hammingsAtEachPos, -1, (aminoAcidNum_total + 3)); - - int currPos; - size_t walker = begin; - uint16_t currHammings; - - while (walker < end) { - currPos = matches[walker].qInfo.pos / 3; - currHammings = matches[walker].rightEndHamming; - if (GET_2_BITS(currHammings) > hammingsAtEachPos[currPos + unmaskedPos[0]]) - hammingsAtEachPos[currPos + unmaskedPos[0]] = GET_2_BITS(currHammings); - if (GET_2_BITS(currHammings >> 2) > hammingsAtEachPos[currPos + unmaskedPos[1]]) - hammingsAtEachPos[currPos + unmaskedPos[1]] = GET_2_BITS(currHammings >> 2); - if (GET_2_BITS(currHammings >> 4) > hammingsAtEachPos[currPos + unmaskedPos[2]]) - hammingsAtEachPos[currPos + unmaskedPos[2]] = GET_2_BITS(currHammings >> 4); - if (GET_2_BITS(currHammings >> 6) > hammingsAtEachPos[currPos + unmaskedPos[3]]) - hammingsAtEachPos[currPos + unmaskedPos[3]] = GET_2_BITS(currHammings >> 6); - if (GET_2_BITS(currHammings >> 8) > hammingsAtEachPos[currPos + unmaskedPos[4]]) - hammingsAtEachPos[currPos + unmaskedPos[4]] = GET_2_BITS(currHammings >> 8); - if (GET_2_BITS(currHammings >> 10) > 
hammingsAtEachPos[currPos + unmaskedPos[5]])
-            hammingsAtEachPos[currPos + unmaskedPos[5]] = GET_2_BITS(currHammings >> 10);
-        if (GET_2_BITS(currHammings >> 12) > hammingsAtEachPos[currPos + unmaskedPos[6]])
-            hammingsAtEachPos[currPos + unmaskedPos[6]] = GET_2_BITS(currHammings >> 12);
-        if (GET_2_BITS(currHammings >> 14) > hammingsAtEachPos[currPos + unmaskedPos[7]])
-            hammingsAtEachPos[currPos + unmaskedPos[7]] = GET_2_BITS(currHammings >> 14);
-        walker++;
-    }
-
-    // Sum up Hamming distances and count the number of positions covered by the matches.
-    float hammingSum = 0;
-    int hammingDist = 0;
-    int coveredPosCnt_read1 = 0;
-    int coveredPosCnt_read2 = 0;
-    for (int h = 0; h < aminoAcidNum_total; h++) {
-        // Read 1
-        if (h < aminoAcidNum_read1) {
-            if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist.
-                coveredPosCnt_read1++;
-            } else if (hammingsAtEachPos[h] != -1) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively
-                hammingSum += 1.0f + (0.5f * (float) hammingsAtEachPos[h]);
-                hammingDist += hammingsAtEachPos[h];
-                coveredPosCnt_read1++;
-            }
-        }
-        // Read 2
-        else {
-            if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist.
-                coveredPosCnt_read2++;
-            } else if (hammingsAtEachPos[h] != -1) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively
-                hammingSum += 1.0f + (0.5f * (float) hammingsAtEachPos[h]);
-                hammingDist += hammingsAtEachPos[h];
-                coveredPosCnt_read2++;
-            }
-        }
-    }
-    delete[] hammingsAtEachPos;
-
-    // Score
-    int coveredLength_read1 = coveredPosCnt_read1 * 3;
-    int coveredLength_read2 = coveredPosCnt_read2 * 3;
-    if (coveredLength_read1 >= queryLength) coveredLength_read1 = queryLength;
-    if (coveredLength_read2 >= queryLength2) coveredLength_read2 = queryLength2;
-
-    float score = ((float) (coveredLength_read1 + coveredLength_read2) - hammingSum) / (float) (queryLength + queryLength2);
-    float coverage = (float) (coveredLength_read1 + coveredLength_read2) / (float) (queryLength + queryLength2);
-
-    return {0, score, coverage, hammingDist};
-}
-
-void Classifier::writeReadClassification(const vector<Query> & queryList, int queryNum, ofstream &readClassificationFile) {
-    for (int i = 0; i < queryNum; i++) {
-        readClassificationFile << queryList[i].isClassified << "\t" << queryList[i].name << "\t"
-                               << queryList[i].classification << "\t"
-                               << queryList[i].queryLength + queryList[i].queryLength2 << "\t"
-                               << queryList[i].score << "\t"
-                               << queryList[i].coverage << "\t"
-                               << queryList[i].hammingDist << "\t"
-                               << taxonomy->getString(taxonomy->taxonNode(queryList[i].classification)->rankIdx) << "\t";
-        for (auto it = queryList[i].taxCnt.begin(); it != queryList[i].taxCnt.end(); ++it) {
-            readClassificationFile << it->first << ":" << it->second << " ";
-        }
-        readClassificationFile << "\n";
-    }
-}
-
-void Classifier::writeReportFile(const string &outdir, int numOfQuery, unordered_map<TaxID, unsigned int> &taxCnt) {
-    unordered_map<TaxID, TaxonCounts> cladeCounts = taxonomy->getCladeCounts(taxCnt);
-    FILE *fp;
-    fp = fopen((outdir + "/" + jobId + "_report.tsv").c_str(), "w");
-    writeReport(fp, cladeCounts, numOfQuery);
-    fclose(fp);
-
-    // Write Krona chart
-    FILE *kronaFile = fopen((outDir + "/" + jobId + "_krona.html").c_str(), "w");
-    fwrite(krona_prelude_html, krona_prelude_html_len, sizeof(char), kronaFile);
-    fprintf(kronaFile, "%d", numOfQuery);
-    kronaReport(kronaFile, *taxonomy, cladeCounts, numOfQuery);
-    fprintf(kronaFile, "");
-    fclose(kronaFile);
-}
-
-void Classifier::writeReport(FILE *FP, const std::unordered_map<TaxID, TaxonCounts> &cladeCounts,
-                             unsigned long totalReads, TaxID taxID, int depth) {
-    
std::unordered_map::const_iterator it = cladeCounts.find(taxID); - unsigned int cladeCount = it == cladeCounts.end() ? 0 : it->second.cladeCount; - unsigned int taxCount = it == cladeCounts.end() ? 0 : it->second.taxCount; - if (taxID == 0) { - if (cladeCount > 0) { - fprintf(FP, "%.4f\t%i\t%i\tno rank\t0\tunclassified\n", - 100 * cladeCount / double(totalReads), - cladeCount, taxCount); - } - writeReport(FP, cladeCounts, totalReads, 1); - } else { - if (cladeCount == 0) { - return; - } - const TaxonNode *taxon = taxonomy->taxonNode(taxID); - fprintf(FP, "%.4f\t%i\t%i\t%s\t%i\t%s%s\n", - 100 * cladeCount / double(totalReads), cladeCount, taxCount, - taxonomy->getString(taxon->rankIdx), taxID, std::string(2 * depth, ' ').c_str(), taxonomy->getString(taxon->nameIdx)); - std::vector children = it->second.children; - SORT_SERIAL(children.begin(), children.end(), [&](int a, int b) { return cladeCountVal(cladeCounts, a) > cladeCountVal(cladeCounts, b); }); - for (size_t i = 0; i < children.size(); ++i) { - TaxID childTaxId = children[i]; - if (cladeCounts.count(childTaxId)) { - writeReport(FP, cladeCounts, totalReads, childTaxId, depth + 1); - } else { - break; - } - } - } -} - -unsigned int Classifier::cladeCountVal(const std::unordered_map &map, TaxID key) { - typename std::unordered_map::const_iterator it = map.find(key); - if (it == map.end()) { - return 0; - } else { - return it->second.cladeCount; - } -} - -void Classifier::splitQueryFile(vector & sequences, const std::string &queryPath) { - KSeqWrapper* kseq = nullptr; - kseq = KSeqFactory(queryPath.c_str()); - while (kseq->ReadEntry()) { - const KSeqWrapper::KSeqEntry & e = kseq->entry; - sequences.emplace_back(e.headerOffset - 1, - e.sequenceOffset + e.sequence.l, - e.sequenceOffset + e.sequence.l - e.headerOffset + 2, - e.sequence.l); - } - delete kseq; -} - -bool Classifier::isConsecutive(const Match * match1, const Match * match2) { - return (match1->rightEndHamming >> 2) == (match2->rightEndHamming & 0x3FFF); -} - -bool Classifier::isConsecutive(const Match & match1, const Match & match2, const LocalParameters & par) { - uint16_t hamming1 = match1.rightEndHamming; - uint16_t hamming2 = match2.rightEndHamming; -// if (par.printLog) { -// print_binary16(16, hamming1); cout << endl; -// print_binary16(16, hamming2); cout << endl; -// } - - // set most significant two bits to 0 - hamming2 &= 0x3FFF; // 07654321 - // move bits to right by 2 - hamming1 >>= 2; // 07654321 -// if (par.printLog) { -// print_binary16(16, hamming1); cout << endl; -// print_binary16(16, hamming2); cout << endl; -// } - - return hamming1 == hamming2; -} - diff --git a/src/commons/Classifier.h b/src/commons/Classifier.h index 06379189..5cf61ce9 100644 --- a/src/commons/Classifier.h +++ b/src/commons/Classifier.h @@ -24,286 +24,37 @@ #include #include "Match.h" #include - +#include "LocalUtil.h" +#include "QueryIndexer.h" +#include "ReducedKmerMatcher.h" +#include "KmerExtractor.h" +#include "Taxonomer.h" +#include "Reporter.h" #define BufferSize 16'777'216 //16 * 1024 * 1024 // 16 M using namespace std; -struct TaxonScore { - TaxID taxId; - float score; - float coverage; - int hammingDist; - TaxonScore(TaxID taxId, float score, float coverage, int hammingDist) : - taxId(taxId), score(score), coverage(coverage), hammingDist(hammingDist) {} - TaxonScore() : taxId(0), score(0.0f), coverage(0.0f), hammingDist(0) {} -}; + class Classifier { protected: // Parameters - int verbosity; - const int maskMode; - const float maskProb; - string queryPath_1; - string 
queryPath_2; string dbDir; - string outDir; - string jobId; - -// size_t localIndexBufferSize; -// size_t localMatchBufferSize; - - // For spaced k-mer - uint32_t * mask; - uint32_t spaceNum; - int spaceNum_int; - int unmaskedPos[9]; - - // For masking reads - ProbabilityMatrix * probMatrix; - BaseMatrix * subMat; - - uint8_t hammingMargin; - float minSpScore; - int minCoveredPos; - int maxGap; - + size_t matchPerKmer; + + // Agents + QueryIndexer * queryIndexer; + KmerExtractor * kmerExtractor; + KmerMatcher * kmerMatcher; + Taxonomer * taxonomer; + Reporter * reporter; NcbiTaxonomy * taxonomy; - unordered_map taxId2speciesId; - unordered_map taxId2genusId; - - - struct MatchBlock { - MatchBlock(size_t start, size_t end, int id) : start(start), end(end), id(id) {} - MatchBlock() : start(0), end(0), id(0) {} - size_t start; - size_t end; - uint32_t id; - }; - - struct QueryKmerSplit { - QueryKmerSplit(size_t start, size_t end, size_t length, const DiffIdxSplit& diffIdxSplit) - : start(start), end(end), length(length), diffIdxSplit(diffIdxSplit) {} - - size_t start; // start idx in query k-mer list - size_t end; // end idx in query k-mer list - size_t length; - DiffIdxSplit diffIdxSplit; // index in target k-mer list from where the search begins. - }; - - - template - struct Buffer { - T *buffer; - size_t startIndexOfReserve; - size_t bufferSize; - - explicit Buffer(size_t sizeOfBuffer=100) { - buffer = (T *) malloc(sizeof(T) * sizeOfBuffer); - bufferSize = sizeOfBuffer; - startIndexOfReserve = 0; - }; - - size_t reserveMemory(size_t numOfKmer) { - size_t offsetToWrite = __sync_fetch_and_add(&startIndexOfReserve, numOfKmer); - return offsetToWrite; - }; - - void reallocateMemory(size_t sizeOfBuffer) { - if (sizeOfBuffer > bufferSize) { - buffer = (T *) realloc(buffer, sizeof(T) * sizeOfBuffer); - bufferSize = sizeOfBuffer; - } - }; - }; - - int numOfSplit; - unordered_map taxCounts; - uint64_t MARKER; - int bitsForCodon; - uint8_t hammingLookup[8][8] = { - {0, 1, 1, 1, 2, 1, 3, 3}, - {1, 0, 1, 1, 2, 2, 3, 2}, - {1, 1, 0, 1, 2, 2, 2, 3}, - {1, 1, 1, 0, 1, 2, 3, 3}, - {2, 2, 2, 1, 0, 1, 4, 4}, - {1, 2, 2, 2, 1, 0, 4, 4}, - {3, 3, 2, 3, 4, 4, 0, 1}, - {3, 2, 3, 3, 4, 4, 1, 0}}; - - // Index reads in query file - static void splitQueryFile(vector & seqSegments, const string & queryPath); - - // Extract query k-mer - void fillQueryKmerBufferParallel(KSeqWrapper* kseq1, - QueryKmerBuffer &kmerBuffer, - vector & queryList, - const pair & currentSplit, - const LocalParameters &par); - - void fillQueryKmerBufferParallel_paired(KSeqWrapper* kseq1, - KSeqWrapper* kseq2, - QueryKmerBuffer &kmerBuffer, - vector &queryList, - const pair ¤tSplit, - const LocalParameters &par); - - static int getMaxCoveredLength(int queryLength); - - template - T getQueryKmerNumber(T queryLength); - - void linearSearchParallel( - QueryKmer *queryKmerList, - size_t &queryKmerCnt, - Buffer &matchBuffer, - const LocalParameters &par); - - void compareDna(uint64_t query, vector &targetKmersToCompare, vector &selectedMatches, - vector &selectedHammingSum, vector &rightEndHammings, uint8_t frame); - - virtual uint8_t getHammingDistanceSum(uint64_t kmer1, uint64_t kmer2); - - virtual uint16_t getHammings(uint64_t kmer1, uint64_t kmer2); - - virtual uint16_t getHammings_reverse(uint64_t kmer1, uint64_t kmer2); - - void moveMatches(Match *dest, Match *src, int& matchNum); - - // Analyzing k-mer matches - void fromMatchToClassification(const Match *matchList, - size_t numOfMatches, - vector & queryList, - const LocalParameters 
&par); - - void chooseBestTaxon(uint32_t currentQuery, - size_t offset, - size_t end, - const Match *matchList, - vector & queryList, - const LocalParameters &par); - - void remainConsecutiveMatches(vector & curFrameMatches, - vector & filteredMatches, - TaxID genusId, - const LocalParameters & par); - - size_t DFS(size_t curMatchIdx, const map>& linkedMatches, - vector& fiteredMatchIdx, size_t depth, size_t MIN_DEPTH, unordered_set& used, - unordered_map & idx2depth); - - static bool isConsecutive(const Match * match1, const Match * match2); - bool isConsecutive(const Match & match1, const Match & match2, const LocalParameters &par); - TaxonScore getBestGenusMatches(vector &matchesForMajorityLCA, const Match *matchList, size_t end, - size_t offset, int queryLength, const LocalParameters &par); - - TaxonScore getBestGenusMatches(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, - int readLength1, int readLength2, const LocalParameters &par); - - TaxonScore getBestGenusMatches_spaced(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, - int readLength1, int readLength2); - TaxonScore getBestGenusMatches_spaced(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, - int readLength1); - - TaxonScore scoreGenus(vector &filteredMatches, - int queryLength); - - TaxonScore scoreGenus(vector &filteredMatches, - int readLength1, - int readLength2); - - void scoreGenus_ExtensionScore(vector &filteredMatches, - vector> &matchesForEachGenus, - vector &scoreOfEachGenus, - int readLength1, int readLength2); - - TaxonScore chooseSpecies(const std::vector &matches, - int queryLength, - vector &species, - unordered_map> & speciesMatchRange); - - TaxonScore chooseSpecies(const std::vector &matches, - int read1Length, - int read2Length, - vector &species, - unordered_map> & speciesMatchRange); - - TaxonScore scoreSpecies(const vector &matches, - size_t begin, - size_t end, - int queryLength); - - TaxonScore scoreSpecies(const vector &matches, - size_t begin, - size_t end, - int queryLength, - int queryLength2); - - TaxID lowerRankClassification(vector &matches, pair &matchRange, TaxID speciesID); - - void getSpeciesCladeCounts(const unordered_map & taxCnt, - unordered_map & cladeCnt, - TaxID spciesID); - - TaxID BFS(const unordered_map & cladeCnt, TaxID root); - - template - static void loadBuffer(FILE * fp, T * buffer, size_t & bufferIdx, size_t size, int cnt){ - fseek(fp, cnt * sizeof(T), SEEK_CUR); - fread(buffer, sizeof(T), size, fp); - bufferIdx = 0; - } - - template - static void loadBuffer(FILE * fp, T * buffer, size_t & bufferIdx, size_t size){ - fread(buffer, sizeof(T), size, fp); - bufferIdx = 0; - } - - // Write report - void writeReadClassification(const vector & queryList, int queryNum, ofstream &readClassificationFile); - - void writeReportFile(const string &reportFileName, int numOfQuery, unordered_map &taxCnt); - - void writeReport(FILE *FP, const std::unordered_map &cladeCounts, - unsigned long totalReads, TaxID taxID = 0, int depth = 0); - - unsigned int cladeCountVal(const std::unordered_map &map, TaxID key); - - size_t AminoAcidPart(size_t kmer) const { - return (kmer) & MARKER; - } - - static size_t getCodonBits(size_t num) { - return num & 0X7U; - } - - void setMarker(uint64_t marker) { - MARKER = marker; - MARKER = ~MARKER; - } - - void setNumOfBitsForCodon(int num) { - bitsForCodon = num; - } public: - void startClassify(const LocalParameters &par); -// static uint64_t getNextTargetKmer(uint64_t 
lookingTarget, const uint16_t *targetDiffIdxList, size_t &diffIdxPos); - - static uint64_t getNextTargetKmer(uint64_t lookingTarget, - const uint16_t * diffIdxBuffer, - size_t & diffBufferIdx, - size_t & totalPos);// size_t bufferSize, FILE * diffIdxFp); - - static TargetKmerInfo getKmerInfo(size_t bufferSize, FILE * kmerInfoFp, TargetKmerInfo * infoBuffer, - size_t & infoBufferIdx); - explicit Classifier(LocalParameters & par); virtual ~Classifier(); @@ -311,59 +62,8 @@ class Classifier { }; -struct sortMatch { - bool operator() (const Match& a, const Match& b) const { - if (a.qInfo.sequenceID != b.qInfo.sequenceID) - return a.qInfo.sequenceID < b.qInfo.sequenceID; - - if (a.genusId != b.genusId) - return a.genusId < b.genusId; - - if (a.speciesId != b.speciesId) - return a.speciesId < b.speciesId; - - if (a.qInfo.frame != b.qInfo.frame) - return a.qInfo.frame < b.qInfo.frame; - - if (a.qInfo.pos != b.qInfo.pos) - return a.qInfo.pos < b.qInfo.pos; - - return a.hamming < b.hamming; - } -}; - -inline uint8_t Classifier::getHammingDistanceSum(uint64_t kmer1, uint64_t kmer2) {//12345678 - uint8_t hammingSum = 0; - hammingSum += hammingLookup[GET_3_BITS(kmer1)][GET_3_BITS(kmer2)]; - hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 3U)][GET_3_BITS(kmer2 >> 3U)]; - hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 6U)][GET_3_BITS(kmer2 >> 6U)]; - hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 9U)][GET_3_BITS(kmer2 >> 9U)]; - hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 12U)][GET_3_BITS(kmer2 >> 12U)]; - hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 15U)][GET_3_BITS(kmer2 >> 15U)]; - hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 18U)][GET_3_BITS(kmer2 >> 18U)]; - hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 21U)][GET_3_BITS(kmer2 >> 21U)]; - return hammingSum; -} -inline uint16_t Classifier::getHammings(uint64_t kmer1, uint64_t kmer2) { //hammings 87654321 - uint16_t hammings = 0; - for (int i = 0; i < 8; i++) { - hammings |= hammingLookup[GET_3_BITS(kmer1)][GET_3_BITS(kmer2)] << 2U * i; - kmer1 >>= bitsForCodon; - kmer2 >>= bitsForCodon; - } - return hammings; -} -inline uint16_t Classifier::getHammings_reverse(uint64_t kmer1, uint64_t kmer2) { //hammings 87654321 - uint16_t hammings = 0; - for (int i = 0; i < 8; i++) { - hammings |= hammingLookup[GET_3_BITS(kmer1)][GET_3_BITS(kmer2)] << 2U * (7-i); - kmer1 >>= bitsForCodon; - kmer2 >>= bitsForCodon; - } - return hammings; -} //inline uint64_t //Classifier::getNextTargetKmer(uint64_t lookingTarget, const uint16_t *targetDiffIdxList, size_t &diffIdxPos) { @@ -384,33 +84,7 @@ inline uint16_t Classifier::getHammings_reverse(uint64_t kmer1, uint64_t kmer2) // return diffIn64bit + lookingTarget; //} -inline uint64_t -Classifier::getNextTargetKmer(uint64_t lookingTarget, const uint16_t * diffIdxBuffer, size_t & diffBufferIdx, size_t & totalPos) { -// size_t bufferSize, FILE * diffIdxFp) { - uint16_t fragment; - uint16_t check = 32768; // 2^15 - uint64_t diffIn64bit = 0; - fragment = diffIdxBuffer[diffBufferIdx++]; - totalPos ++; - while (!(fragment & check)) { // 27 % - diffIn64bit |= fragment; - diffIn64bit <<= 15u; - fragment = diffIdxBuffer[diffBufferIdx++]; - totalPos ++; - } - fragment &= ~check; // not; 8.47 % - diffIn64bit |= fragment; // or : 23.6% - return diffIn64bit + lookingTarget; -} -inline -TargetKmerInfo Classifier::getKmerInfo(size_t bufferSize, FILE * kmerInfoFp, TargetKmerInfo * infoBuffer, - size_t & infoBufferIdx){ - if (unlikely(infoBufferIdx >= bufferSize)) { - loadBuffer(kmerInfoFp, infoBuffer, infoBufferIdx, 
bufferSize, (int) (infoBufferIdx - bufferSize)); - } - return infoBuffer[infoBufferIdx]; -} #endif //ADKMER4_SEARCHER_H diff --git a/src/commons/IndexCreator.h b/src/commons/IndexCreator.h index 44461d9d..087cc3b3 100644 --- a/src/commons/IndexCreator.h +++ b/src/commons/IndexCreator.h @@ -21,18 +21,14 @@ #include "NucleotideMatrix.h" #include "SubstitutionMatrix.h" #include "tantan.h" -//#include "DBReader.h" -//#include "DBWriter.h" -//#include "Debug.h" -//#include "Util.h" -//#include "FileUtil.h" + #ifdef OPENMP #include #endif -#define kmerLength 8 + struct TaxId2Fasta{ TaxID species; diff --git a/src/commons/KmerExtractor.cpp b/src/commons/KmerExtractor.cpp new file mode 100644 index 00000000..f8860ec9 --- /dev/null +++ b/src/commons/KmerExtractor.cpp @@ -0,0 +1,216 @@ +#include "KmerExtractor.h" + +KmerExtractor::KmerExtractor(const LocalParameters &par) { + spaceNum = par.spaceMask.length() - 8; + maskMode = par.maskMode; + maskProb = par.maskProb; +} + +KmerExtractor::~KmerExtractor() { + delete probMatrix; + delete subMat; +} + +void KmerExtractor::extractQueryKmers(QueryKmerBuffer &kmerBuffer, + vector & queryList, + const QuerySplit & currentSplit, + const LocalParameters &par, + KSeqWrapper* kseq1, + KSeqWrapper* kseq2) { + time_t beforeKmerExtraction = time(nullptr); + std::cout << "Extracting query metamers ... " << endl; + if (par.seqMode == 1 || par.seqMode == 3) { // Single-end short-read sequence or long-read sequence + fillQueryKmerBufferParallel(kseq1, + kmerBuffer, + queryList, + currentSplit, + par); + } else if (par.seqMode == 2) { + fillQueryKmerBufferParallel_paired(kseq1, + kseq2, + kmerBuffer, + queryList, + currentSplit, + par); + } + cout << "Time spent for metamer extraction: " << double(time(nullptr) - beforeKmerExtraction) << endl; + + // Sort query k-mer + time_t beforeQueryKmerSort = time(nullptr); + cout << "Sorting query metamer list ..." 
<< endl; + SORT_PARALLEL(kmerBuffer.buffer, kmerBuffer.buffer + kmerBuffer.startIndexOfReserve, compareForLinearSearch); + cout << "Time spent for sorting query metamer list: " << double(time(nullptr) - beforeQueryKmerSort) << endl; +} + +void KmerExtractor::fillQueryKmerBufferParallel(KSeqWrapper *kseq1, + QueryKmerBuffer &kmerBuffer, + vector &queryList, + const QuerySplit ¤tSplit, + const LocalParameters &par) { + size_t queryNum = currentSplit.end - currentSplit.start; + size_t processedQueryNum = 0; + + // Array to store reads of thread number + vector reads1(par.threads); + + while (processedQueryNum < queryNum) { + size_t currentQueryNum = min(queryNum - processedQueryNum, (size_t) par.threads); + size_t count = 0; + while (count < currentQueryNum) { + // Read query + kseq1->ReadEntry(); + const KSeqWrapper::KSeqEntry & e1 = kseq1->entry; + + // Get k-mer count + int kmerCnt = LocalUtil::getQueryKmerNumber((int) e1.sequence.l, spaceNum); + + // Query Info + queryList[processedQueryNum].queryLength = getMaxCoveredLength((int) e1.sequence.l); + queryList[processedQueryNum].name = string(e1.name.s); + queryList[processedQueryNum].kmerCnt = (int) (kmerCnt); + + // Store reads + reads1[count] = string(kseq1->entry.sequence.s); + + processedQueryNum ++; + count ++; + } +#pragma omp parallel default(none), shared(par, kmerBuffer, cout, processedQueryNum, queryList, currentQueryNum, currentSplit, count, reads1) + { + SeqIterator seqIterator(par); + size_t posToWrite; +#pragma omp for schedule(dynamic, 1) + for (size_t i = 0; i < currentQueryNum; i ++) { + size_t queryIdx = processedQueryNum - currentQueryNum + i; + // Get k-mer count + auto kmerCnt = LocalUtil::getQueryKmerNumber(reads1[i].length(), spaceNum); + + // Ignore short read + if (kmerCnt < 1) { continue; } + + // Get masked sequence + char *maskedSeq1 = nullptr; + if (maskMode) { + maskedSeq1 = new char[reads1[i].length() + 1]; + SeqIterator::maskLowComplexityRegions(reads1[i].c_str(),maskedSeq1, *probMatrix, maskProb, subMat); + } else { + maskedSeq1 = const_cast(reads1[i].c_str()); + } + + posToWrite = kmerBuffer.reserveMemory(kmerCnt); + + // Process Read 1 + seqIterator.sixFrameTranslation(maskedSeq1, (int) reads1[i].length()); + seqIterator.fillQueryKmerBuffer(maskedSeq1, (int) reads1[i].length(), kmerBuffer, posToWrite, + (uint32_t) queryIdx); + + if (maskMode) { + delete[] maskedSeq1; + } + } + } + } +} + +void KmerExtractor::fillQueryKmerBufferParallel_paired(KSeqWrapper *kseq1, + KSeqWrapper *kseq2, + QueryKmerBuffer &kmerBuffer, + vector &queryList, + const QuerySplit ¤tSplit, + const LocalParameters &par) { + size_t queryNum = currentSplit.end - currentSplit.start; + size_t processedQueryNum = 0; + + // Array to store reads of thread number + vector reads1(par.threads); + vector reads2(par.threads); + + while (processedQueryNum < queryNum) { + size_t currentQueryNum = min(queryNum - processedQueryNum, (size_t) par.threads); + size_t count = 0; + + // Fill reads in sequential + while (count < currentQueryNum) { + // Read query + kseq1->ReadEntry(); + kseq2->ReadEntry(); + const KSeqWrapper::KSeqEntry & e1 = kseq1->entry; + const KSeqWrapper::KSeqEntry & e2 = kseq2->entry; + + // Get k-mer count + int kmerCnt = LocalUtil::getQueryKmerNumber((int) e1.sequence.l, spaceNum); + int kmerCnt2 = LocalUtil::getQueryKmerNumber((int) e2.sequence.l, spaceNum); + + // Query Info + queryList[processedQueryNum].queryLength = getMaxCoveredLength((int) e1.sequence.l); + queryList[processedQueryNum].queryLength2 = 
getMaxCoveredLength((int) e2.sequence.l); + queryList[processedQueryNum].name = string(e1.name.s); + queryList[processedQueryNum].kmerCnt = (int) (kmerCnt + kmerCnt2); + + // Store reads + reads1[count] = string(kseq1->entry.sequence.s); + reads2[count] = string(kseq2->entry.sequence.s); + + processedQueryNum ++; + count ++; + } + + // Process reads in parallel +#pragma omp parallel default(none), shared(par, kmerBuffer, cout, processedQueryNum, queryList, currentQueryNum, currentSplit, count, reads1, reads2) + { + SeqIterator seqIterator(par); + SeqIterator seqIterator2(par); + size_t posToWrite; +#pragma omp for schedule(dynamic, 1) + for (size_t i = 0; i < currentQueryNum; i ++) { + size_t queryIdx = processedQueryNum - currentQueryNum + i; + // Get k-mer count + auto kmerCnt = LocalUtil::getQueryKmerNumber(reads1[i].length(), spaceNum); + auto kmerCnt2 = LocalUtil::getQueryKmerNumber(reads2[i].length(), spaceNum); + + // Ignore short read + if (kmerCnt2 < 1 || kmerCnt < 1) { continue; } + + // Get masked sequence + char *maskedSeq1 = nullptr; + char *maskedSeq2 = nullptr; + if (maskMode) { + maskedSeq1 = new char[reads1[i].length() + 1]; + maskedSeq2 = new char[reads2[i].length() + 1]; + SeqIterator::maskLowComplexityRegions(reads1[i].c_str(),maskedSeq1, *probMatrix, maskProb, subMat); + SeqIterator::maskLowComplexityRegions(reads2[i].c_str(),maskedSeq2, *probMatrix, maskProb, subMat); + } else { + maskedSeq1 = const_cast(reads1[i].c_str()); + maskedSeq2 = const_cast(reads2[i].c_str()); + } + + posToWrite = kmerBuffer.reserveMemory(kmerCnt + kmerCnt2); + + // Process Read 1 + seqIterator.sixFrameTranslation(maskedSeq1, (int) reads1[i].length()); + seqIterator.fillQueryKmerBuffer(maskedSeq1, (int) reads1[i].length(), kmerBuffer, posToWrite, + (uint32_t) queryIdx); + + // Process Read 2 + seqIterator2.sixFrameTranslation(maskedSeq2, (int) reads2[i].length()); + seqIterator2.fillQueryKmerBuffer(maskedSeq2, (int) reads2[i].length(), kmerBuffer, posToWrite, + (uint32_t) queryIdx, queryList[queryIdx].queryLength); + + if (maskMode) { + delete[] maskedSeq1; + delete[] maskedSeq2; + } + } + } + } +} + +int KmerExtractor::getMaxCoveredLength(int queryLength) { + if (queryLength % 3 == 2) { + return queryLength - 2; // 2 + } else if (queryLength % 3 == 1) { + return queryLength - 4; // 4 + } else { + return queryLength - 3; // 3 + } +} \ No newline at end of file diff --git a/src/commons/KmerExtractor.h b/src/commons/KmerExtractor.h new file mode 100644 index 00000000..260bc78c --- /dev/null +++ b/src/commons/KmerExtractor.h @@ -0,0 +1,56 @@ +#ifndef METABULI_KMEREXTRACTER_H +#define METABULI_KMEREXTRACTER_H +#include "SeqIterator.h" +#include "QueryIndexer.h" +#include "KseqWrapper.h" + +class KmerExtractor { +private: + // Parameters + int spaceNum; + int maskMode; + float maskProb; + + // For masking reads + ProbabilityMatrix * probMatrix; + BaseMatrix * subMat; + + // Extract query k-mer + void fillQueryKmerBufferParallel(KSeqWrapper* kseq1, + QueryKmerBuffer &kmerBuffer, + vector & queryList, + const QuerySplit & currentSplit, + const LocalParameters &par); + + void fillQueryKmerBufferParallel_paired(KSeqWrapper* kseq1, + KSeqWrapper* kseq2, + QueryKmerBuffer &kmerBuffer, + vector &queryList, + const QuerySplit & currentSplit, + const LocalParameters &par); + + static int getMaxCoveredLength(int queryLength) ; + +public: + explicit KmerExtractor(const LocalParameters & par); + ~KmerExtractor(); + void extractQueryKmers(QueryKmerBuffer &kmerBuffer, + vector & queryList, + const QuerySplit & 
currentSplit, + const LocalParameters &par, + KSeqWrapper* kseq1, + KSeqWrapper* kseq2 = nullptr); + + +}; + +static inline bool compareForLinearSearch(const QueryKmer &a, const QueryKmer &b) { + if (a.ADkmer < b.ADkmer) { + return true; + } else if (a.ADkmer == b.ADkmer) { + return (a.info.sequenceID < b.info.sequenceID); + } + return false; +} + +#endif //METABULI_KMEREXTRACTER_H diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp new file mode 100644 index 00000000..7117e3ce --- /dev/null +++ b/src/commons/KmerMatcher.cpp @@ -0,0 +1,466 @@ +#include "KmerMatcher.h" + +KmerMatcher::KmerMatcher(const LocalParameters & par, + NcbiTaxonomy * taxonomy) { + threads = par.threads; + std::string dbDir = par.filenames[1 + (par.seqMode == 2)]; + targetDiffIdxFileName = dbDir + "/diffIdx"; + targetInfoFileName = dbDir + "/info"; + diffIdxSplitFileName = dbDir + "/split"; + + diffIdxSplits = mmapData(diffIdxSplitFileName.c_str(), 3); + + MARKER = 16777215; + MARKER = ~ MARKER; + hammingMargin = par.hammingMargin; + totalMatchCnt = 0; + + // Load the taxonomy ID list + FILE * taxIdFile; + if((taxIdFile = fopen((dbDir + "/taxID_list").c_str(),"r")) == NULL){ + std::cout<<"Cannot open the taxID list file."<taxonNode(taxId); + if (taxId == taxon->taxId) { + TaxID speciesTaxID = taxonomy->getTaxIdAtRank(taxId, "species"); + TaxID genusTaxID = taxonomy->getTaxIdAtRank(taxId, "genus"); + while (taxon->taxId != speciesTaxID) { + taxId2speciesId[taxon->taxId] = speciesTaxID; + taxId2genusId[taxon->taxId] = genusTaxID; + taxon = taxonomy->taxonNode(taxon->parentTaxId); + } + taxId2speciesId[speciesTaxID] = speciesTaxID; + taxId2genusId[speciesTaxID] = genusTaxID; + } else { + TaxID speciesTaxID = taxonomy->getTaxIdAtRank(taxId, "species"); + TaxID genusTaxID = taxonomy->getTaxIdAtRank(taxId, "genus"); + while (taxon->taxId != speciesTaxID) { + taxId2speciesId[taxon->taxId] = speciesTaxID; + taxId2genusId[taxon->taxId] = genusTaxID; + taxon = taxonomy->taxonNode(taxon->parentTaxId); + } + taxId2speciesId[speciesTaxID] = speciesTaxID; + taxId2genusId[speciesTaxID] = genusTaxID; + taxId2speciesId[taxId] = speciesTaxID; + taxId2genusId[taxId] = genusTaxID; + } + } + fclose(taxIdFile); +} + +KmerMatcher::~KmerMatcher() { + munmap(diffIdxSplits.data, diffIdxSplits.fileSize + 1); +} + +int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer, Buffer * matchBuffer) { + size_t queryKmerNum = queryKmerBuffer->startIndexOfReserve; + QueryKmer *queryKmerList = queryKmerBuffer->buffer; + + size_t numOfDiffIdx = FileUtil::getFileSize(targetDiffIdxFileName) / sizeof(uint16_t); + + std::cout << "Comparing query and reference metamers..." << std::endl; + + // Find the first index of garbage query k-mer (UINT64_MAX) and discard from there + for (size_t checkN = queryKmerNum - 1; checkN > 0; checkN--) { + if (queryKmerList[checkN].ADkmer != UINT64_MAX) { + queryKmerNum = checkN + 1; + break; + } + } + + // Filter out meaningless target splits + size_t numOfDiffIdxSplits = diffIdxSplits.fileSize / sizeof(DiffIdxSplit); + size_t numOfDiffIdxSplits_use = numOfDiffIdxSplits; + for (size_t i = 1; i < numOfDiffIdxSplits; i++) { + if (diffIdxSplits.data[i].ADkmer == 0 || diffIdxSplits.data[i].ADkmer == UINT64_MAX) { + diffIdxSplits.data[i] = {UINT64_MAX, UINT64_MAX, UINT64_MAX}; + numOfDiffIdxSplits_use--; + } + } + + // Divide query k-mer list into blocks for multi threading. 
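Review note: the split construction below pairs each block of the sorted query k-mer list with a starting point in the reference difference index. A minimal, self-contained sketch of that pairing with hypothetical names (the actual code below also special-cases one and two threads and the trailing split):

// --- Review sketch, not part of the patch ---------------------------------
#include <cstddef>
#include <cstdint>
#include <vector>

struct RefSplit   { uint64_t firstKmerAA; size_t diffIdxOffset; };
struct QueryBlock { size_t start, end; RefSplit ref; };

// Cut the sorted query k-mer list into equal-width blocks and give each block
// the last reference split whose first k-mer does not exceed the block's
// first query k-mer (compared on the amino-acid part only).
std::vector<QueryBlock> pairBlocksWithSplits(const std::vector<uint64_t> &queryAA,
                                             const std::vector<RefSplit> &refSplits,
                                             size_t threads) {
    std::vector<QueryBlock> blocks;
    size_t width = queryAA.size() / threads;
    for (size_t t = 0; t < threads; ++t) {
        size_t start = t * width;
        size_t end = (t + 1 == threads) ? queryAA.size() - 1 : start + width - 1;
        size_t j = 0; // last split starting at or before the block's first query k-mer
        while (j + 1 < refSplits.size() && refSplits[j + 1].firstKmerAA <= queryAA[start]) {
            ++j;
        }
        blocks.push_back({start, end, refSplits[j]});
    }
    return blocks;
}
// ---------------------------------------------------------------------------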
+    // Each split has start and end points of query list + proper offset point of target k-mer list
+    std::vector<QueryKmerSplit> querySplits;
+    uint64_t queryAA;
+    std::vector<size_t> targetSplitIdxs;
+
+    if (threads == 1) { //Single thread
+        querySplits.emplace_back(0, queryKmerNum - 1, queryKmerNum, diffIdxSplits.data[0]);
+    } else if (threads == 2) { //Two threads
+        size_t splitWidth = queryKmerNum / 2;
+        querySplits.emplace_back(0, splitWidth - 1, splitWidth, diffIdxSplits.data[0]);
+        for (size_t tSplitCnt = 0; tSplitCnt < numOfDiffIdxSplits_use; tSplitCnt++) {
+            queryAA = AminoAcidPart(queryKmerList[splitWidth].ADkmer);
+            if (queryAA <= AminoAcidPart(diffIdxSplits.data[tSplitCnt].ADkmer)) {
+                tSplitCnt = tSplitCnt - (tSplitCnt != 0);
+                querySplits.emplace_back(splitWidth, queryKmerNum - 1, queryKmerNum - splitWidth,
+                                         diffIdxSplits.data[tSplitCnt]);
+                break;
+            }
+        }
+    } else { //More than two threads
+        // Divide query k-mers into blocks
+        size_t splitWidth = queryKmerNum / (threads - 1);
+        querySplits.emplace_back(0, splitWidth - 1, splitWidth, diffIdxSplits.data[0]);
+        for (size_t i = 1; i < threads; i++) {
+            queryAA = AminoAcidPart(queryKmerList[splitWidth * i].ADkmer);
+            bool needLastTargetBlock = true;
+            for (size_t j = 0; j < numOfDiffIdxSplits_use; j++) {
+                if (queryAA <= AminoAcidPart(diffIdxSplits.data[j].ADkmer)) {
+                    j = j - (j != 0);
+                    if (i != threads - 1) {
+                        querySplits.emplace_back(splitWidth * i, splitWidth * (i + 1) - 1, splitWidth,
+                                                 diffIdxSplits.data[j]);
+                    } else {
+                        querySplits.emplace_back(splitWidth * i, queryKmerNum - 1, queryKmerNum - splitWidth * i,
+                                                 diffIdxSplits.data[j]);
+                    }
+                    targetSplitIdxs.emplace_back(j);
+                    needLastTargetBlock = false;
+                    break;
+                }
+            }
+            if (needLastTargetBlock) {
+                if (i != threads - 1) { // If it is not the last split
+                    querySplits.emplace_back(splitWidth * i, splitWidth * (i + 1) - 1, splitWidth,
+                                             diffIdxSplits.data[numOfDiffIdxSplits_use - 2]);
+                    targetSplitIdxs.emplace_back(numOfDiffIdxSplits_use - 2);
+                } else {
+                    querySplits.emplace_back(splitWidth * i, queryKmerNum - 1, queryKmerNum - splitWidth * i,
+                                             diffIdxSplits.data[numOfDiffIdxSplits_use - 2]);
+                    targetSplitIdxs.emplace_back(numOfDiffIdxSplits_use - 2);
+                }
+            }
+        }
+    }
+
+    bool *splitCheckList = (bool *) malloc(sizeof(bool) * threads);
+    std::fill_n(splitCheckList, threads, false);
+    size_t completedSplitCnt = 0;
+
+    time_t beforeSearch = time(nullptr);
+
+    while (completedSplitCnt < threads) {
+        bool hasOverflow = false;
+#pragma omp parallel default(none), shared(completedSplitCnt, splitCheckList, hasOverflow, \
+querySplits, queryKmerList, matchBuffer, cout, par, targetDiffIdxFileName, numOfDiffIdx, targetInfoFileName, targetSplitIdxs)
+        {
+            // FILE
+            FILE * diffIdxFp = fopen(targetDiffIdxFileName.c_str(), "rb");
+            FILE * kmerInfoFp = fopen(targetInfoFileName.c_str(), "rb");
+
+            // Target K-mer buffer
+            uint16_t * diffIdxBuffer = (uint16_t *) malloc(sizeof(uint16_t) * (BufferSize + 1)); // size = 32 Mb
+            TargetKmerInfo * kmerInfoBuffer = (TargetKmerInfo *) malloc(sizeof(TargetKmerInfo) * (BufferSize+1)); // 64 Mb
+            size_t kmerInfoBufferIdx = 0;
+            size_t diffIdxBufferIdx = 0;
+
+            //query variables
+            uint64_t currentQuery = UINT64_MAX;
+            uint64_t currentQueryAA = UINT64_MAX;
+            QueryKmerInfo currentQueryInfo;
+
+            //target variables
+            size_t diffIdxPos = 0;
+            std::vector<uint64_t> candidateTargetKmers; // candidate target k-mers; a subset is selected later based on Hamming distance
+            std::vector<TargetKmerInfo> candidateKmerInfos;
+            uint64_t currentTargetKmer;
+
+            //Match buffer for each thread
+            int
localBufferSize = 2'000'000; // 32 Mb + auto *matches = new Match[localBufferSize]; // 16 * 2'000'000 = 32 Mb + int matchCnt = 0; + + // For debug +// SeqIterator seqIterator(par); + + //vectors for selected target k-mers + std::vector selectedHammingSum; + std::vector selectedMatches; + std::vector selectedHammings; + size_t posToWrite; + + int currMatchNum; + size_t idx; +#pragma omp for schedule(dynamic, 1) + for (size_t i = 0; i < querySplits.size(); i++) { + if (hasOverflow || splitCheckList[i]) { + continue; + } + + currentTargetKmer = querySplits[i].diffIdxSplit.ADkmer; + diffIdxBufferIdx = querySplits[i].diffIdxSplit.diffIdxOffset; + kmerInfoBufferIdx = querySplits[i].diffIdxSplit.infoIdxOffset + - (querySplits[i].diffIdxSplit.ADkmer != 0); + diffIdxPos = querySplits[i].diffIdxSplit.diffIdxOffset; + + fseek(kmerInfoFp, 4 * (long)(kmerInfoBufferIdx), SEEK_SET); + loadBuffer(kmerInfoFp, kmerInfoBuffer, kmerInfoBufferIdx, BufferSize); + fseek(diffIdxFp, 2 * (long) (diffIdxBufferIdx), SEEK_SET); + loadBuffer(diffIdxFp, diffIdxBuffer, diffIdxBufferIdx, BufferSize); + + if (i == 0) { + currentTargetKmer = getNextTargetKmer(currentTargetKmer, diffIdxBuffer, + diffIdxBufferIdx, diffIdxPos); + } + currentQuery = UINT64_MAX; + currentQueryAA = UINT64_MAX; + + size_t lastMovedQueryIdx = 0; + for (size_t j = querySplits[i].start; j < querySplits[i].end + 1; j++) { + querySplits[i].start++; + + // Reuse the comparison data if queries are exactly identical + if (currentQuery == queryKmerList[j].ADkmer + && (currentQueryInfo.frame/3 == queryKmerList[j].info.frame/3)) { + currMatchNum = selectedMatches.size(); + // If local buffer is full, copy them to the shared buffer. + if (matchCnt + currMatchNum > localBufferSize) { + // Check if the shared buffer is full. + posToWrite = matchBuffer->reserveMemory(matchCnt); + if (posToWrite + matchCnt >= matchBuffer->bufferSize) { + hasOverflow = true; + querySplits[i].start = lastMovedQueryIdx + 1; + __sync_fetch_and_sub(& matchBuffer->startIndexOfReserve, matchCnt); + break; + } else { // not full -> copy matches to the shared buffer + moveMatches(matchBuffer->buffer + posToWrite, matches, matchCnt); + lastMovedQueryIdx = j; + } + } + for (int k = 0; k < currMatchNum; k++) { + idx = selectedMatches[k]; + matches[matchCnt] = {queryKmerList[j].info, + candidateKmerInfos[idx].sequenceID, + taxId2genusId[candidateKmerInfos[idx].sequenceID], + taxId2speciesId[candidateKmerInfos[idx].sequenceID], + selectedHammings[k], + selectedHammingSum[k], + (bool) candidateKmerInfos[idx].redundancy}; + matchCnt++; + } + continue; + } + selectedMatches.clear(); + selectedHammingSum.clear(); + selectedHammings.clear(); + + // Reuse the candidate target k-mers to compare in DNA level if queries are the same at amino acid level but not at DNA level + if (currentQueryAA == AminoAcidPart(queryKmerList[j].ADkmer)) { + compareDna(queryKmerList[j].ADkmer, candidateTargetKmers, selectedMatches, + selectedHammingSum, selectedHammings,queryKmerList[j].info.frame); + currMatchNum = selectedMatches.size(); + + // If local buffer is full, copy them to the shared buffer. + if (matchCnt + currMatchNum > localBufferSize) { + // Check if the shared buffer is full. 
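Review note: matchBuffer->reserveMemory() is used here as an atomic reservation with rollback on overflow. A minimal sketch of that pattern (hypothetical SharedBuffer type; the patch itself rolls back with __sync_fetch_and_sub):

// --- Review sketch, not part of the patch ---------------------------------
#include <atomic>
#include <cstddef>

template <typename T>
struct SharedBuffer {
    T *data = nullptr;
    size_t capacity = 0;
    std::atomic<size_t> reserved{0};

    // Returns the write offset, or `capacity` on overflow after rolling the
    // reservation back (mirrors "posToWrite + matchCnt >= bufferSize" above).
    size_t reserve(size_t n) {
        size_t pos = reserved.fetch_add(n);
        if (pos + n >= capacity) {
            reserved.fetch_sub(n); // roll back so a later retry can succeed
            return capacity;
        }
        return pos;
    }
};
// ---------------------------------------------------------------------------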
+ posToWrite = matchBuffer->reserveMemory(matchCnt); + if (posToWrite + matchCnt >= matchBuffer->bufferSize) { + hasOverflow = true; + querySplits[i].start = lastMovedQueryIdx + 1; + __sync_fetch_and_sub(& matchBuffer->startIndexOfReserve, matchCnt); + break; + } else { // not full -> copy matches to the shared buffer + moveMatches(matchBuffer->buffer + posToWrite, matches, matchCnt); + lastMovedQueryIdx = j; + } + } + for (int k = 0; k < currMatchNum; k++) { + idx = selectedMatches[k]; + matches[matchCnt] = {queryKmerList[j].info, + candidateKmerInfos[idx].sequenceID, + taxId2genusId[candidateKmerInfos[idx].sequenceID], + taxId2speciesId[candidateKmerInfos[idx].sequenceID], + selectedHammings[k], + selectedHammingSum[k], + (bool) candidateKmerInfos[idx].redundancy}; + matchCnt++; + } + currentQuery = queryKmerList[j].ADkmer; + currentQueryAA = AminoAcidPart(currentQuery); + currentQueryInfo = queryKmerList[j].info; + continue; + } + candidateTargetKmers.clear(); + candidateKmerInfos.clear(); + + // Get next query, and start to find + currentQuery = queryKmerList[j].ADkmer; + currentQueryAA = AminoAcidPart(currentQuery); + currentQueryInfo = queryKmerList[j].info; + + // Skip target k-mers that are not matched in amino acid level + while (diffIdxPos != numOfDiffIdx + && (currentQueryAA > AminoAcidPart(currentTargetKmer))) { + if (unlikely(BufferSize < diffIdxBufferIdx + 7)){ + loadBuffer(diffIdxFp, diffIdxBuffer, diffIdxBufferIdx, BufferSize, ((int)(BufferSize - diffIdxBufferIdx)) * -1 ); + } + currentTargetKmer = getNextTargetKmer(currentTargetKmer, diffIdxBuffer, + diffIdxBufferIdx, diffIdxPos); + kmerInfoBufferIdx ++; + } + + if (currentQueryAA != AminoAcidPart(currentTargetKmer)) // Move to next query k-mer if there isn't any match. + continue; + + // Load target k-mers that are matched in amino acid level + while (diffIdxPos != numOfDiffIdx && + currentQueryAA == AminoAcidPart(currentTargetKmer)) { + candidateTargetKmers.push_back(currentTargetKmer); + candidateKmerInfos.push_back(getKmerInfo(BufferSize, kmerInfoFp, kmerInfoBuffer, kmerInfoBufferIdx)); + // Print the target k-mer +// if (par.printLog == 1) { +// cout << queryKmerList[j].info.sequenceID << "\t" << queryKmerList[j].info.pos << "\t" +// << (int) queryKmerList[j].info.frame << endl; +// cout << "Query k-mer: "; +// print_binary64(64, currentQuery); +// cout << "\t"; +// seqIterator.printKmerInDNAsequence(currentQuery); +// cout << endl; +// cout << "Target k-mer: "; +// print_binary64(64, currentTargetKmer); +// cout << "\t"; +// seqIterator.printKmerInDNAsequence(currentTargetKmer); +// cout << "\t" << kmerInfoBuffer[kmerInfoBufferIdx].sequenceID +// << "\t" << taxId2speciesId[kmerInfoBuffer[kmerInfoBufferIdx].sequenceID] << endl; +// cout << (int) getHammingDistanceSum(currentQuery, currentTargetKmer) << "\t"; +// print_binary16(16, getHammings(currentQuery, currentTargetKmer)); cout << endl; +// } + + if (unlikely(BufferSize < diffIdxBufferIdx + 7)){ + loadBuffer(diffIdxFp, diffIdxBuffer, diffIdxBufferIdx, + BufferSize, ((int)(BufferSize - diffIdxBufferIdx)) * -1 ); + } + + currentTargetKmer = getNextTargetKmer(currentTargetKmer, diffIdxBuffer, + diffIdxBufferIdx, diffIdxPos); + kmerInfoBufferIdx ++; + } + + // Compare the current query and the loaded target k-mers and select + compareDna(currentQuery, candidateTargetKmers, selectedMatches, selectedHammingSum, + selectedHammings, queryKmerList[j].info.frame); + + // If local buffer is full, copy them to the shared buffer. 
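Review note: the "BufferSize < diffIdxBufferIdx + 7" guards above implement a rolling refill of the difference-index buffer. A standalone sketch of the same idea (hypothetical names; seven uint16_t fragments are used in the patch as a safe upper bound for one encoded delta):

// --- Review sketch, not part of the patch ---------------------------------
#include <cstddef>
#include <cstdint>
#include <cstdio>

// When fewer than `margin` entries remain unread, seek back by the unread
// remainder and refill the whole buffer from that position.
bool refillIfNeeded(FILE *fp, uint16_t *buf, size_t &idx, size_t bufSize,
                    size_t margin = 7) {
    if (bufSize < idx + margin) {
        long remaining = (long) (bufSize - idx);
        fseek(fp, -remaining * (long) sizeof(uint16_t), SEEK_CUR);
        size_t got = fread(buf, sizeof(uint16_t), bufSize, fp);
        idx = 0;
        return got > 0;
    }
    return true;
}
// ---------------------------------------------------------------------------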
+ currMatchNum = selectedMatches.size(); + if (matchCnt + currMatchNum > localBufferSize) { + // Check if the shared buffer is full. + posToWrite = matchBuffer->reserveMemory(matchCnt); + if (posToWrite + matchCnt >= matchBuffer->bufferSize) { // full -> write matches to file first + hasOverflow = true; + querySplits[i].start = lastMovedQueryIdx + 1; + __sync_fetch_and_sub(&matchBuffer->startIndexOfReserve, matchCnt); + break; + } else { // not full -> copy matches to the shared buffer + moveMatches(matchBuffer->buffer + posToWrite, matches, matchCnt); + lastMovedQueryIdx = j; + } + } + + for (int k = 0; k < currMatchNum; k++) { + idx = selectedMatches[k]; + matches[matchCnt] = {queryKmerList[j].info, + candidateKmerInfos[idx].sequenceID, + taxId2genusId[candidateKmerInfos[idx].sequenceID], + taxId2speciesId[candidateKmerInfos[idx].sequenceID], + selectedHammings[k], + selectedHammingSum[k], + (bool) candidateKmerInfos[idx].redundancy}; + matchCnt++; + } + } // End of one split + + // Move matches in the local buffer to the shared buffer + posToWrite = matchBuffer->reserveMemory(matchCnt); + if (posToWrite + matchCnt >= matchBuffer->bufferSize) { + hasOverflow = true; + querySplits[i].start = lastMovedQueryIdx + 1; + __sync_fetch_and_sub(& matchBuffer->startIndexOfReserve, matchCnt); + } else { + moveMatches(matchBuffer->buffer + posToWrite, matches, matchCnt); + } + + // Check whether current split is completed or not + if (querySplits[i].start - 1 == querySplits[i].end) { + splitCheckList[i] = true; + __sync_fetch_and_add(&completedSplitCnt, 1); + } + } // End of omp for (Iterating for splits) + delete[] matches; + fclose(diffIdxFp); + fclose(kmerInfoFp); + free(diffIdxBuffer); + free(kmerInfoBuffer); + } // End of omp parallel + if (hasOverflow) { + std::cout << "overflow!!!" << std::endl; + return 2; + } + } // end of while(completeSplitCnt < threadNum) + std::cout << "Time spent for the comparison: " << double(time(nullptr) - beforeSearch) << std::endl; + munmap(diffIdxSplits.data, diffIdxSplits.fileSize + 1); + free(splitCheckList); + queryKmerNum = 0; + +#ifdef OPENMP + omp_set_num_threads(par.threads); +#endif + + // Sort matches + time_t beforeSortMatches = time(nullptr); + totalMatchCnt += matchBuffer->startIndexOfReserve; + std::cout << "Sorting matches ..." << std::endl; + SORT_PARALLEL(matchBuffer->buffer, matchBuffer->buffer + matchBuffer->startIndexOfReserve, + sortMatch()); + std::cout << "Time spent for sorting matches: " << double(time(nullptr) - beforeSortMatches) << std::endl; + + return 1; +} + +void KmerMatcher::moveMatches(Match *dest, Match *src, int &matchNum) { + memcpy(dest, src, sizeof(Match) * matchNum); + matchNum = 0; +} + +// It compares query k-mers to target k-mers. 
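Review note: the parallel sort above relies on the multi-key sortMatch comparator defined in KmerMatcher.h further down. The same lexicographic order can be written compactly with std::tie; a standalone illustration with a simplified, hypothetical match record:

// --- Review sketch, not part of the patch ---------------------------------
#include <algorithm>
#include <cstdint>
#include <tuple>
#include <vector>

struct M { uint32_t query, genus, species; uint8_t frame; uint32_t pos, hamming; };

// Query first, then genus, species, frame, position, and Hamming sum,
// i.e. the same cascade of comparisons as sortMatch below.
void sortMatches(std::vector<M> &ms) {
    std::sort(ms.begin(), ms.end(), [](const M &a, const M &b) {
        return std::tie(a.query, a.genus, a.species, a.frame, a.pos, a.hamming)
             < std::tie(b.query, b.genus, b.species, b.frame, b.pos, b.hamming);
    });
}
// ---------------------------------------------------------------------------

std::tie builds tuples of references, so the comparison allocates nothing and compiles down to the same cascaded tests as the hand-written comparator.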
+// If a query has matches, the matches with the smallest hamming distance will be selected +void KmerMatcher::compareDna(uint64_t query, + std::vector &targetKmersToCompare, + std::vector &selectedMatches, + std::vector &selectedHammingSum, + std::vector &selectedHammings, uint8_t frame) { + + size_t size = targetKmersToCompare.size(); + auto *hammingSums = new uint8_t[size + 1]; + uint8_t currentHammingSum; + uint8_t minHammingSum = UINT8_MAX; + + // Calculate hamming distance + for (size_t i = 0; i < size; i++) { + currentHammingSum = getHammingDistanceSum(query, targetKmersToCompare[i]); + if (currentHammingSum < minHammingSum) { + minHammingSum = currentHammingSum; + } + hammingSums[i] = currentHammingSum; + } + + // Select target k-mers that passed hamming criteria + for (size_t h = 0; h < size; h++) { + if (hammingSums[h] <= minHammingSum + hammingMargin) { + selectedMatches.push_back(h); + selectedHammingSum.push_back(hammingSums[h]); + if (frame < 3) { + selectedHammings.push_back(getHammings(query, targetKmersToCompare[h])); + } else { + selectedHammings.push_back(getHammings_reverse(query, targetKmersToCompare[h])); + } + } + } + delete[] hammingSums; +} \ No newline at end of file diff --git a/src/commons/KmerMatcher.h b/src/commons/KmerMatcher.h new file mode 100644 index 00000000..56379b7a --- /dev/null +++ b/src/commons/KmerMatcher.h @@ -0,0 +1,196 @@ +#ifndef METABULI_KMERMATCHER_H +#define METABULI_KMERMATCHER_H +#include "KmerBuffer.h" +#include "Match.h" +#include "common.h" +#include "LocalParameters.h" +#include +#include "FileUtil.h" +#include "Mmap.h" +#include "BitManipulateMacros.h" +#include "NcbiTaxonomy.h" + +#define BufferSize 16'777'216 //16 * 1024 * 1024 // 16 M + +// Input +// 1. Query K-mers +// 2. Reference K-mers + +// Output +// 1. Matched K-mers + + + +class KmerMatcher { +protected: + NcbiTaxonomy * taxonomy; + size_t threads; + std::string targetDiffIdxFileName, targetInfoFileName, diffIdxSplitFileName; + MmapedData diffIdxSplits; + uint64_t MARKER; + int bitsForCodon = 3; + uint8_t hammingMargin; + size_t totalMatchCnt; + uint8_t hammingLookup[8][8] = { + {0, 1, 1, 1, 2, 1, 3, 3}, + {1, 0, 1, 1, 2, 2, 3, 2}, + {1, 1, 0, 1, 2, 2, 2, 3}, + {1, 1, 1, 0, 1, 2, 3, 3}, + {2, 2, 2, 1, 0, 1, 4, 4}, + {1, 2, 2, 2, 1, 0, 4, 4}, + {3, 3, 2, 3, 4, 4, 0, 1}, + {3, 2, 3, 3, 4, 4, 1, 0}}; + unordered_map taxId2speciesId; + unordered_map taxId2genusId; + + + struct QueryKmerSplit { + QueryKmerSplit(size_t start, size_t end, size_t length, const DiffIdxSplit& diffIdxSplit) + : start(start), end(end), length(length), diffIdxSplit(diffIdxSplit) {} + + size_t start; // start idx in query k-mer list + size_t end; // end idx in query k-mer list + size_t length; + DiffIdxSplit diffIdxSplit; // index in target k-mer list from where the search begins. 
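Review note: compareDna() above makes two passes over the candidates, one to find the minimum Hamming sum and one to keep everything within hammingMargin of it. The selection rule in isolation (standalone sketch, hypothetical names):

// --- Review sketch, not part of the patch ---------------------------------
#include <cstddef>
#include <cstdint>
#include <vector>

// Return indices of candidates whose Hamming sum is within `margin` of the
// best one; integer promotion makes `best + margin` safe against overflow.
std::vector<size_t> selectWithinMargin(const std::vector<uint8_t> &hammingSums,
                                       uint8_t margin) {
    uint8_t best = UINT8_MAX;
    for (uint8_t h : hammingSums) {
        if (h < best) best = h;
    }
    std::vector<size_t> selected;
    for (size_t i = 0; i < hammingSums.size(); ++i) {
        if (hammingSums[i] <= best + margin) selected.push_back(i);
    }
    return selected;
}
// ---------------------------------------------------------------------------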
+ }; + + size_t AminoAcidPart(size_t kmer) const { return (kmer) & MARKER; } + + template + static void loadBuffer(FILE * fp, T * buffer, size_t & bufferIdx, size_t size){ + fread(buffer, sizeof(T), size, fp); + bufferIdx = 0; + } + + template + static void loadBuffer(FILE * fp, T * buffer, size_t & bufferIdx, size_t size, int cnt){ + fseek(fp, cnt * sizeof(T), SEEK_CUR); + fread(buffer, sizeof(T), size, fp); + bufferIdx = 0; + } + + static uint64_t getNextTargetKmer(uint64_t lookingTarget, + const uint16_t * diffIdxBuffer, + size_t & diffBufferIdx, + size_t & totalPos); + + + static TargetKmerInfo getKmerInfo(size_t bufferSize, + FILE *kmerInfoFp, + TargetKmerInfo *infoBuffer, + size_t &infoBufferIdx); + + void moveMatches(Match *dest, + Match *src, + int &matchNum); + + void compareDna(uint64_t query, + std::vector &targetKmersToCompare, + std::vector &selectedMatches, + std::vector &selectedHammingSum, + std::vector &rightEndHammings, + uint8_t frame); + + virtual uint8_t getHammingDistanceSum(uint64_t kmer1, uint64_t kmer2); + + virtual uint16_t getHammings(uint64_t kmer1, uint64_t kmer2); + + virtual uint16_t getHammings_reverse(uint64_t kmer1, uint64_t kmer2); + +public: + KmerMatcher(const LocalParameters & par, + NcbiTaxonomy * taxonomy); + + virtual ~KmerMatcher(); + + int matchKmers(QueryKmerBuffer * queryKmerBuffer, Buffer * matchBuffer); + +}; + +inline +uint64_t KmerMatcher::getNextTargetKmer(uint64_t lookingTarget, + const uint16_t *diffIdxBuffer, + size_t &diffBufferIdx, + size_t &totalPos) { + uint16_t fragment; + uint16_t check = 32768; // 2^15 + uint64_t diffIn64bit = 0; + fragment = diffIdxBuffer[diffBufferIdx++]; + totalPos++; + while (!(fragment & check)) { // 27 % + diffIn64bit |= fragment; + diffIn64bit <<= 15u; + fragment = diffIdxBuffer[diffBufferIdx++]; + totalPos++; + } + fragment &= ~check; // not; 8.47 % + diffIn64bit |= fragment; // or : 23.6% + return diffIn64bit + lookingTarget; +} + +inline +TargetKmerInfo KmerMatcher::getKmerInfo(size_t bufferSize, + FILE * kmerInfoFp, + TargetKmerInfo * infoBuffer, + size_t & infoBufferIdx){ + if (unlikely(infoBufferIdx >= bufferSize)) { + loadBuffer(kmerInfoFp, infoBuffer, infoBufferIdx, bufferSize, (int) (infoBufferIdx - bufferSize)); + } + return infoBuffer[infoBufferIdx]; +} + +inline uint8_t KmerMatcher::getHammingDistanceSum(uint64_t kmer1, uint64_t kmer2) {//12345678 + uint8_t hammingSum = 0; + hammingSum += hammingLookup[GET_3_BITS(kmer1)][GET_3_BITS(kmer2)]; + hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 3U)][GET_3_BITS(kmer2 >> 3U)]; + hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 6U)][GET_3_BITS(kmer2 >> 6U)]; + hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 9U)][GET_3_BITS(kmer2 >> 9U)]; + hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 12U)][GET_3_BITS(kmer2 >> 12U)]; + hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 15U)][GET_3_BITS(kmer2 >> 15U)]; + hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 18U)][GET_3_BITS(kmer2 >> 18U)]; + hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 21U)][GET_3_BITS(kmer2 >> 21U)]; + return hammingSum; +} + +inline uint16_t KmerMatcher::getHammings(uint64_t kmer1, uint64_t kmer2) { //hammings 87654321 + uint16_t hammings = 0; + for (int i = 0; i < 8; i++) { + hammings |= hammingLookup[GET_3_BITS(kmer1)][GET_3_BITS(kmer2)] << 2U * i; + kmer1 >>= bitsForCodon; + kmer2 >>= bitsForCodon; + } + return hammings; +} + +inline uint16_t KmerMatcher::getHammings_reverse(uint64_t kmer1, uint64_t kmer2) { //hammings 87654321 + uint16_t hammings = 0; + for (int i = 0; i < 8; i++) { + 
hammings |= hammingLookup[GET_3_BITS(kmer1)][GET_3_BITS(kmer2)] << 2U * (7-i); + kmer1 >>= bitsForCodon; + kmer2 >>= bitsForCodon; + } + return hammings; +} + +struct sortMatch { + bool operator() (const Match& a, const Match& b) const { + if (a.qInfo.sequenceID != b.qInfo.sequenceID) + return a.qInfo.sequenceID < b.qInfo.sequenceID; + + if (a.genusId != b.genusId) + return a.genusId < b.genusId; + + if (a.speciesId != b.speciesId) + return a.speciesId < b.speciesId; + + if (a.qInfo.frame != b.qInfo.frame) + return a.qInfo.frame < b.qInfo.frame; + + if (a.qInfo.pos != b.qInfo.pos) + return a.qInfo.pos < b.qInfo.pos; + + return a.hamming < b.hamming; + } +}; + +#endif //METABULI_KMERMATCHER_H diff --git a/src/commons/LocalUtil.cpp b/src/commons/LocalUtil.cpp new file mode 100644 index 00000000..039c635a --- /dev/null +++ b/src/commons/LocalUtil.cpp @@ -0,0 +1,40 @@ +#include "LocalUtil.h" + + +std::string LocalUtil::getQueryBaseName(const std::string queryPath) { + std::vector splits = Util::split(queryPath, "."); + std::string baseName; + int extentionNum = 1; + if (Util::endsWith(".gz", queryPath)) { + extentionNum = 2; + } + for (size_t i = 0; i < splits.size() - extentionNum; ++i) { + if (i == splits.size() - extentionNum - 1) { + baseName += splits[i]; + } else { + baseName += splits[i] + "."; + } + } + return baseName; +} + +template +T LocalUtil::getQueryKmerNumber(T queryLength, int spaceNum) { + return (getMaxCoveredLength(queryLength) / 3 - kmerLength - spaceNum + 1) * 6; +} + + +void LocalUtil::splitQueryFile(std::vector & sequences, const std::string &queryPath) { + KSeqWrapper* kseq = nullptr; + kseq = KSeqFactory(queryPath.c_str()); + while (kseq->ReadEntry()) { + const KSeqWrapper::KSeqEntry & e = kseq->entry; + sequences.emplace_back(e.headerOffset - 1, + e.sequenceOffset + e.sequence.l, + e.sequenceOffset + e.sequence.l - e.headerOffset + 2, + e.sequence.l); + } + delete kseq; +} + + diff --git a/src/commons/LocalUtil.h b/src/commons/LocalUtil.h new file mode 100644 index 00000000..1d34a45c --- /dev/null +++ b/src/commons/LocalUtil.h @@ -0,0 +1,23 @@ +#ifndef METABULI_LOCALUTIL_H +#define METABULI_LOCALUTIL_H + +#include "Util.h" +#include +#include "common.h" +#include "KSeqWrapper.h" + +class LocalUtil : public Util { +public: + LocalUtil() = default; + + static std::string getQueryBaseName(const std::string queryPath); + + template + static T getQueryKmerNumber(T queryLength, int spaceNum); + + static void splitQueryFile(vector & seqSegments, const string & queryPath); + +}; + + +#endif //METABULI_LOCALUTIL_H diff --git a/src/commons/Mmap.h b/src/commons/Mmap.h index 8b680e55..5d530716 100644 --- a/src/commons/Mmap.h +++ b/src/commons/Mmap.h @@ -1,7 +1,3 @@ -// -// Created by KJB on 26/08/2020. 
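Review note: a worked, standalone check of the counting in LocalUtil::getQueryKmerNumber() above (assumes the default k = 8 and a contiguous mask, i.e. spaceNum = 0):

// --- Review sketch, not part of the patch ---------------------------------
#include <cassert>

// Same truncation rule as KmerExtractor::getMaxCoveredLength().
int maxCoveredLength(int len) {
    if (len % 3 == 2) return len - 2;
    if (len % 3 == 1) return len - 4;
    return len - 3;
}

// Each of the six frames yields L/3 amino acids, hence L/3 - k - spaceNum + 1 k-mers.
int queryKmerNumber(int len, int spaceNum, int k = 8) {
    return (maxCoveredLength(len) / 3 - k - spaceNum + 1) * 6;
}

int main() {
    // A 150 nt read: 147 covered nt -> 49 aa per frame -> 42 k-mers per frame.
    assert(queryKmerNumber(150, 0) == 42 * 6);
    return 0;
}
// ---------------------------------------------------------------------------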
-// - #ifndef ADKMER3_MMAP_H #define ADKMER3_MMAP_H #include diff --git a/src/commons/QueryFilter.cpp b/src/commons/QueryFilter.cpp new file mode 100644 index 00000000..636f65d2 --- /dev/null +++ b/src/commons/QueryFilter.cpp @@ -0,0 +1,52 @@ +#include "QueryFilter.h" + +QueryFilter::QueryFilter(LocalParameters & par) { + if (par.reducedAA == 1) { + classifier = new ReducedClassifier(par); + } else { + classifier = new Classifier(par); + } + queryIndexer = new QueryIndexer(par); + + setInputAndOutputFiles(par); +} + +QueryFilter::~QueryFilter() { + delete queryIndexer; + delete classifier; +} + +void QueryFilter::setInputAndOutputFiles(const LocalParameters & par) { + // Get the base name of in1 + in1 = par.filenames[0]; + string baseName = LocalUtil::getQueryBaseName(in1); + + // Set the output file names + out1 = baseName + "_filtered.fna.gz"; + reportFileName = baseName + "_filter_report.tsv"; + + // For paired-end reads + if (par.seqMode == 2) { + in2 = par.filenames[1]; + out2 = LocalUtil::getQueryBaseName(in2) + "_filtered.fna.gz"; + } +} + +void QueryFilter::filterReads(LocalParameters & par) { + + cout << "Indexing query file ..."; + queryIndexer->indexQueryFile(); + size_t numOfSeq = queryIndexer->getReadNum_1(); + size_t totalReadLength = queryIndexer->getTotalReadLength(); + const vector & queryReadSplit = queryIndexer->getQuerySplits(); + cout << "Done" << endl; + cout << "Total number of sequences: " << numOfSeq << endl; + cout << "Total read length: " << totalReadLength << "nt" << endl; + + QueryKmerBuffer kmerBuffer; + Buffer matchBuffer; + vector queryList; + + +} + diff --git a/src/commons/QueryFilter.h b/src/commons/QueryFilter.h new file mode 100644 index 00000000..33fa7de1 --- /dev/null +++ b/src/commons/QueryFilter.h @@ -0,0 +1,23 @@ +#ifndef METABULI_FILTERER_H +#define METABULI_FILTERER_H + +#include "LocalUtil.h" +#include "QueryIndexer.h" +#include "ReducedKmerMatcher.h" +class QueryFilter { +private: + QueryIndexer * queryIndexer; + KmerMatcher * kmerMatcher; + + std::string in1, in2, out1, out2, reportFileName; // input and output file names + + void setInputAndOutputFiles(const LocalParameters & par); + +public: + void filterReads(LocalParameters & par); + explicit QueryFilter(LocalParameters & par); + ~QueryFilter(); +}; + + +#endif //METABULI_FILTERER_H diff --git a/src/commons/QueryIndexer.cpp b/src/commons/QueryIndexer.cpp new file mode 100644 index 00000000..f5a30ff9 --- /dev/null +++ b/src/commons/QueryIndexer.cpp @@ -0,0 +1,102 @@ +#include "QueryIndexer.h" + +QueryIndexer::QueryIndexer(const LocalParameters & par) { + seqMode = par.seqMode; + if (seqMode == 1 || seqMode == 3) { + queryPath_1 = par.filenames[0]; + queryPath_2 = ""; + } else { + queryPath_1 = par.filenames[0]; + queryPath_2 = par.filenames[1]; + } + + matchPerKmer = par.matchPerKmer; + maxRam = par.ramUsage; + threads = par.threads; + bytesPerKmer = sizeof(QueryKmer) + matchPerKmer * sizeof(Match); + readNum_1 = 0; + readNum_2 = 0; + spaceNum = par.spaceMask.length() - kmerLength; + + setAvailableRam(); +} + +void QueryIndexer::setAvailableRam() { + availableRam = ((size_t) maxRam * (size_t) 1024 * 1024 * 1024) + - ((size_t) 134217728 * (size_t) threads); +} + +void QueryIndexer::indexQueryFile() { + // Read 1 + KSeqWrapper* kseq; + kseq = KSeqFactory(queryPath_1.c_str()); + size_t kmerCnt = 0; + size_t seqCnt = 0; + size_t start = 0; + while (kseq->ReadEntry()) { + readNum_1++; + const KSeqWrapper::KSeqEntry &e = kseq->entry; + totalReadLength += e.sequence.l; + size_t currentKmerCnt = 
LocalUtil::getQueryKmerNumber(e.sequence.l, spaceNum);
+            kmerCnt += currentKmerCnt;
+            seqCnt++;
+            if (bytesPerKmer * kmerCnt + ((size_t) 200 * seqCnt) > availableRam) {
+                querySplits.emplace_back(start, readNum_2, kmerCnt - currentKmerCnt);
+                kmerCnt = currentKmerCnt;
+                start = readNum_2;
+                seqCnt = 1;
+            }
+        }
+        querySplits.emplace_back(start, readNum_2, kmerCnt);
+        delete kseq;
+
+        // Check if the number of reads in the two files is equal
+        if (readNum_1 != readNum_2) {
+            Debug(Debug::ERROR) << "The number of reads in the two files is not equal." << "\n";
+            EXIT(EXIT_FAILURE);
+        }
+    }
+}
+
+size_t QueryIndexer::getReadNum_1() const {
+    return readNum_1;
+}
+
+size_t QueryIndexer::getReadNum_2() const {
+    return readNum_2;
+}
+
+const std::vector<QuerySplit> & QueryIndexer::getQuerySplits() const {
+    return querySplits;
+}
+
+std::size_t QueryIndexer::getTotalReadLength() const {
+    return totalReadLength;
+}
+
+size_t QueryIndexer::getAvailableRam() const {
+    return availableRam;
+}
\ No newline at end of file
diff --git a/src/commons/QueryIndexer.h b/src/commons/QueryIndexer.h
new file mode 100644
index 00000000..93e308eb
--- /dev/null
+++ b/src/commons/QueryIndexer.h
@@ -0,0 +1,67 @@
+#ifndef METABULI_QUERYINDEXOR_H
+#define METABULI_QUERYINDEXOR_H
+
+#include "LocalParameters.h"
+#include "Kmer.h"
+#include "Match.h"
+#include "KSeqWrapper.h"
+#include "LocalUtil.h"
+#include "Debug.h"
+
+struct QuerySplit {
+    size_t start;
+    size_t end;
+    size_t kmerCnt;
+
+    QuerySplit(size_t start, size_t end, size_t kmerCnt) : start(start), end(end), kmerCnt(kmerCnt) {}
+};
+
+// Input
+// 1. A set of reads
+
+// Output
+// 1. size_t numOfSeq;
+// 2. vector<QuerySplit> querySplits;
+
+class QueryIndexer {
+private:
+    // Input
+    std::string queryPath_1;
+    std::string queryPath_2;
+    size_t seqMode;
+    size_t matchPerKmer;
+    size_t maxRam;
+    size_t threads;
+    int spaceNum;
+
+    // Internal
+    size_t availableRam;
+    size_t bytesPerKmer;
+
+    // Output
+    std::size_t readNum_1;
+    std::size_t readNum_2;
+    std::vector<QuerySplit> querySplits;
+    std::size_t totalReadLength;
+
+public:
+    explicit QueryIndexer(const LocalParameters & par);
+    ~QueryIndexer() = default;
+
+    void indexQueryFile();
+
+    // Getters
+    size_t getReadNum_1() const;
+    size_t getReadNum_2() const;
+    const std::vector<QuerySplit> & getQuerySplits() const;
+    std::size_t getTotalReadLength() const;
+    size_t getAvailableRam() const;
+
+
+    // Setters
+    void setAvailableRam();
+
+};
+
+
+#endif //METABULI_QUERYINDEXOR_H
diff --git a/src/commons/ReducedClassifier.cpp b/src/commons/ReducedClassifier.cpp
deleted file mode 100644
index bd5b5f93..00000000
--- a/src/commons/ReducedClassifier.cpp
+++ /dev/null
@@ -1,11 +0,0 @@
-//
-// Created by 김재범 on 2022/06/28.
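Review note: indexQueryFile() above sizes query splits against the RAM budget from setAvailableRam(). The policy in isolation (standalone sketch; the 200-byte constant is the per-read bookkeeping estimate used in the patch, and `end` is exclusive here):

// --- Review sketch, not part of the patch ---------------------------------
#include <cstddef>
#include <vector>

struct SplitSketch { size_t start, end, kmerCnt; };

// Accumulate reads until the projected use (k-mer buffer + per-read overhead)
// would exceed the budget, then close the split before the current read.
std::vector<SplitSketch> splitByRam(const std::vector<size_t> &kmersPerRead,
                                    size_t bytesPerKmer, size_t availableRam) {
    std::vector<SplitSketch> splits;
    size_t start = 0, kmerCnt = 0, seqCnt = 0;
    for (size_t r = 0; r < kmersPerRead.size(); ++r) {
        size_t c = kmersPerRead[r];
        if (seqCnt > 0 &&
            bytesPerKmer * (kmerCnt + c) + 200 * (seqCnt + 1) > availableRam) {
            splits.push_back({start, r, kmerCnt});
            start = r;
            kmerCnt = 0;
            seqCnt = 0;
        }
        kmerCnt += c;
        ++seqCnt;
    }
    splits.push_back({start, kmersPerRead.size(), kmerCnt});
    return splits;
}
// ---------------------------------------------------------------------------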
-// - -#include "ReducedClassifier.h" - -ReducedClassifier::ReducedClassifier(LocalParameters & par) -: Classifier(par){ - setMarker(0Xffffffff); - setNumOfBitsForCodon(4); -} \ No newline at end of file diff --git a/src/commons/ReducedClassifier.h b/src/commons/ReducedKmerMatcher.h similarity index 80% rename from src/commons/ReducedClassifier.h rename to src/commons/ReducedKmerMatcher.h index b3e4ab0c..dd43e646 100644 --- a/src/commons/ReducedClassifier.h +++ b/src/commons/ReducedKmerMatcher.h @@ -1,13 +1,11 @@ -// -// Created by 김재범 on 2022/06/28. -// +#ifndef METABULI_REDUCEDKMERMATCHER_H +#define METABULI_REDUCEDKMERMATCHER_H -#ifndef METABULI_REDUCEDCLASSIFIER_H -#define METABULI_REDUCEDCLASSIFIER_H +#include "KmerMatcher.h" +#include +#include "NcbiTaxonomy.h" -#include "Classifier.h" - -class ReducedClassifier : public Classifier { +class ReducedKmerMatcher : public KmerMatcher { protected: uint8_t hammingLookup[11][11] = { {0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3}, @@ -22,7 +20,6 @@ class ReducedClassifier : public Classifier { {3, 2, 3, 3, 4, 4, 4, 4, 4, 0, 4}, {3, 3, 2, 3, 4, 4, 4, 4, 4, 4, 0}}; - public: uint8_t getHammingDistanceSum(uint64_t kmer1, uint64_t kmer2) override { uint8_t hammingSum = 0; @@ -58,8 +55,14 @@ class ReducedClassifier : public Classifier { return hammings; } - ReducedClassifier(LocalParameters & par); + explicit ReducedKmerMatcher(LocalParameters & par, + NcbiTaxonomy * taxonomy) + : KmerMatcher(par,taxonomy) { + MARKER = 0Xffffffff; + } + + ~ReducedKmerMatcher() override = default; }; -#endif //METABULI_REDUCEDCLASSIFIER_H +#endif //METABULI_REDUCEDKMERMATCHER_H diff --git a/src/commons/Reporter.cpp b/src/commons/Reporter.cpp new file mode 100644 index 00000000..288aecdb --- /dev/null +++ b/src/commons/Reporter.cpp @@ -0,0 +1,94 @@ +#include "Reporter.h" +#include "taxonomyreport.cpp" + +Reporter::Reporter(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxonomy(taxonomy){ + if (par.seqMode == 2) { + outDir = par.filenames[3]; + jobId = par.filenames[4]; + } else { + outDir = par.filenames[2]; + jobId = par.filenames[3]; + } +} + +void Reporter::openReadClassificationFile() { + readClassificationFile.open(outDir + "/" + jobId + "_classifications.tsv"); +} + +void Reporter::writeReadClassification(const vector & queryList) { + for (size_t i = 0; i < queryList.size(); i++) { + readClassificationFile << queryList[i].isClassified << "\t" << queryList[i].name << "\t" + << queryList[i].classification << "\t" + << queryList[i].queryLength + queryList[i].queryLength2 << "\t" + << queryList[i].score << "\t" + << queryList[i].coverage << "\t" + << queryList[i].hammingDist << "\t" + << taxonomy->getString(taxonomy->taxonNode(queryList[i].classification)->rankIdx) << "\t"; + for (auto it = queryList[i].taxCnt.begin(); it != queryList[i].taxCnt.end(); ++it) { + readClassificationFile << it->first << ":" << it->second << " "; + } + readClassificationFile << "\n"; + } +} + +void Reporter::closeReadClassificationFile() { + readClassificationFile.close(); +} + +void Reporter::writeReportFile(int numOfQuery, unordered_map &taxCnt) { + unordered_map cladeCounts = taxonomy->getCladeCounts(taxCnt); + FILE *fp; + fp = fopen((outDir + + "/" + jobId + "_report.tsv").c_str(), "w"); + writeReport(fp, cladeCounts, numOfQuery); + fclose(fp); + + // Write Krona chart + FILE *kronaFile = fopen((outDir + "/" + jobId + "_krona.html").c_str(), "w"); + fwrite(krona_prelude_html, krona_prelude_html_len, sizeof(char), kronaFile); + fprintf(kronaFile, "%zu", numOfQuery); + 
kronaReport(kronaFile, *taxonomy, cladeCounts, numOfQuery); + fprintf(kronaFile, ""); + +} + +void Reporter::writeReport(FILE *FP, const std::unordered_map &cladeCounts, + unsigned long totalReads, TaxID taxID, int depth) { + std::unordered_map::const_iterator it = cladeCounts.find(taxID); + unsigned int cladeCount = it == cladeCounts.end() ? 0 : it->second.cladeCount; + unsigned int taxCount = it == cladeCounts.end() ? 0 : it->second.taxCount; + if (taxID == 0) { + if (cladeCount > 0) { + fprintf(FP, "%.4f\t%i\t%i\tno rank\t0\tunclassified\n", + 100 * cladeCount / double(totalReads), + cladeCount, taxCount); + } + writeReport(FP, cladeCounts, totalReads, 1); + } else { + if (cladeCount == 0) { + return; + } + const TaxonNode *taxon = taxonomy->taxonNode(taxID); + fprintf(FP, "%.4f\t%i\t%i\t%s\t%i\t%s%s\n", + 100 * cladeCount / double(totalReads), cladeCount, taxCount, + taxonomy->getString(taxon->rankIdx), taxID, std::string(2 * depth, ' ').c_str(), taxonomy->getString(taxon->nameIdx)); + std::vector children = it->second.children; + SORT_SERIAL(children.begin(), children.end(), [&](int a, int b) { return cladeCountVal(cladeCounts, a) > cladeCountVal(cladeCounts, b); }); + for (size_t i = 0; i < children.size(); ++i) { + TaxID childTaxId = children[i]; + if (cladeCounts.count(childTaxId)) { + writeReport(FP, cladeCounts, totalReads, childTaxId, depth + 1); + } else { + break; + } + } + } +} + +unsigned int Reporter::cladeCountVal(const std::unordered_map &map, TaxID key) { + typename std::unordered_map::const_iterator it = map.find(key); + if (it == map.end()) { + return 0; + } else { + return it->second.cladeCount; + } +} \ No newline at end of file diff --git a/src/commons/Reporter.h b/src/commons/Reporter.h new file mode 100644 index 00000000..4de0f32c --- /dev/null +++ b/src/commons/Reporter.h @@ -0,0 +1,42 @@ +#ifndef METABULI_REPORTER_H +#define METABULI_REPORTER_H +#include "common.h" +#include "iostream" +#include "fstream" +#include +#include "NcbiTaxonomy.h" +#include "LocalParameters.h" + +using namespace std; + + +class Reporter { +private: + string outDir; + string jobId; + NcbiTaxonomy * taxonomy; + + // Output + ofstream readClassificationFile; + + +public: + Reporter(const LocalParameters &par, NcbiTaxonomy *taxonomy); + // Write report + + // Read by read classification results + void openReadClassificationFile(); + void writeReadClassification(const vector & queryList); + void closeReadClassificationFile(); + + void writeReportFile(int numOfQuery, unordered_map &taxCnt); + + void writeReport(FILE *FP, const std::unordered_map &cladeCounts, + unsigned long totalReads, TaxID taxID = 0, int depth = 0); + + unsigned int cladeCountVal(const std::unordered_map &map, TaxID key); + +}; + + +#endif //METABULI_REPORTER_H diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp new file mode 100644 index 00000000..c41ddea9 --- /dev/null +++ b/src/commons/Taxonomer.cpp @@ -0,0 +1,1164 @@ +#include "Taxonomer.h" + + +Taxonomer::Taxonomer(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxonomy(taxonomy) { + // Parameters + auto mask = new uint32_t[par.spaceMask.length()]; + for(size_t i = 0, j = 0; i < par.spaceMask.length(); i++){ + mask[i] = par.spaceMask[i] - 48; + spaceNum += (mask[i] == 0); + if(par.spaceMask[i]==1){ + unmaskedPos[j] = (int) i; + j++; + } + } + delete[] mask; + maxGap = par.maxGap; + minCoveredPos = par.minCoveredPos; +} + +Taxonomer::~Taxonomer() { + +} + +void Taxonomer::assignTaxonomy(const Match *matchList, + size_t numOfMatches, + 
std::vector<Query> &queryList,
+                               const LocalParameters &par) {
+    time_t beforeAnalyze = time(nullptr);
+    cout << "Analyzing matches ..." << endl;
+
+    // Divide matches into blocks for multithreading
+    size_t seqNum = queryList.size();
+    MatchBlock *matchBlocks = new MatchBlock[seqNum];
+    size_t matchIdx = 0;
+    size_t blockIdx = 0;
+    uint32_t currentQuery;
+    while (matchIdx < numOfMatches) {
+        currentQuery = matchList[matchIdx].qInfo.sequenceID;
+        matchBlocks[blockIdx].id = currentQuery;
+        matchBlocks[blockIdx].start = matchIdx;
+        while ((matchIdx < numOfMatches) && (currentQuery == matchList[matchIdx].qInfo.sequenceID)) ++matchIdx;
+        matchBlocks[blockIdx].end = matchIdx - 1;
+        blockIdx++;
+    }
+
+    // Process each block
+#pragma omp parallel default(none), shared(cout, matchBlocks, matchList, seqNum, queryList, blockIdx, par)
+    {
+#pragma omp for schedule(dynamic, 1)
+        for (size_t i = 0; i < blockIdx; ++i) {
+            chooseBestTaxon(matchBlocks[i].id,
+                            matchBlocks[i].start,
+                            matchBlocks[i].end,
+                            matchList,
+                            queryList,
+                            par);
+        }
+    }
+
+    for (size_t i = 0; i < seqNum; i++) {
+        ++taxCounts[queryList[i].classification];
+    }
+    delete[] matchBlocks;
+    cout << "Time spent for analyzing: " << double(time(nullptr) - beforeAnalyze) << endl;
+
+}
+
+void Taxonomer::chooseBestTaxon(uint32_t currentQuery,
+                                size_t offset,
+                                size_t end,
+                                const Match *matchList,
+                                vector<Query> & queryList,
+                                const LocalParameters &par) {
+    TaxID selectedTaxon;
+//    if (par.printLog) {
+//        cout << "# " << currentQuery << " " << queryList[currentQuery].name << endl;
+//        for (size_t i = offset; i < end + 1; i++) {
+//            cout << taxId2genusId[matchList[i].targetId] << " " << taxId2speciesId[matchList[i].targetId] <<
+//            " " << matchList[i].targetId << " " << matchList[i].qInfo.frame << " ";
+//            print_binary16(16, matchList[i].rightEndHamming);
+//            cout << " " << matchList[i].qInfo.pos << " " << int(matchList[i].hamming) << " " << int(matchList[i].redundancy) << endl;
+//        }
+//    }
+
+    // Get the best genus for current query
+    vector<Match> genusMatches;
+    genusMatches.reserve(end - offset + 1);
+
+    int res;
+    TaxonScore genusScore(0, 0, 0, 0);
+    if (par.seqMode == 2) {
+        if (par.spaceMask != "11111111"){
+            genusScore = getBestGenusMatches_spaced(genusMatches, matchList, end, offset,
+                                                    queryList[currentQuery].queryLength,
+                                                    queryList[currentQuery].queryLength2);
+        } else {
+            genusScore = getBestGenusMatches(genusMatches, matchList, end, offset,
+                                             queryList[currentQuery].queryLength,
+                                             queryList[currentQuery].queryLength2, par);
+        }
+    } else {
+        if (par.spaceMask != "11111111") {
+            genusScore = getBestGenusMatches_spaced(genusMatches, matchList, end, offset,
+                                                    queryList[currentQuery].queryLength);
+        } else {
+            genusScore = getBestGenusMatches(genusMatches, matchList, end, offset,
+                                             queryList[currentQuery].queryLength, par);
+        }
+    }
+
+//    if (par.printLog) {
+//        cout << "# " << currentQuery << " " << queryList[currentQuery].name << " filtered\n";
+//        for (size_t i = 0; i < genusMatches.size(); i++) {
+//            cout << taxId2genusId[genusMatches[i].targetId] << " " << taxId2speciesId[genusMatches[i].targetId] <<
+//            " " << genusMatches[i].targetId << " " << genusMatches[i].qInfo.frame << " ";
+//            print_binary16(16, genusMatches[i].rightEndHamming);
+//            cout << " " << genusMatches[i].qInfo.pos << " " << int(genusMatches[i].hamming) << " " << int(genusMatches[i].redundancy) << endl;
+//        }
+//        cout << "Genus score: " << genusScore.score << "\n";
+//    }
+
+    // If there is no proper genus for current query, it is unclassified.
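Review note: the blocking loop at the top of assignTaxonomy() depends on matches being sorted by query ID. A standalone sketch with the bounds test placed before the element access, matching the condition order fixed above (hypothetical types):

// --- Review sketch, not part of the patch ---------------------------------
#include <cstddef>
#include <cstdint>
#include <vector>

struct BlockSketch { uint32_t queryId; size_t start, end; }; // end is inclusive

// One linear scan over query IDs yields a contiguous block per query,
// which the OpenMP loop above then processes independently.
std::vector<BlockSketch> blockByQuery(const std::vector<uint32_t> &matchQueryIds) {
    std::vector<BlockSketch> blocks;
    size_t i = 0;
    while (i < matchQueryIds.size()) {
        uint32_t q = matchQueryIds[i];
        size_t start = i;
        while (i < matchQueryIds.size() && matchQueryIds[i] == q) ++i;
        blocks.push_back({q, start, i - 1});
    }
    return blocks;
}
// ---------------------------------------------------------------------------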
+ if (genusScore.score == 0 || genusScore.coverage < par.minCoverage || genusScore.score < par.minScore) { + queryList[currentQuery].isClassified = false; + queryList[currentQuery].classification = 0; + queryList[currentQuery].score = genusScore.score; + queryList[currentQuery].coverage = genusScore.coverage; + queryList[currentQuery].hammingDist = genusScore.hammingDist; + queryList[currentQuery].newSpecies = false; + return; + } + + // If there are two or more good genus level candidates, find the LCA. + if (genusScore.taxId == 0) { + vector genusList; + genusList.reserve(genusMatches.size()); + for (auto & genusMatch : genusMatches) { + genusList.push_back(genusMatch.genusId); + } + selectedTaxon = taxonomy->LCA(genusList)->taxId; + queryList[currentQuery].isClassified = true; + queryList[currentQuery].classification = selectedTaxon; + queryList[currentQuery].score = genusScore.score; + queryList[currentQuery].coverage = genusScore.coverage; + queryList[currentQuery].hammingDist = genusScore.hammingDist; + for (auto & genusMatch : genusMatches) { + queryList[currentQuery].taxCnt[genusMatch.targetId]++; + } + return; + } + + // Choose the species with the highest coverage. + TaxID selectedSpecies; + TaxonScore speciesScore; + vector species; + unordered_map> speciesMatchRange; + if (par.seqMode == 2) { + speciesScore = chooseSpecies(genusMatches, + queryList[currentQuery].queryLength, + queryList[currentQuery].queryLength2, + species, + speciesMatchRange); + } else { + speciesScore = chooseSpecies(genusMatches, + queryList[currentQuery].queryLength, + species, + speciesMatchRange); + } + + + // Classify to LCA if more than one species are selected + if (species.size() > 1) { + queryList[currentQuery].isClassified = true; + queryList[currentQuery].classification = taxonomy->LCA(species)->taxId; + queryList[currentQuery].score = genusScore.score; + queryList[currentQuery].coverage = genusScore.coverage; + queryList[currentQuery].hammingDist = genusScore.hammingDist; + for (auto & genusMatch : genusMatches) { + queryList[currentQuery].taxCnt[genusMatch.targetId]++; + } + return; + } + + // If score is not enough, classify to the parent of the selected species + if (speciesScore.score < par.minSpScore) { + queryList[currentQuery].isClassified = true; + queryList[currentQuery].classification = taxonomy->taxonNode( + taxonomy->getTaxIdAtRank(species[0], "species"))->parentTaxId; + queryList[currentQuery].score = genusScore.score; + queryList[currentQuery].coverage = genusScore.coverage; + queryList[currentQuery].hammingDist = genusScore.hammingDist; + for (auto & genusMatch : genusMatches) { + if(genusMatch.speciesId == species[0]){ + queryList[currentQuery].taxCnt[genusMatch.targetId]++; + } + } + return; + } + + // Sort matches by the position of the query sequence + selectedSpecies = species[0]; +// sort(genusMatches.begin() + speciesMatchRange[selectedSpecies].first, +// genusMatches.begin() + speciesMatchRange[selectedSpecies].second, +// [](const Match & a, const Match & b) { +// if (a.qInfo.position / 3 == b.qInfo.position / 3) +// return a.hamming < b.hamming; +// else +// return a.qInfo.position / 3 < b.qInfo.position / 3; +// }); + + sort(genusMatches.begin() + speciesMatchRange[selectedSpecies].first, + genusMatches.begin() + speciesMatchRange[selectedSpecies].second, + [](const Match & a, const Match & b) { return a.qInfo.pos > b.qInfo.pos; }); + + + TaxID result = lowerRankClassification(genusMatches, speciesMatchRange[selectedSpecies], selectedSpecies); + + // Record matches of 
+    TaxID result = lowerRankClassification(genusMatches, speciesMatchRange[selectedSpecies], selectedSpecies);
+
+    // Record matches of the selected species
+    for (int i = speciesMatchRange[selectedSpecies].first; i < speciesMatchRange[selectedSpecies].second; i++) {
+        queryList[currentQuery].taxCnt[genusMatches[i].targetId]++;
+    }
+
+    // Store classification results
+    queryList[currentQuery].isClassified = true;
+    queryList[currentQuery].classification = result;
+    queryList[currentQuery].score = speciesScore.score;
+    queryList[currentQuery].coverage = speciesScore.coverage;
+    queryList[currentQuery].hammingDist = speciesScore.hammingDist;
+    queryList[currentQuery].newSpecies = false;
+}
+
+TaxID Taxonomer::lowerRankClassification(vector<Match> &matches, pair<int, int> &matchRange, TaxID spTaxId) {
+    int i = matchRange.second - 1;
+    unordered_map<TaxID, int> taxCnt;
+
+    while (i >= matchRange.first) {
+        size_t currQuotient = matches[i].qInfo.pos / 3;
+        uint8_t minHamming = matches[i].hamming;
+        Match * minHammingMatch = & matches[i];
+        TaxID minHammingTaxId = minHammingMatch->targetId;
+        i--;
+        while ((i >= matchRange.first) && (currQuotient == matches[i].qInfo.pos / 3)) {
+            if (matches[i].hamming < minHamming) {
+                minHamming = matches[i].hamming;
+                minHammingMatch = & matches[i];
+                minHammingTaxId = minHammingMatch->targetId;
+            } else if (matches[i].hamming == minHamming) {
+                minHammingTaxId = taxonomy->LCA(minHammingTaxId, matches[i].targetId);
+                minHammingMatch->redundancy = true;
+                matches[i].redundancy = true;
+            }
+            i--;
+        }
+        taxCnt[minHammingTaxId]++;
+    }
+
+    unordered_map<TaxID, TaxonCounts> cladeCnt;
+    getSpeciesCladeCounts(taxCnt, cladeCnt, spTaxId);
+
+    return BFS(cladeCnt, spTaxId);
+}
+
+void Taxonomer::getSpeciesCladeCounts(const unordered_map<TaxID, int> &taxCnt,
+                                      unordered_map<TaxID, TaxonCounts> & cladeCount,
+                                      TaxID speciesTaxID) {
+    for (auto it = taxCnt.begin(); it != taxCnt.end(); ++it) {
+        TaxonNode const * taxon = taxonomy->taxonNode(it->first);
+        cladeCount[taxon->taxId].taxCount = it->second;
+        cladeCount[taxon->taxId].cladeCount += it->second;
+        while (taxon->taxId != speciesTaxID) {
+            if (find(cladeCount[taxon->parentTaxId].children.begin(),
+                     cladeCount[taxon->parentTaxId].children.end(),
+                     taxon->taxId) == cladeCount[taxon->parentTaxId].children.end()) {
+                cladeCount[taxon->parentTaxId].children.push_back(taxon->taxId);
+            }
+            cladeCount[taxon->parentTaxId].cladeCount += it->second;
+            taxon = taxonomy->taxonNode(taxon->parentTaxId);
+        }
+    }
+}
+
+TaxID Taxonomer::BFS(const unordered_map<TaxID, TaxonCounts> & cladeCnt, TaxID root) {
+    if (cladeCnt.at(root).children.empty()) { // root is a leaf
+        return root;
+    }
+    unsigned int maxCnt = 3;
+    unsigned int currentCnt;
+    vector<TaxID> bestChildren;
+    for (auto it = cladeCnt.at(root).children.begin(); it != cladeCnt.at(root).children.end(); it++) {
+        currentCnt = cladeCnt.at(*it).cladeCount;
+        if (currentCnt > maxCnt) {
+            bestChildren.clear();
+            bestChildren.push_back(*it);
+            maxCnt = currentCnt;
+        } else if (currentCnt == maxCnt) {
+            bestChildren.push_back(*it);
+        }
+    }
+    if (bestChildren.size() == 1) {
+        return BFS(cladeCnt, bestChildren[0]);
+    } else {
+        return root;
+    }
+}
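+
+// A hypothetical example of the descent above: given cladeCnt with
+//   species X (cladeCount 10) -> strain A (7) and strain B (3),
+// strain A is the unique child with the highest clade count (at least the
+// initial support of 3), so the recursion continues into A; if A and B were
+// tied, or if no child reached that support, X itself would be returned.
+
+TaxonScore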
Taxonomer::getBestGenusMatches(vector &genusMatches, const Match *matchList, size_t end, + size_t offset, int readLength1, int readLength2, const LocalParameters & par) { + TaxID currentGenus; + TaxID currentSpecies; + + vector filteredMatches; + vector> matchesForEachGenus; + vector genusScores; + TaxonScore bestScore; + size_t i = offset; + uint8_t curFrame; + vector curFrameMatches; + while (i < end + 1) { +// currentGenus = taxId2genusId[matchList[i].targetId]; + currentGenus = matchList[i].genusId; + // For current genus + while ((i < end + 1) && currentGenus == matchList[i].genusId) { +// currentSpecies = taxId2speciesId[matchList[i].targetId]; + currentSpecies = matchList[i].speciesId; +// if (par.printLog) { +// cout << currentGenus << " " << currentSpecies << endl; +// } + // For current species + while ((i < end + 1) && currentSpecies == matchList[i].speciesId) { + curFrame = matchList[i].qInfo.frame; + curFrameMatches.clear(); + + // For current frame + while ((i < end + 1) && currentSpecies == matchList[i].speciesId + && curFrame == matchList[i].qInfo.frame) { + curFrameMatches.push_back(&matchList[i]); + i ++; + } + if (curFrameMatches.size() > 1) { + remainConsecutiveMatches(curFrameMatches, filteredMatches, currentGenus, par); + } + } + } + + // Construct a match combination using filtered matches of current genus + // so that it can best cover the query, and score the combination + if (!filteredMatches.empty()) { + matchesForEachGenus.push_back(filteredMatches); + genusScores.push_back(scoreGenus(filteredMatches, readLength1, readLength2)); + } + filteredMatches.clear(); + } + + // If there are no meaningful genus + if (genusScores.empty()) { + bestScore.score = 0; + return bestScore; + } + + TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), + [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); + + vector maxIdx; + for (size_t g = 0; g < genusScores.size(); g++) { + if (genusScores[g].score > maxScore.score * 0.95f) { + maxIdx.push_back(g); + } + } + bestScore = maxScore; + + for (unsigned long g : maxIdx) { + for (const Match * m : matchesForEachGenus[g]) { + genusMatches.push_back(*m); + } + } + + + + // More than one genus + if (maxIdx.size() > 1) { + bestScore.taxId = 0; + return bestScore; + } + + return bestScore; + + //Three cases + //1. one genus + //2. more than one genus + //4. 
no genus +} + +void Taxonomer::remainConsecutiveMatches(vector & curFrameMatches, + vector & filteredMatches, + TaxID genusId, + const LocalParameters & par) { + size_t i = 0; + size_t end = curFrameMatches.size(); + vector> curPosMatches; // + vector> nextPosMatches; + map> linkedMatches; // + + size_t currPos = curFrameMatches[0]->qInfo.pos; + while ( i < end && curFrameMatches[i]->qInfo.pos == currPos) { + curPosMatches.emplace_back(curFrameMatches[i], i); + i++; + } + while (i < end) { + uint32_t nextPos = curFrameMatches[i]->qInfo.pos; + while (i < end && nextPos == curFrameMatches[i]->qInfo.pos) { + nextPosMatches.emplace_back(curFrameMatches[i], i); + ++ i; + } + // Check if current position and next position are consecutive + if (currPos + 3 == nextPos) { + // Compare curPosMatches and nextPosMatches + for (auto &curPosMatch: curPosMatches) { + for (auto &nextPosMatch: nextPosMatches) { + if (isConsecutive(curPosMatch.first, nextPosMatch.first)) { + linkedMatches[curPosMatch.second].push_back(nextPosMatch.second); + } + } + } + + } + // Update curPosMatches and nextPosMatches + curPosMatches = nextPosMatches; + nextPosMatches.clear(); + currPos = nextPos; + } + // Print linkedMatches +// if (par.printLog) { +// cout << "linkedMatches: " << endl; +// for (const auto &entry: linkedMatches) { +// cout << entry.first << ": "; +// for (auto &idx: entry.second) { +// cout << idx << " "; +// } +// cout << endl; +// } +// } + + // Iterate linkedMatches to get filteredMatches + int MIN_DEPTH = par.minConsCnt - 1; + if (taxonomy->IsAncestor(par.eukaryotaTaxId, genusId)) { + MIN_DEPTH = par.minConsCntEuk - 1; + } + unordered_set used; + vector filteredMatchIdx; + unordered_map idx2depth; + for (const auto& entry : linkedMatches) { + if (!used.count(entry.first)) { + used.insert(entry.first); + vector curMatches; + DFS(entry.first, linkedMatches, filteredMatchIdx, 0, MIN_DEPTH, used, idx2depth); + } + } + +// if (par.printLog) { +// cout << "filteredMatchIdx: "; +// for (auto &idx: filteredMatchIdx) { +// cout << idx << " "; +// } +// cout << endl; +// } + + for (auto &idx: filteredMatchIdx) { + filteredMatches.push_back(curFrameMatches[idx]); + } +} + + +size_t Taxonomer::DFS(size_t curMatchIdx, const map> & linkedMatches, + vector& filteredMatches, size_t depth, size_t MIN_DEPTH, unordered_set& used, + unordered_map & idx2depth) { + depth++; + size_t maxDepth = 0; + size_t returnDepth = 0; + if (linkedMatches.find(curMatchIdx) == linkedMatches.end()) { //|| linkedMatches.at(curMatchIdx).empty()) { + // reached a leaf node + idx2depth[curMatchIdx] = depth; + if (depth > MIN_DEPTH) { + filteredMatches.push_back(curMatchIdx); + } + return depth; + } else { // not a leaf node + for (auto &nextMatchIdx: linkedMatches.at(curMatchIdx)) { + used.insert(nextMatchIdx); + if (idx2depth.find(nextMatchIdx) != idx2depth.end()) { + returnDepth = idx2depth[nextMatchIdx]; + maxDepth = max(maxDepth, returnDepth); + continue; + } + returnDepth = DFS(nextMatchIdx, linkedMatches, filteredMatches, depth, MIN_DEPTH, used, idx2depth); + maxDepth = max(maxDepth, returnDepth); + } + if (maxDepth > MIN_DEPTH) { + filteredMatches.push_back(curMatchIdx); + idx2depth[curMatchIdx] = maxDepth; + } + } + return maxDepth; +} + +TaxonScore Taxonomer::getBestGenusMatches_spaced(vector &genusMatches, const Match *matchList, size_t end, + size_t offset, int readLength1, int readLength2) { + TaxID currentGenus; + TaxID currentSpecies; + + vector tempMatchContainer; + vector filteredMatches; + vector> matchesForEachGenus; + vector 
conservedWithinGenus; + vector genusScores; + TaxonScore bestScore; + size_t i = offset; + bool lastIn; + while (i + 1 < end + 1) { + currentGenus = matchList[i].genusId; + // For current genus + while ((i + 1 < end + 1) && currentGenus == matchList[i].genusId) { +// currentSpecies = taxId2speciesId[matchList[i].targetId]; + currentSpecies = matchList[i].speciesId; + // For current species + // Filter un-consecutive matches (probably random matches) + lastIn = false; + int distance = 0; + int diffPosCntOfCurrRange = 1; + int dnaDist = 0; + + // For the same species + while ((i + 1 < end + 1) && currentSpecies == matchList[i + 1].speciesId) { + distance = matchList[i+1].qInfo.pos / 3 - matchList[i].qInfo.pos / 3; + dnaDist = matchList[i+1].qInfo.pos - matchList[i].qInfo.pos; + if (distance == 0) { // At the same position + tempMatchContainer.push_back(matchList + i); + } else if (dnaDist < (8 + spaceNum + maxGap) * 3) { // Overlapping + lastIn = true; + tempMatchContainer.push_back(matchList + i); + diffPosCntOfCurrRange ++; + } else { // Not consecutive --> End range + if (lastIn){ + tempMatchContainer.push_back(matchList + i); + if (diffPosCntOfCurrRange >= minCoveredPos) { + filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), + tempMatchContainer.end()); + } + } + lastIn = false; + // Initialize range info + tempMatchContainer.clear(); + diffPosCntOfCurrRange = 1; + } + i++; + } + + // Met next species + if (lastIn) { + tempMatchContainer.push_back(matchList + i); + if (diffPosCntOfCurrRange >= minCoveredPos) { + filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), + tempMatchContainer.end()); + } + } + tempMatchContainer.clear(); + i++; + } + + // Construct a match combination using filtered matches of current genus + // so that it can best cover the query, and score the combination + if (!filteredMatches.empty()) { + genusScores.push_back(scoreGenus(filteredMatches, readLength1, readLength2)); + } + filteredMatches.clear(); + } + + // If there are no meaningful genus + if (genusScores.empty()) { + bestScore.score = 0; + return bestScore; + } + + TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), + [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); + + vector maxIdx; + for (size_t g = 0; g < genusScores.size(); g++) { + if (genusScores[g].score > maxScore.score * 0.95f) { + maxIdx.push_back(g); + } + } + bestScore = maxScore; + + for (unsigned long g : maxIdx) { + for (const Match * m : matchesForEachGenus[g]) { + genusMatches.push_back(*m); + } + } + + // More than one genus + if (maxIdx.size() > 1) { + bestScore.taxId = 0; + return bestScore; + } + return bestScore; + + //Three cases + //1. one genus + //2. more than one genus + //4. 
no genus +} + +TaxonScore Taxonomer::getBestGenusMatches(vector &genusMatches, const Match *matchList, size_t end, + size_t offset, int queryLength, const LocalParameters & par) { + TaxID currentGenus; + TaxID currentSpecies; + + vector filteredMatches; + vector> matchesForEachGenus; + vector genusScores; + TaxonScore bestScore; + size_t i = offset; + uint8_t curFrame; + vector curFrameMatches; + while (i < end + 1) { + currentGenus = matchList[i].genusId; + // For current genus + while ((i < end + 1) && currentGenus == matchList[i].genusId) { + currentSpecies = matchList[i].speciesId; + + // For current species + while ((i < end + 1) && currentSpecies == matchList[i].speciesId) { + curFrame = matchList[i].qInfo.frame; + curFrameMatches.clear(); + + // For current frame + while ((i < end + 1) && currentSpecies == matchList[i].speciesId + && curFrame == matchList[i].qInfo.frame) { + curFrameMatches.push_back(&matchList[i]); + i ++; + } + if (curFrameMatches.size() > 1) { + remainConsecutiveMatches(curFrameMatches, filteredMatches, currentGenus, par); + } + } + } + + // Construct a match combination using filtered matches of current genus + // so that it can best cover the query, and score the combination + + if (!filteredMatches.empty()) { + matchesForEachGenus.push_back(filteredMatches); + genusScores.push_back(scoreGenus(filteredMatches, queryLength)); + } + filteredMatches.clear(); + } + + // If there are no meaningful genus + if (genusScores.empty()) { + bestScore.score = 0; + return bestScore; + } + + TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), + [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); + + vector maxIdx; + for (size_t g = 0; g < genusScores.size(); g++) { + if (genusScores[g].score > maxScore.score * 0.95f) { + maxIdx.push_back(g); + } + } + + bestScore = maxScore; + + for (unsigned long g : maxIdx) { + for (const Match * m : matchesForEachGenus[g]) { + genusMatches.push_back(*m); + } + } + + // More than one genus + if (maxIdx.size() > 1) { + bestScore.taxId = 0; + return bestScore; + } + return bestScore; + + //Three cases + //1. one genus + //2. more than one genus + //4. 
no genus +} + +TaxonScore Taxonomer::getBestGenusMatches_spaced(vector &genusMatches, const Match *matchList, size_t end, + size_t offset, int readLength) { + TaxID currentGenus; + TaxID currentSpecies; + + vector tempMatchContainer; + vector filteredMatches; + vector> matchesForEachGenus; + vector conservedWithinGenus; + vector genusScores; + TaxonScore bestScore; + size_t i = offset; + bool lastIn; + size_t speciesMatchCnt; + while (i + 1 < end + 1) { + currentGenus = matchList[i].genusId; + // For current genus + while ((i + 1 < end + 1) && currentGenus == matchList[i].genusId) { + currentSpecies = matchList[i].speciesId; + // For current species + // Filter un-consecutive matches (probably random matches) + lastIn = false; + int distance = 0; + int diffPosCntOfCurrRange = 1; + int dnaDist = 0; + + // For the same species + while ((i + 1 < end + 1) && currentSpecies == matchList[i + 1].speciesId) { + distance = matchList[i + 1].qInfo.pos / 3 - matchList[i].qInfo.pos / 3; + dnaDist = matchList[i + 1].qInfo.pos - matchList[i].qInfo.pos; + if (distance == 0) { // At the same position + tempMatchContainer.push_back(matchList + i); + } else if (dnaDist < (8 + spaceNum + maxGap) * 3) { // Overlapping + lastIn = true; + tempMatchContainer.push_back(matchList + i); + diffPosCntOfCurrRange++; + } else { // Not consecutive --> End range + if (lastIn) { + tempMatchContainer.push_back(matchList + i); + if (diffPosCntOfCurrRange >= minCoveredPos) { + filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), + tempMatchContainer.end()); + } + } + lastIn = false; + // Initialize range info + tempMatchContainer.clear(); + diffPosCntOfCurrRange = 1; + } + i++; + } + + // Met next species + if (lastIn) { + tempMatchContainer.push_back(matchList + i); + if (diffPosCntOfCurrRange >= minCoveredPos) { + filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), + tempMatchContainer.end()); + } + } + tempMatchContainer.clear(); + i++; + } + + // Construct a match combination using filtered matches of current genus + // so that it can best cover the query, and score the combination + if (!filteredMatches.empty()) { + genusScores.push_back(scoreGenus(filteredMatches, readLength)); + } + filteredMatches.clear(); + } + + // If there are no meaningful genus + if (genusScores.empty()) { + bestScore.score = 0; + return bestScore; + } + + TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), + [](const TaxonScore &a, const TaxonScore &b) { return a.score < b.score; }); + + vector maxIdx; + for (size_t g = 0; g < genusScores.size(); g++) { + if (genusScores[g].score > maxScore.score * 0.95f) { + maxIdx.push_back(g); + } + } + bestScore = maxScore; + + for (unsigned long g: maxIdx) { + genusMatches.insert(genusMatches.end(), + matchesForEachGenus[g].begin(), + matchesForEachGenus[g].end()); + } + + // More than one genus + if (maxIdx.size() > 1) { + bestScore.taxId = 0; + return bestScore; + } + return bestScore; + + //Three cases + //1. one genus + //2. more than one genus + //4. 
no genus +} + +TaxonScore Taxonomer::scoreGenus(vector &filteredMatches, + int queryLength) { + // Calculate Hamming distance & covered length + int coveredPosCnt = 0; + uint16_t currHammings; + int aminoAcidNum = (int) queryLength / 3; + int currPos; + size_t matchNum = filteredMatches.size(); + size_t f = 0; + + // Get the largest hamming distance at each position of query + auto *hammingsAtEachPos = new signed char[aminoAcidNum + 1]; + memset(hammingsAtEachPos, -1, (aminoAcidNum + 1)); + while (f < matchNum) { + currPos = filteredMatches[f]->qInfo.pos / 3; + currHammings = filteredMatches[f]->rightEndHamming; + if (GET_2_BITS(currHammings) > hammingsAtEachPos[currPos + unmaskedPos[0]]) + hammingsAtEachPos[currPos + unmaskedPos[0]] = GET_2_BITS(currHammings); + if (GET_2_BITS(currHammings >> 2) > hammingsAtEachPos[currPos + unmaskedPos[1]]) + hammingsAtEachPos[currPos + unmaskedPos[1]] = GET_2_BITS(currHammings >> 2); + if (GET_2_BITS(currHammings >> 4) > hammingsAtEachPos[currPos + unmaskedPos[2]]) + hammingsAtEachPos[currPos + unmaskedPos[2]] = GET_2_BITS(currHammings >> 4); + if (GET_2_BITS(currHammings >> 6) > hammingsAtEachPos[currPos + unmaskedPos[3]]) + hammingsAtEachPos[currPos + unmaskedPos[3]] = GET_2_BITS(currHammings >> 6); + if (GET_2_BITS(currHammings >> 8) > hammingsAtEachPos[currPos + unmaskedPos[4]]) + hammingsAtEachPos[currPos + unmaskedPos[4]] = GET_2_BITS(currHammings >> 8); + if (GET_2_BITS(currHammings >> 10) > hammingsAtEachPos[currPos + unmaskedPos[5]]) + hammingsAtEachPos[currPos + unmaskedPos[5]] = GET_2_BITS(currHammings >> 10); + if (GET_2_BITS(currHammings >> 12) > hammingsAtEachPos[currPos + unmaskedPos[6]]) + hammingsAtEachPos[currPos + unmaskedPos[6]] = GET_2_BITS(currHammings >> 12); + if (GET_2_BITS(currHammings >> 14) > hammingsAtEachPos[currPos + unmaskedPos[7]]) + hammingsAtEachPos[currPos + unmaskedPos[7]] = GET_2_BITS(currHammings >> 14); + f++; + } + + // Sum up hamming distances and count the number of position covered by the matches. + float hammingSum = 0; + for (int h = 0; h < aminoAcidNum; h++) { + if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist. + coveredPosCnt++; + } else if (hammingsAtEachPos[h] != -1) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. 
respectively + hammingSum += 1.0f + (0.5f * hammingsAtEachPos[h]); + coveredPosCnt++; + } + } + delete[] hammingsAtEachPos; + + // Score current genus + int coveredLength = coveredPosCnt * 3; + if (coveredLength > queryLength) coveredLength = queryLength; + float score = ((float) coveredLength - hammingSum) / (float) queryLength; + float coverage = (float) (coveredLength) / (float) (queryLength); + + return {filteredMatches[0]->genusId, score, coverage, (int) hammingSum}; +} + +TaxonScore Taxonomer::scoreGenus(vector &filteredMatches, + int readLength1, + int readLength2) { + + // Calculate Hamming distance & covered length + uint16_t currHammings; + int aminoAcidNum_total = ((int) readLength1 / 3) + ((int) readLength2 / 3); + int aminoAcidNum_read1 = ((int) readLength1 / 3); + int currPos; + size_t matchNum = filteredMatches.size(); + size_t f = 0; + + // Get the largest hamming distance at each position of query + auto *hammingsAtEachPos = new signed char[aminoAcidNum_total + 3]; + memset(hammingsAtEachPos, -1, (aminoAcidNum_total + 3)); + while (f < matchNum) { + currPos = (int) filteredMatches[f]->qInfo.pos / 3; + currHammings = filteredMatches[f]->rightEndHamming; + if (GET_2_BITS(currHammings) > hammingsAtEachPos[currPos + unmaskedPos[0]]) + hammingsAtEachPos[currPos + unmaskedPos[0]] = GET_2_BITS(currHammings); + if (GET_2_BITS(currHammings >> 2) > hammingsAtEachPos[currPos + unmaskedPos[1]]) + hammingsAtEachPos[currPos + unmaskedPos[1]] = GET_2_BITS(currHammings >> 2); + if (GET_2_BITS(currHammings >> 4) > hammingsAtEachPos[currPos + unmaskedPos[2]]) + hammingsAtEachPos[currPos + unmaskedPos[2]] = GET_2_BITS(currHammings >> 4); + if (GET_2_BITS(currHammings >> 6) > hammingsAtEachPos[currPos + unmaskedPos[3]]) + hammingsAtEachPos[currPos + unmaskedPos[3]] = GET_2_BITS(currHammings >> 6); + if (GET_2_BITS(currHammings >> 8) > hammingsAtEachPos[currPos + unmaskedPos[4]]) + hammingsAtEachPos[currPos + unmaskedPos[4]] = GET_2_BITS(currHammings >> 8); + if (GET_2_BITS(currHammings >> 10) > hammingsAtEachPos[currPos + unmaskedPos[5]]) + hammingsAtEachPos[currPos + unmaskedPos[5]] = GET_2_BITS(currHammings >> 10); + if (GET_2_BITS(currHammings >> 12) > hammingsAtEachPos[currPos + unmaskedPos[6]]) + hammingsAtEachPos[currPos + unmaskedPos[6]] = GET_2_BITS(currHammings >> 12); + if (GET_2_BITS(currHammings >> 14) > hammingsAtEachPos[currPos + unmaskedPos[7]]) + hammingsAtEachPos[currPos + unmaskedPos[7]] = GET_2_BITS(currHammings >> 14); + f++; + } + + // Sum up hamming distances and count the number of position covered by the matches. + float hammingSum = 0; + int coveredPosCnt_read1 = 0; + int coveredPosCnt_read2 = 0; + for (int h = 0; h < aminoAcidNum_total; h++) { + // Read 1 + if (h < aminoAcidNum_read1) { + if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist. + coveredPosCnt_read1++; + } else if (hammingsAtEachPos[h] != -1) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively + hammingSum += 1.0f + (0.5f * (float) hammingsAtEachPos[h]); + coveredPosCnt_read1++; + } + } + // Read 2 + else { + if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist. + coveredPosCnt_read2++; + } else if (hammingsAtEachPos[h] != -1) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. 
respectively + hammingSum += 1.0f + (0.5f * (float) hammingsAtEachPos[h]); + coveredPosCnt_read2++; + } + } + } + delete[] hammingsAtEachPos; + + // Score current genus + int coveredLength_read1 = coveredPosCnt_read1 * 3; + int coveredLength_read2 = coveredPosCnt_read2 * 3; + if (coveredLength_read1 > readLength1) coveredLength_read1 = readLength1; + if (coveredLength_read2 > readLength2) coveredLength_read2 = readLength2; + float score = + ((float) (coveredLength_read1 + coveredLength_read2) - hammingSum) / (float) (readLength1 + readLength2); + float coverage = (float) (coveredLength_read1 + coveredLength_read2) / (float) (readLength1 + readLength2); + +// matchesForEachGenus.push_back(move(filteredMatches)); + return {filteredMatches[0]->genusId, score, coverage, (int) hammingSum}; +} + +TaxonScore Taxonomer::chooseSpecies(const vector &matches, + int queryLength, + vector &species, + unordered_map> & speciesMatchRange) { + // Score each species + std::unordered_map speciesScores; + size_t i = 0; + TaxID currentSpeices; + size_t numOfMatch = matches.size(); + size_t speciesBegin, speciesEnd; + while (i < numOfMatch) { + currentSpeices = matches[i].speciesId; + speciesBegin = i; + while ((i < numOfMatch) && currentSpeices == matches[i].speciesId) { + i++; + } + speciesEnd = i; + speciesScores[currentSpeices] = scoreSpecies(matches, speciesBegin, speciesEnd, queryLength); + speciesMatchRange[currentSpeices] = {(int) speciesBegin, (int) speciesEnd}; + speciesScores[currentSpeices].taxId = currentSpeices; + } + + // Get the best species + TaxonScore bestScore; + for (auto & sp : speciesScores) { + if (sp.second.score > bestScore.score) { + species.clear(); + species.push_back(sp.first); + bestScore = sp.second; + } else if (sp.second.coverage == bestScore.coverage) { + species.push_back(sp.first); + } + } + return bestScore; +} + +TaxonScore Taxonomer::chooseSpecies(const vector &matches, + int read1Length, + int read2Length, + vector &species, + unordered_map> & speciesMatchRange) { + // Score each species + std::unordered_map speciesScores; + + + size_t i = 0; + TaxID currentSpeices; + size_t numOfMatch = matches.size(); + size_t speciesBegin, speciesEnd; + while (i < numOfMatch) { + currentSpeices = matches[i].speciesId; + speciesBegin = i; + while ((i < numOfMatch) && currentSpeices == matches[i].speciesId) { + i++; + } + speciesEnd = i; + speciesScores[currentSpeices] = scoreSpecies(matches, speciesBegin, speciesEnd, read1Length, read2Length); + speciesMatchRange[currentSpeices] = {(int) speciesBegin, (int) speciesEnd}; + speciesScores[currentSpeices].taxId = currentSpeices; + } + + // Get the best species + TaxonScore bestScore; + for (auto & sp : speciesScores) { + if (sp.second.score > bestScore.score) { + species.clear(); + species.push_back(sp.first); + bestScore = sp.second; + } else if (sp.second.coverage == bestScore.coverage) { + species.push_back(sp.first); + } + } + return bestScore; +} + +TaxonScore Taxonomer::scoreSpecies(const vector &matches, + size_t begin, + size_t end, + int queryLength) { + + // Get the largest hamming distance at each position of query + int aminoAcidNum = queryLength / 3; + auto *hammingsAtEachPos = new signed char[aminoAcidNum + 1]; + memset(hammingsAtEachPos, -1, (aminoAcidNum + 1)); + int currPos; + size_t walker = begin; + uint16_t currHammings; + while (walker < end) { + currPos = matches[walker].qInfo.pos / 3; + currHammings = matches[walker].rightEndHamming; + if (GET_2_BITS(currHammings) > hammingsAtEachPos[currPos + unmaskedPos[0]]) + 
hammingsAtEachPos[currPos + unmaskedPos[0]] = GET_2_BITS(currHammings); + if (GET_2_BITS(currHammings >> 2) > hammingsAtEachPos[currPos + unmaskedPos[1]]) + hammingsAtEachPos[currPos + unmaskedPos[1]] = GET_2_BITS(currHammings >> 2); + if (GET_2_BITS(currHammings >> 4) > hammingsAtEachPos[currPos + unmaskedPos[2]]) + hammingsAtEachPos[currPos + unmaskedPos[2]] = GET_2_BITS(currHammings >> 4); + if (GET_2_BITS(currHammings >> 6) > hammingsAtEachPos[currPos + unmaskedPos[3]]) + hammingsAtEachPos[currPos + unmaskedPos[3]] = GET_2_BITS(currHammings >> 6); + if (GET_2_BITS(currHammings >> 8) > hammingsAtEachPos[currPos + unmaskedPos[4]]) + hammingsAtEachPos[currPos + unmaskedPos[4]] = GET_2_BITS(currHammings >> 8); + if (GET_2_BITS(currHammings >> 10) > hammingsAtEachPos[currPos + unmaskedPos[5]]) + hammingsAtEachPos[currPos + unmaskedPos[5]] = GET_2_BITS(currHammings >> 10); + if (GET_2_BITS(currHammings >> 12) > hammingsAtEachPos[currPos + unmaskedPos[6]]) + hammingsAtEachPos[currPos + unmaskedPos[6]] = GET_2_BITS(currHammings >> 12); + if (GET_2_BITS(currHammings >> 14) > hammingsAtEachPos[currPos + unmaskedPos[7]]) + hammingsAtEachPos[currPos + unmaskedPos[7]] = GET_2_BITS(currHammings >> 14); + walker++; + } + + // Sum up hamming distances and count the number of position covered by the matches. + float hammingSum = 0; + int hammingDist = 0; + int coveredPosCnt = 0; + for (int h = 0; h < aminoAcidNum; h++) { + if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist. + coveredPosCnt++; + } else if (hammingsAtEachPos[h] != -1) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively + hammingSum += 1.0f + (0.5f * (float) hammingsAtEachPos[h]); + hammingDist += hammingsAtEachPos[h]; + coveredPosCnt++; + } + } + delete[] hammingsAtEachPos; + // Score + int coveredLength = coveredPosCnt * 3; + if (coveredLength >= queryLength) coveredLength = queryLength; + + float score = ((float)coveredLength - hammingSum) / (float) queryLength; + float coverage = (float) coveredLength / (float) (queryLength); + + return {0, score, coverage, hammingDist}; +} + +TaxonScore Taxonomer::scoreSpecies(const vector &matches, + size_t begin, + size_t end, + int queryLength, + int queryLength2) { + + // Get the smallest hamming distance at each position of query + int aminoAcidNum_total = queryLength / 3 + queryLength2 / 3; + int aminoAcidNum_read1 = queryLength / 3; + auto *hammingsAtEachPos = new signed char[aminoAcidNum_total + 3]; + memset(hammingsAtEachPos, -1, (aminoAcidNum_total + 3)); + + int currPos; + size_t walker = begin; + uint16_t currHammings; + + while (walker < end) { + currPos = matches[walker].qInfo.pos / 3; + currHammings = matches[walker].rightEndHamming; + if (GET_2_BITS(currHammings) > hammingsAtEachPos[currPos + unmaskedPos[0]]) + hammingsAtEachPos[currPos + unmaskedPos[0]] = GET_2_BITS(currHammings); + if (GET_2_BITS(currHammings >> 2) > hammingsAtEachPos[currPos + unmaskedPos[1]]) + hammingsAtEachPos[currPos + unmaskedPos[1]] = GET_2_BITS(currHammings >> 2); + if (GET_2_BITS(currHammings >> 4) > hammingsAtEachPos[currPos + unmaskedPos[2]]) + hammingsAtEachPos[currPos + unmaskedPos[2]] = GET_2_BITS(currHammings >> 4); + if (GET_2_BITS(currHammings >> 6) > hammingsAtEachPos[currPos + unmaskedPos[3]]) + hammingsAtEachPos[currPos + unmaskedPos[3]] = GET_2_BITS(currHammings >> 6); + if (GET_2_BITS(currHammings >> 8) > hammingsAtEachPos[currPos + unmaskedPos[4]]) + hammingsAtEachPos[currPos + unmaskedPos[4]] = GET_2_BITS(currHammings >> 8); + if (GET_2_BITS(currHammings >> 10) > 
hammingsAtEachPos[currPos + unmaskedPos[5]]) + hammingsAtEachPos[currPos + unmaskedPos[5]] = GET_2_BITS(currHammings >> 10); + if (GET_2_BITS(currHammings >> 12) > hammingsAtEachPos[currPos + unmaskedPos[6]]) + hammingsAtEachPos[currPos + unmaskedPos[6]] = GET_2_BITS(currHammings >> 12); + if (GET_2_BITS(currHammings >> 14) > hammingsAtEachPos[currPos + unmaskedPos[7]]) + hammingsAtEachPos[currPos + unmaskedPos[7]] = GET_2_BITS(currHammings >> 14); + walker++; + } + + // Sum up hamming distances and count the number of position covered by the matches. + float hammingSum = 0; + int hammingDist = 0; + int coveredPosCnt_read1 = 0; + int coveredPosCnt_read2 = 0; + for (int h = 0; h < aminoAcidNum_total; h++) { + // Read 1 + if (h < aminoAcidNum_read1) { + if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist. + coveredPosCnt_read1++; + } else if (hammingsAtEachPos[h] != -1) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively + hammingSum += 1.0f + (0.5f * (float) hammingsAtEachPos[h]); + hammingDist += hammingsAtEachPos[h]; + coveredPosCnt_read1++; + } + } + // Read 2 + else { + if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist. + coveredPosCnt_read2++; + } else if (hammingsAtEachPos[h] != -1) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively + hammingSum += 1.0f + (0.5f * (float) hammingsAtEachPos[h]); + hammingDist += hammingsAtEachPos[h]; + coveredPosCnt_read2++; + } + } + } + delete[] hammingsAtEachPos; + + // Score + int coveredLength_read1 = coveredPosCnt_read1 * 3; + int coveredLength_read2 = coveredPosCnt_read2 * 3; + if (coveredLength_read1 >= queryLength) coveredLength_read1 = queryLength; + if (coveredLength_read2 >= queryLength2) coveredLength_read2 = queryLength2; + + float score = ((float) (coveredLength_read1 + coveredLength_read2) - hammingSum) / (float) (queryLength + queryLength2); + float coverage = (float) (coveredLength_read1 + coveredLength_read2) / (float) (queryLength + queryLength2); + + return {0, score, coverage, hammingDist}; +} + +bool Taxonomer::isConsecutive(const Match * match1, const Match * match2) { + return (match1->rightEndHamming >> 2) == (match2->rightEndHamming & 0x3FFF); +} \ No newline at end of file diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h new file mode 100644 index 00000000..d7de78ac --- /dev/null +++ b/src/commons/Taxonomer.h @@ -0,0 +1,133 @@ +#ifndef METABULI_TAXONOMER_H +#define METABULI_TAXONOMER_H +#include "NcbiTaxonomy.h" +#include "LocalParameters.h" +#include "Match.h" +#include "common.h" +#include "BitManipulateMacros.h" +#include + +using namespace std; + +struct TaxonScore { + TaxID taxId; + float score; + float coverage; + int hammingDist; + TaxonScore(TaxID taxId, float score, float coverage, int hammingDist) : + taxId(taxId), score(score), coverage(coverage), hammingDist(hammingDist) {} + TaxonScore() : taxId(0), score(0.0f), coverage(0.0f), hammingDist(0) {} +}; + +class Taxonomer { +private: + NcbiTaxonomy * taxonomy; + + // spaced k-mer + int unmaskedPos[9]; + int spaceNum; + + // Parameters + int maxGap; + int minCoveredPos; + + struct MatchBlock { + MatchBlock(size_t start, size_t end, int id) : start(start), end(end), id(id) {} + MatchBlock() : start(0), end(0), id(0) {} + size_t start; + size_t end; + uint32_t id; + }; + + + + // Output + unordered_map taxCounts; + + +public: + Taxonomer(const LocalParameters & par, NcbiTaxonomy * taxonomy); + ~Taxonomer(); + + void assignTaxonomy(const Match *matchList, + size_t numOfMatches, + std::vector & queryList, + const 
LocalParameters &par); + + void chooseBestTaxon(uint32_t currentQuery, + size_t offset, + size_t end, + const Match *matchList, + vector & queryList, + const LocalParameters &par); + + void remainConsecutiveMatches(vector & curFrameMatches, + vector & filteredMatches, + TaxID genusId, + const LocalParameters & par); + + size_t DFS(size_t curMatchIdx, const map>& linkedMatches, + vector& fiteredMatchIdx, size_t depth, size_t MIN_DEPTH, unordered_set& used, + unordered_map & idx2depth); + + static bool isConsecutive(const Match * match1, const Match * match2); + + TaxonScore getBestGenusMatches(vector &matchesForMajorityLCA, const Match *matchList, size_t end, + size_t offset, int queryLength, const LocalParameters &par); + + TaxonScore getBestGenusMatches(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, + int readLength1, int readLength2, const LocalParameters &par); + + TaxonScore getBestGenusMatches_spaced(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, + int readLength1, int readLength2); + TaxonScore getBestGenusMatches_spaced(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, + int readLength1); + + TaxonScore scoreGenus(vector &filteredMatches, + int queryLength); + + TaxonScore scoreGenus(vector &filteredMatches, + int readLength1, + int readLength2); + + void scoreGenus_ExtensionScore(vector &filteredMatches, + vector> &matchesForEachGenus, + vector &scoreOfEachGenus, + int readLength1, int readLength2); + + TaxonScore chooseSpecies(const std::vector &matches, + int queryLength, + vector &species, + unordered_map> & speciesMatchRange); + + TaxonScore chooseSpecies(const std::vector &matches, + int read1Length, + int read2Length, + vector &species, + unordered_map> & speciesMatchRange); + + TaxonScore scoreSpecies(const vector &matches, + size_t begin, + size_t end, + int queryLength); + + TaxonScore scoreSpecies(const vector &matches, + size_t begin, + size_t end, + int queryLength, + int queryLength2); + + TaxID lowerRankClassification(vector &matches, pair &matchRange, TaxID speciesID); + + void getSpeciesCladeCounts(const unordered_map & taxCnt, + unordered_map & cladeCnt, + TaxID spciesID); + + TaxID BFS(const unordered_map & cladeCnt, TaxID root); + + // Getters + unordered_map & getTaxCounts() { return taxCounts; } +}; + + +#endif //METABULI_TAXONOMER_H diff --git a/src/commons/common.h b/src/commons/common.h index 615b3af5..9b499da2 100644 --- a/src/commons/common.h +++ b/src/commons/common.h @@ -6,6 +6,7 @@ #define likely(x) __builtin_expect((x),1) #define unlikely(x) __builtin_expect((x),0) +#define kmerLength 8 struct SequenceBlock{ SequenceBlock(size_t start, size_t end, size_t length, size_t seqLength = 0) @@ -44,6 +45,31 @@ struct Query{ queryLength2(0), kmerCnt(0), isClassified(false), newSpecies(false) {} }; +template +struct Buffer { + T *buffer; + size_t startIndexOfReserve; + size_t bufferSize; + + explicit Buffer(size_t sizeOfBuffer=100) { + buffer = (T *) malloc(sizeof(T) * sizeOfBuffer); + bufferSize = sizeOfBuffer; + startIndexOfReserve = 0; + }; + + size_t reserveMemory(size_t numOfKmer) { + size_t offsetToWrite = __sync_fetch_and_add(&startIndexOfReserve, numOfKmer); + return offsetToWrite; + }; + + void reallocateMemory(size_t sizeOfBuffer) { + if (sizeOfBuffer > bufferSize) { + buffer = (T *) realloc(buffer, sizeof(T) * sizeOfBuffer); + bufferSize = sizeOfBuffer; + } + }; +}; + inline bool fileExist(const std::string& name) { if (FILE *file = 
fopen(name.c_str(), "r")) { fclose(file); diff --git a/src/metabuli.cpp b/src/metabuli.cpp index f41040d1..8918c64e 100644 --- a/src/metabuli.cpp +++ b/src/metabuli.cpp @@ -12,7 +12,7 @@ const char* tool_name = "metabuli"; const char* tool_introduction = "Metabuli is a taxonomical classifier that jointly analyzes amino acid and DNA sequences."; const char* main_author = "Jaebeom Kim "; const char* show_extended_help = "1"; -const char* show_bash_info = NULL; +const char* show_bash_info = nullptr; bool hide_base_commands = true; extern const char* MMSEQS_CURRENT_INDEX_VERSION; const char* index_version_compatible = MMSEQS_CURRENT_INDEX_VERSION; @@ -26,7 +26,7 @@ LocalParameters& localPar = LocalParameters::getLocalInstance(); std::vector commands = { {"databases", databases, &localPar.databases, COMMAND_DATABASE_CREATION, "List and download databases", - NULL, + nullptr, "Milot Mirdita ", " ", CITATION_SPACEPHARER, {{"selection", 0, DbType::ZERO_OR_ALL, &DbValidator::empty }, @@ -34,7 +34,7 @@ std::vector commands = { {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}}, {"build", build, &localPar.build, COMMAND_DATABASE_CREATION, "Build database based on the list of FASTA files.", - NULL, + nullptr, "Jaebeom Kim ", " ", CITATION_SPACEPHARER, @@ -43,14 +43,14 @@ std::vector commands = { {"Mapping file (accession to tax ID)", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}, {"database-report", databaseReport, &localPar.databaseReport, COMMAND_DATABASE_CREATION, "It generates a report of taxa in a database.", - NULL, + nullptr, "Jaebeom Kim ", " ", CITATION_SPACEPHARER, {{"Directory where the DB will be generated", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}}}, - {"updateDB", build, &localPar.build, COMMAND_DATABASE_CREATION, + {"updateDB", build, &localPar.build, COMMAND_DB, "Update database based on the list of FASTA files.", - NULL, + nullptr, "Jaebeom Kim ", " ", CITATION_SPACEPHARER, @@ -59,18 +59,25 @@ std::vector commands = { {"Mapping file (accession to tax ID)", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}, {"classify", classify, &localPar.classify, COMMAND_TAXONOMY, "Assigning taxonomy label to query reads", - NULL, + nullptr, "Jaebeom Kim ", - " ", + " ", CITATION_SPACEPHARER, {{"FASTA", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfile}, {"DB dir", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}, {"out dir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}, {"job ID", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}, - + {"filter", classify, &localPar.classify, COMMAND_TAXONOMY, + "Filtering reads based on the classification result", + nullptr, + "Jaebeom Kim ", + " ", + CITATION_SPACEPHARER, + {{"READ FILE", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfile}, + {"FILTER DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}}}, {"grade", grade, &localPar.grade, COMMAND_EXPERT, "Grade the classification result (only for benchmarking)", - NULL, + nullptr, "Jaebeom Kim ", " ", CITATION_SPACEPHARER, @@ -79,7 +86,7 @@ std::vector commands = { {"taxonomy dir", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::directory}}}, {"seqHeader2TaxId", seqHeader2TaxId, &localPar.seqHeader2TaxId, COMMAND_EXPERT, "It extracts k-mers from query sequences, and compares them to the target database", 
- NULL, + nullptr, "Jaebeom Kim ", " ", CITATION_SPACEPHARER, @@ -88,7 +95,7 @@ std::vector commands = { {"add-to-library", addToLibrary, &localPar.addToLibrary, COMMAND_DATABASE_CREATION, "It bins sequences into distinct files according to their species referring their accession number.\n " "It requires a mapping file (accession to tax ID) and NCBI style tax dump files in a taxonomy directory.", - NULL, + nullptr, "Jaebeom Kim ", " ", CITATION_SPACEPHARER, @@ -97,7 +104,7 @@ std::vector commands = { {"DB directory", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}}}, {"apply-threshold", applyThreshold, &localPar.applyThreshold, COMMAND_MAIN, "Assigning taxonomy label to query reads", - NULL, + nullptr, "Jaebeom Kim ", " ", CITATION_SPACEPHARER, @@ -107,7 +114,7 @@ std::vector commands = { {"TAXONOMY DIR", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}}}, {"binning2report", binning2report, &localPar.binning2report, COMMAND_FORMAT_CONVERSION, "It generates Kraken style report file from binning results", - NULL, + nullptr, "Jaebeom Kim ", " ", CITATION_SPACEPHARER, @@ -117,7 +124,7 @@ std::vector commands = { {"TAXONOMY DIR", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}}}, {"filter-by-genus", filterByGenus, &localPar.filterByGenus, COMMAND_EXPERT, "It filters out reads classified as a specific genus", - NULL, + nullptr, "Jaebeom Kim ", " ", CITATION_SPACEPHARER, diff --git a/src/workflow/CMakeLists.txt b/src/workflow/CMakeLists.txt index 6f4819ad..66d1fb0c 100644 --- a/src/workflow/CMakeLists.txt +++ b/src/workflow/CMakeLists.txt @@ -3,4 +3,5 @@ set(workflow_source_files workflow/updateDB.cpp workflow/add_to_library.cpp workflow/build.cpp + workflow/filter.cpp PARENT_SCOPE) \ No newline at end of file diff --git a/src/workflow/classify.cpp b/src/workflow/classify.cpp index e88a4aa4..21a45f17 100644 --- a/src/workflow/classify.cpp +++ b/src/workflow/classify.cpp @@ -1,13 +1,10 @@ #include "Classifier.h" -#include "ReducedClassifier.h" #include "Parameters.h" #include "LocalParameters.h" -#include "NcbiTaxonomy.h" #include "FileUtil.h" void setClassifyDefaults(LocalParameters & par){ par.seqMode = 2; - par.memoryMode = 1; par.reducedAA = 0; par.minScore = 0; par.minCoverage = 0; @@ -49,13 +46,7 @@ int classify(int argc, const char **argv, const Command& command) #endif cout << "Number of threads: " << par.threads << endl; - Classifier * classifier; - if(par.reducedAA == 1){ - classifier = new ReducedClassifier(par); - } else { - classifier = new Classifier(par); - } - + Classifier * classifier = new Classifier(par); classifier->startClassify(par); delete classifier; return 0; diff --git a/src/workflow/filter.cpp b/src/workflow/filter.cpp new file mode 100644 index 00000000..c8e3894a --- /dev/null +++ b/src/workflow/filter.cpp @@ -0,0 +1,57 @@ +#include "Classifier.h" +#include "Parameters.h" +#include "LocalParameters.h" +#include "FileUtil.h" +#include "QueryFilter.h" + +void setFilterDefaults(LocalParameters & par){ + par.seqMode = 2; + par.reducedAA = 0; + par.minScore = 0.7; + par.minCoverage = 0; + par.minSpScore = 0; + par.spaceMask = "11111111"; + par.hammingMargin = 0; + par.verbosity = 3; + par.ramUsage = 128; + par.minCoveredPos = 4; + par.printLog = 0; + par.maxGap = 0; + par.taxonomyPath = "DBDIR/taxonomy/" ; + par.minConsCnt = 4; + par.minConsCntEuk = 9; + par.eukaryotaTaxId = 2759; + par.maskMode = 0; + par.maskProb = 0.9; + par.matchPerKmer = 4; +} + +int filter(int argc, const char **argv, const Command& 
command) +{ + LocalParameters & par = LocalParameters::getLocalInstance(); + setFilterDefaults(par); + par.parseParameters(argc, argv, command, true, Parameters::PARSE_ALLOW_EMPTY, 0); + + if (par.seqMode == 2) { + if (!FileUtil::directoryExists(par.filenames[3].c_str())) { + FileUtil::makeDir(par.filenames[3].c_str()); + } + } else { + if (!FileUtil::directoryExists(par.filenames[2].c_str())) { + FileUtil::makeDir(par.filenames[2].c_str()); + } + } + +#ifdef OPENMP + omp_set_num_threads(par.threads); +#endif + + cout << "Number of threads: " << par.threads << endl; + + QueryFilter * queryFilter = new QueryFilter(par); + + queryFilter->startClassify(par); + + delete classifier; + return 0; +} \ No newline at end of file From 51f12bf980870829cd4b31bcfe9ad6cf2e4e0956 Mon Sep 17 00:00:00 2001 From: JaebeomKim0731 Date: Fri, 18 Aug 2023 11:33:21 +0900 Subject: [PATCH 07/65] fix compile errors --- src/commons/KmerBuffer.h | 2 +- src/commons/KmerExtractor.h | 2 +- src/commons/KmerMatcher.h | 3 ++- src/commons/LocalUtil.cpp | 2 +- src/commons/LocalUtil.h | 4 ++-- src/commons/Match.h | 4 ++-- src/commons/QueryFilter.cpp | 6 ------ 7 files changed, 9 insertions(+), 14 deletions(-) diff --git a/src/commons/KmerBuffer.h b/src/commons/KmerBuffer.h index fd43cd9a..d5fe0d66 100644 --- a/src/commons/KmerBuffer.h +++ b/src/commons/KmerBuffer.h @@ -68,7 +68,7 @@ class TargetKmerBuffer{ static size_t getTargetKmerBufferSize(){ size_t memLimit = Util::getTotalSystemMemory() * 0.5; size_t bufferSize = memLimit / sizeof(TargetKmer); - cout< 10000000000){ bufferSize = 10000000000; } diff --git a/src/commons/KmerExtractor.h b/src/commons/KmerExtractor.h index 260bc78c..fe862bdb 100644 --- a/src/commons/KmerExtractor.h +++ b/src/commons/KmerExtractor.h @@ -2,7 +2,7 @@ #define METABULI_KMEREXTRACTER_H #include "SeqIterator.h" #include "QueryIndexer.h" -#include "KseqWrapper.h" +#include "KSeqWrapper.h" class KmerExtractor { private: diff --git a/src/commons/KmerMatcher.h b/src/commons/KmerMatcher.h index 56379b7a..183f916f 100644 --- a/src/commons/KmerMatcher.h +++ b/src/commons/KmerMatcher.h @@ -9,6 +9,7 @@ #include "Mmap.h" #include "BitManipulateMacros.h" #include "NcbiTaxonomy.h" +#include "unordered_map" #define BufferSize 16'777'216 //16 * 1024 * 1024 // 16 M @@ -19,7 +20,7 @@ // Output // 1. 
Matched K-mers - +using namespace std; class KmerMatcher { protected: diff --git a/src/commons/LocalUtil.cpp b/src/commons/LocalUtil.cpp index 039c635a..8dd6d268 100644 --- a/src/commons/LocalUtil.cpp +++ b/src/commons/LocalUtil.cpp @@ -1,7 +1,7 @@ #include "LocalUtil.h" -std::string LocalUtil::getQueryBaseName(const std::string queryPath) { +std::string LocalUtil::getQueryBaseName(const std::string & queryPath) { std::vector splits = Util::split(queryPath, "."); std::string baseName; int extentionNum = 1; diff --git a/src/commons/LocalUtil.h b/src/commons/LocalUtil.h index 1d34a45c..ec2567cf 100644 --- a/src/commons/LocalUtil.h +++ b/src/commons/LocalUtil.h @@ -10,12 +10,12 @@ class LocalUtil : public Util { public: LocalUtil() = default; - static std::string getQueryBaseName(const std::string queryPath); + static std::string getQueryBaseName(const std::string & queryPath); template static T getQueryKmerNumber(T queryLength, int spaceNum); - static void splitQueryFile(vector & seqSegments, const string & queryPath); + static void splitQueryFile(std::vector & seqSegments, const string & queryPath); }; diff --git a/src/commons/Match.h b/src/commons/Match.h index 5d8cf503..da8dcdb4 100644 --- a/src/commons/Match.h +++ b/src/commons/Match.h @@ -33,8 +33,8 @@ struct Match { // 24 byte bool redundancy; // 1 void printMatch() const { - cout << qInfo.sequenceID << " " << qInfo.pos << " " << qInfo.frame << " " - << targetId << " " << genusId << " " << speciesId << " " << rightEndHamming << " " << (int)hamming << endl; + std::cout << qInfo.sequenceID << " " << qInfo.pos << " " << qInfo.frame << " " + << targetId << " " << genusId << " " << speciesId << " " << rightEndHamming << " " << (int)hamming << std::endl; } }; diff --git a/src/commons/QueryFilter.cpp b/src/commons/QueryFilter.cpp index 636f65d2..863baea3 100644 --- a/src/commons/QueryFilter.cpp +++ b/src/commons/QueryFilter.cpp @@ -1,11 +1,6 @@ #include "QueryFilter.h" QueryFilter::QueryFilter(LocalParameters & par) { - if (par.reducedAA == 1) { - classifier = new ReducedClassifier(par); - } else { - classifier = new Classifier(par); - } queryIndexer = new QueryIndexer(par); setInputAndOutputFiles(par); @@ -13,7 +8,6 @@ QueryFilter::QueryFilter(LocalParameters & par) { QueryFilter::~QueryFilter() { delete queryIndexer; - delete classifier; } void QueryFilter::setInputAndOutputFiles(const LocalParameters & par) { From 7fbfe711b81ab1336a009ba8b8b03187eb7eff26 Mon Sep 17 00:00:00 2001 From: JaebeomKim0731 Date: Fri, 18 Aug 2023 11:36:52 +0900 Subject: [PATCH 08/65] fix compile errors --- src/commons/KmerMatcher.cpp | 4 ++-- src/commons/LocalUtil.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp index 7117e3ce..3d15b6d2 100644 --- a/src/commons/KmerMatcher.cpp +++ b/src/commons/KmerMatcher.cpp @@ -149,7 +149,7 @@ int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer, Buffer * m while (completedSplitCnt < threads) { bool hasOverflow = false; #pragma omp parallel default(none), shared(completedSplitCnt, splitCheckList, hasOverflow, \ -querySplits, queryKmerList, matchBuffer, cout, par, targetDiffIdxFileName, numOfDiffIdx, targetInfoFileName, targetSplitIdxs) +querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffIdx, targetInfoFileName, targetSplitIdxs) { // FILE FILE * diffIdxFp = fopen(targetDiffIdxFileName.c_str(), "rb"); @@ -409,7 +409,7 @@ querySplits, queryKmerList, matchBuffer, cout, par, targetDiffIdxFileName, numOf queryKmerNum = 
0; #ifdef OPENMP - omp_set_num_threads(par.threads); + omp_set_num_threads(threads); #endif // Sort matches diff --git a/src/commons/LocalUtil.h b/src/commons/LocalUtil.h index ec2567cf..0f7ff0d2 100644 --- a/src/commons/LocalUtil.h +++ b/src/commons/LocalUtil.h @@ -15,7 +15,7 @@ class LocalUtil : public Util { template static T getQueryKmerNumber(T queryLength, int spaceNum); - static void splitQueryFile(std::vector & seqSegments, const string & queryPath); + static void splitQueryFile(std::vector & seqSegments, const std::string & queryPath); }; From 3f994f237427c896af55f9c138963569fcf497a3 Mon Sep 17 00:00:00 2001 From: JaebeomKim0731 Date: Fri, 18 Aug 2023 11:37:46 +0900 Subject: [PATCH 09/65] fix compile errors --- src/workflow/filter.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/workflow/filter.cpp b/src/workflow/filter.cpp index c8e3894a..fd02afcf 100644 --- a/src/workflow/filter.cpp +++ b/src/workflow/filter.cpp @@ -48,10 +48,6 @@ int filter(int argc, const char **argv, const Command& command) cout << "Number of threads: " << par.threads << endl; - QueryFilter * queryFilter = new QueryFilter(par); - queryFilter->startClassify(par); - - delete classifier; return 0; } \ No newline at end of file From 5e8b8806495c19b78846028137f8f3b32de97a43 Mon Sep 17 00:00:00 2001 From: JaebeomKim0731 Date: Fri, 18 Aug 2023 11:41:09 +0900 Subject: [PATCH 10/65] fix compile errors --- src/commons/Classifier.cpp | 2 -- src/commons/Classifier.h | 28 ---------------------------- 2 files changed, 30 deletions(-) diff --git a/src/commons/Classifier.cpp b/src/commons/Classifier.cpp index eebbc63f..09a53dfa 100644 --- a/src/commons/Classifier.cpp +++ b/src/commons/Classifier.cpp @@ -1,6 +1,4 @@ #include "Classifier.h" -#include "LocalParameters.h" -#include "taxonomyreport.cpp" Classifier::Classifier(LocalParameters & par) { // Load parameters diff --git a/src/commons/Classifier.h b/src/commons/Classifier.h index 5cf61ce9..81a0b027 100644 --- a/src/commons/Classifier.h +++ b/src/commons/Classifier.h @@ -49,9 +49,6 @@ class Classifier { Reporter * reporter; NcbiTaxonomy * taxonomy; - - - public: void startClassify(const LocalParameters &par); @@ -59,32 +56,7 @@ class Classifier { virtual ~Classifier(); - }; - - -//inline uint64_t -//Classifier::getNextTargetKmer(uint64_t lookingTarget, const uint16_t *targetDiffIdxList, size_t &diffIdxPos) { -// uint16_t fragment; -// uint16_t check = (0x1u << 15u); -// uint64_t diffIn64bit = 0; -// fragment = targetDiffIdxList[diffIdxPos]; -// diffIdxPos++; -// while (!(fragment & check)) { // 27 % -// diffIn64bit |= fragment; -// diffIn64bit <<= 15u; -// fragment = targetDiffIdxList[diffIdxPos]; -// diffIdxPos++; -// } -// fragment &= ~check; // not; 8.47 % -// diffIn64bit |= fragment; // or : 23.6% -// -// return diffIn64bit + lookingTarget; -//} - - - - #endif //ADKMER4_SEARCHER_H From ba571e9d803c8522f53ae592fcef5d1d81878fa0 Mon Sep 17 00:00:00 2001 From: JaebeomKim0731 Date: Fri, 18 Aug 2023 11:43:42 +0900 Subject: [PATCH 11/65] fix compile errors --- src/commons/LocalUtil.cpp | 4 ---- src/commons/LocalUtil.h | 6 ++++++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/commons/LocalUtil.cpp b/src/commons/LocalUtil.cpp index 8dd6d268..6498f75b 100644 --- a/src/commons/LocalUtil.cpp +++ b/src/commons/LocalUtil.cpp @@ -18,10 +18,6 @@ std::string LocalUtil::getQueryBaseName(const std::string & queryPath) { return baseName; } -template -T LocalUtil::getQueryKmerNumber(T queryLength, int spaceNum) { - return (getMaxCoveredLength(queryLength) / 
3 - kmerLength - spaceNum + 1) * 6; -} void LocalUtil::splitQueryFile(std::vector & sequences, const std::string &queryPath) { diff --git a/src/commons/LocalUtil.h b/src/commons/LocalUtil.h index 0f7ff0d2..550d34d2 100644 --- a/src/commons/LocalUtil.h +++ b/src/commons/LocalUtil.h @@ -20,4 +20,10 @@ class LocalUtil : public Util { }; +template +T LocalUtil::getQueryKmerNumber(T queryLength, int spaceNum) { + return (getMaxCoveredLength(queryLength) / 3 - kmerLength - spaceNum + 1) * 6; +} + + #endif //METABULI_LOCALUTIL_H From ec780c5eeef9ad8c557e4ec16a30b55539c5a481 Mon Sep 17 00:00:00 2001 From: JaebeomKim0731 Date: Fri, 18 Aug 2023 11:45:43 +0900 Subject: [PATCH 12/65] fix compile errors --- src/commons/KmerExtractor.cpp | 13 ++----------- src/commons/KmerExtractor.h | 2 +- src/commons/LocalUtil.cpp | 10 +++++++++- src/commons/LocalUtil.h | 1 + 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/commons/KmerExtractor.cpp b/src/commons/KmerExtractor.cpp index f8860ec9..a72521da 100644 --- a/src/commons/KmerExtractor.cpp +++ b/src/commons/KmerExtractor.cpp @@ -142,8 +142,8 @@ void KmerExtractor::fillQueryKmerBufferParallel_paired(KSeqWrapper *kseq1, int kmerCnt2 = LocalUtil::getQueryKmerNumber((int) e2.sequence.l, spaceNum); // Query Info - queryList[processedQueryNum].queryLength = getMaxCoveredLength((int) e1.sequence.l); - queryList[processedQueryNum].queryLength2 = getMaxCoveredLength((int) e2.sequence.l); + queryList[processedQueryNum].queryLength = LocalUtil::getMaxCoveredLength((int) e1.sequence.l); + queryList[processedQueryNum].queryLength2 = LocalUtil::getMaxCoveredLength((int) e2.sequence.l); queryList[processedQueryNum].name = string(e1.name.s); queryList[processedQueryNum].kmerCnt = (int) (kmerCnt + kmerCnt2); @@ -205,12 +205,3 @@ void KmerExtractor::fillQueryKmerBufferParallel_paired(KSeqWrapper *kseq1, } } -int KmerExtractor::getMaxCoveredLength(int queryLength) { - if (queryLength % 3 == 2) { - return queryLength - 2; // 2 - } else if (queryLength % 3 == 1) { - return queryLength - 4; // 4 - } else { - return queryLength - 3; // 3 - } -} \ No newline at end of file diff --git a/src/commons/KmerExtractor.h b/src/commons/KmerExtractor.h index fe862bdb..2e7f2977 100644 --- a/src/commons/KmerExtractor.h +++ b/src/commons/KmerExtractor.h @@ -29,7 +29,7 @@ class KmerExtractor { const QuerySplit & currentSplit, const LocalParameters &par); - static int getMaxCoveredLength(int queryLength) ; + public: explicit KmerExtractor(const LocalParameters & par); diff --git a/src/commons/LocalUtil.cpp b/src/commons/LocalUtil.cpp index 6498f75b..08e04508 100644 --- a/src/commons/LocalUtil.cpp +++ b/src/commons/LocalUtil.cpp @@ -33,4 +33,12 @@ void LocalUtil::splitQueryFile(std::vector & sequences, const std delete kseq; } - +int LocalUtil::getMaxCoveredLength(int queryLength) { + if (queryLength % 3 == 2) { + return queryLength - 2; // 2 + } else if (queryLength % 3 == 1) { + return queryLength - 4; // 4 + } else { + return queryLength - 3; // 3 + } +} \ No newline at end of file diff --git a/src/commons/LocalUtil.h b/src/commons/LocalUtil.h index 550d34d2..0fcbdf82 100644 --- a/src/commons/LocalUtil.h +++ b/src/commons/LocalUtil.h @@ -17,6 +17,7 @@ class LocalUtil : public Util { static void splitQueryFile(std::vector & seqSegments, const std::string & queryPath); + static int getMaxCoveredLength(int queryLength) ; }; From bf2633ef21740e524e83c5c321ae4d8d64e9fa43 Mon Sep 17 00:00:00 2001 From: JaebeomKim0731 Date: Fri, 18 Aug 2023 11:46:20 +0900 Subject: [PATCH 13/65] fix 
compile errors --- src/commons/KmerExtractor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/commons/KmerExtractor.cpp b/src/commons/KmerExtractor.cpp index a72521da..2488051f 100644 --- a/src/commons/KmerExtractor.cpp +++ b/src/commons/KmerExtractor.cpp @@ -65,7 +65,7 @@ void KmerExtractor::fillQueryKmerBufferParallel(KSeqWrapper *kseq1, int kmerCnt = LocalUtil::getQueryKmerNumber((int) e1.sequence.l, spaceNum); // Query Info - queryList[processedQueryNum].queryLength = getMaxCoveredLength((int) e1.sequence.l); + queryList[processedQueryNum].queryLength = LocalUtil::getMaxCoveredLength((int) e1.sequence.l); queryList[processedQueryNum].name = string(e1.name.s); queryList[processedQueryNum].kmerCnt = (int) (kmerCnt); From 5eac1d1036af8f4053007340baabb10fb30642d5 Mon Sep 17 00:00:00 2001 From: JaebeomKim0731 Date: Fri, 18 Aug 2023 12:23:12 +0900 Subject: [PATCH 14/65] Fix error --- src/commons/QueryIndexer.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/commons/QueryIndexer.cpp b/src/commons/QueryIndexer.cpp index f5a30ff9..ad0f7f86 100644 --- a/src/commons/QueryIndexer.cpp +++ b/src/commons/QueryIndexer.cpp @@ -17,6 +17,7 @@ QueryIndexer::QueryIndexer(const LocalParameters & par) { readNum_1 = 0; readNum_2 = 0; spaceNum = par.spaceMask.length() - kmerLength; + totalReadLength = 0; setAvailableRam(); } From 7a74d115649e6d9ef5743fd611ab698ab3883bf5 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Fri, 18 Aug 2023 16:53:22 +0900 Subject: [PATCH 15/65] Fix run time errors --- src/commons/Classifier.cpp | 5 ++ src/commons/KmerExtractor.cpp | 2 + src/commons/KmerMatcher.cpp | 54 ++++++++++++++------ src/commons/KmerMatcher.h | 35 ++++++------- src/commons/LocalParameters.h | 2 +- src/commons/QueryIndexer.cpp | 93 +++++++++++++++++++++-------------- src/commons/Taxonomer.cpp | 2 +- 7 files changed, 124 insertions(+), 69 deletions(-) diff --git a/src/commons/Classifier.cpp b/src/commons/Classifier.cpp index 09a53dfa..3f157fd3 100644 --- a/src/commons/Classifier.cpp +++ b/src/commons/Classifier.cpp @@ -43,6 +43,11 @@ void Classifier::startClassify(const LocalParameters &par) { cout << "Total number of sequences: " << numOfSeq << endl; cout << "Total read length: " << totalReadLength << "nt" << endl; + // Print queryReadSplit + for (size_t i = 0; i < queryReadSplit.size(); i++) { + cout << queryReadSplit[i].start << " " << queryReadSplit[i].end << " " << queryReadSplit[i].kmerCnt << endl; + } + QueryKmerBuffer kmerBuffer; Buffer matchBuffer; vector queryList; diff --git a/src/commons/KmerExtractor.cpp b/src/commons/KmerExtractor.cpp index 2488051f..91f5ee8f 100644 --- a/src/commons/KmerExtractor.cpp +++ b/src/commons/KmerExtractor.cpp @@ -4,6 +4,8 @@ KmerExtractor::KmerExtractor(const LocalParameters &par) { spaceNum = par.spaceMask.length() - 8; maskMode = par.maskMode; maskProb = par.maskProb; + subMat = new NucleotideMatrix(par.scoringMatrixFile.values.nucleotide().c_str(), 1.0, 0.0); + probMatrix = new ProbabilityMatrix(*(subMat)); } KmerExtractor::~KmerExtractor() { diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp index 3d15b6d2..8486538a 100644 --- a/src/commons/KmerMatcher.cpp +++ b/src/commons/KmerMatcher.cpp @@ -22,7 +22,6 @@ KmerMatcher::KmerMatcher(const LocalParameters & par, return; } char taxID[100]; - while(feof(taxIdFile) == 0) { fscanf(taxIdFile,"%s",taxID); TaxID taxId = atol(taxID); @@ -88,8 +87,7 @@ int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer, Buffer * m // Each split has start and end points of query 
list + proper offset point of target k-mer list std::vector querySplits; uint64_t queryAA; - std::vector targetSplitIdxs; - + if (threads == 1) { //Single thread querySplits.emplace_back(0, queryKmerNum - 1, queryKmerNum, diffIdxSplits.data[0]); } else if (threads == 2) { //Two threads @@ -121,7 +119,6 @@ int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer, Buffer * m querySplits.emplace_back(splitWidth * i, queryKmerNum - 1, queryKmerNum - splitWidth * i, diffIdxSplits.data[j]); } - targetSplitIdxs.emplace_back(j); needLastTargetBlock = false; break; } @@ -130,11 +127,9 @@ int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer, Buffer * m if (i != threads - 1) { // If it is not the last split querySplits.emplace_back(splitWidth * i, splitWidth * (i + 1) - 1, splitWidth, diffIdxSplits.data[numOfDiffIdxSplits_use - 2]); - targetSplitIdxs.emplace_back(numOfDiffIdxSplits_use - 2); } else { querySplits.emplace_back(splitWidth * i, queryKmerNum - 1, queryKmerNum - splitWidth * i, diffIdxSplits.data[numOfDiffIdxSplits_use - 2]); - targetSplitIdxs.emplace_back(numOfDiffIdxSplits_use - 2); } } } @@ -149,7 +144,7 @@ int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer, Buffer * m while (completedSplitCnt < threads) { bool hasOverflow = false; #pragma omp parallel default(none), shared(completedSplitCnt, splitCheckList, hasOverflow, \ -querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffIdx, targetInfoFileName, targetSplitIdxs) +querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffIdx, targetInfoFileName) { // FILE FILE * diffIdxFp = fopen(targetDiffIdxFileName.c_str(), "rb"); @@ -157,7 +152,7 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI // Target K-mer buffer uint16_t * diffIdxBuffer = (uint16_t *) malloc(sizeof(uint16_t) * (BufferSize + 1)); // size = 32 Mb - TargetKmerInfo * kmerInfoBuffer = (TargetKmerInfo *) malloc(sizeof(TargetKmerInfo) * (BufferSize+1)); // 64 Mb + TargetKmerInfo * kmerInfoBuffer = (TargetKmerInfo *) malloc(sizeof(TargetKmerInfo) * (BufferSize + 1)); // 64 Mb size_t kmerInfoBufferIdx = 0; size_t diffIdxBufferIdx = 0; @@ -177,9 +172,6 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI auto *matches = new Match[localBufferSize]; // 16 * 2'000'000 = 32 Mb int matchCnt = 0; - // For debug -// SeqIterator seqIterator(par); - //vectors for selected target k-mers std::vector selectedHammingSum; std::vector selectedMatches; @@ -236,6 +228,11 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI } for (int k = 0; k < currMatchNum; k++) { idx = selectedMatches[k]; + // Check if candidateKmerInfos[idx].sequenceID is valid + if (taxId2genusId.find(candidateKmerInfos[idx].sequenceID) == taxId2genusId.end() || + taxId2speciesId.find(candidateKmerInfos[idx].sequenceID) == taxId2speciesId.end()) { + cout << "Error: " << candidateKmerInfos[idx].sequenceID << " is not found in the taxonomy database." 
<< endl; + } matches[matchCnt] = {queryKmerList[j].info, candidateKmerInfos[idx].sequenceID, taxId2genusId[candidateKmerInfos[idx].sequenceID], @@ -273,6 +270,11 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI } for (int k = 0; k < currMatchNum; k++) { idx = selectedMatches[k]; + // Check if candidateKmerInfos[idx].sequenceID is valid + if (taxId2genusId.find(candidateKmerInfos[idx].sequenceID) == taxId2genusId.end() || + taxId2speciesId.find(candidateKmerInfos[idx].sequenceID) == taxId2speciesId.end()) { + cout << "Error: " << candidateKmerInfos[idx].sequenceID << " is not found in the taxonomy database." << endl; + } matches[matchCnt] = {queryKmerList[j].info, candidateKmerInfos[idx].sequenceID, taxId2genusId[candidateKmerInfos[idx].sequenceID], @@ -365,6 +367,11 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI for (int k = 0; k < currMatchNum; k++) { idx = selectedMatches[k]; + // Check if candidateKmerInfos[idx].sequenceID is valid + if (taxId2genusId.find(candidateKmerInfos[idx].sequenceID) == taxId2genusId.end() || + taxId2speciesId.find(candidateKmerInfos[idx].sequenceID) == taxId2speciesId.end()) { + cout << "Error: " << candidateKmerInfos[idx].sequenceID << " is not found in the taxonomy database." << endl; + } matches[matchCnt] = {queryKmerList[j].info, candidateKmerInfos[idx].sequenceID, taxId2genusId[candidateKmerInfos[idx].sequenceID], @@ -404,7 +411,6 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI } } // end of while(completeSplitCnt < threadNum) std::cout << "Time spent for the comparison: " << double(time(nullptr) - beforeSearch) << std::endl; - munmap(diffIdxSplits.data, diffIdxSplits.fileSize + 1); free(splitCheckList); queryKmerNum = 0; @@ -416,8 +422,9 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI time_t beforeSortMatches = time(nullptr); totalMatchCnt += matchBuffer->startIndexOfReserve; std::cout << "Sorting matches ..." 
<< std::endl; - SORT_PARALLEL(matchBuffer->buffer, matchBuffer->buffer + matchBuffer->startIndexOfReserve, - sortMatch()); + SORT_PARALLEL(matchBuffer->buffer, + matchBuffer->buffer + matchBuffer->startIndexOfReserve, + compareMatches); std::cout << "Time spent for sorting matches: " << double(time(nullptr) - beforeSortMatches) << std::endl; return 1; @@ -463,4 +470,23 @@ void KmerMatcher::compareDna(uint64_t query, } } delete[] hammingSums; +} + +bool KmerMatcher::compareMatches(const Match& a, const Match& b) { + if (a.qInfo.sequenceID != b.qInfo.sequenceID) + return a.qInfo.sequenceID < b.qInfo.sequenceID; + + if (a.genusId != b.genusId) + return a.genusId < b.genusId; + + if (a.speciesId != b.speciesId) + return a.speciesId < b.speciesId; + + if (a.qInfo.frame != b.qInfo.frame) + return a.qInfo.frame < b.qInfo.frame; + + if (a.qInfo.pos != b.qInfo.pos) + return a.qInfo.pos < b.qInfo.pos; + + return a.hamming < b.hamming; } \ No newline at end of file diff --git a/src/commons/KmerMatcher.h b/src/commons/KmerMatcher.h index 183f916f..c4b98643 100644 --- a/src/commons/KmerMatcher.h +++ b/src/commons/KmerMatcher.h @@ -48,7 +48,6 @@ class KmerMatcher { struct QueryKmerSplit { QueryKmerSplit(size_t start, size_t end, size_t length, const DiffIdxSplit& diffIdxSplit) : start(start), end(end), length(length), diffIdxSplit(diffIdxSplit) {} - size_t start; // start idx in query k-mer list size_t end; // end idx in query k-mer list size_t length; @@ -98,6 +97,8 @@ class KmerMatcher { virtual uint16_t getHammings_reverse(uint64_t kmer1, uint64_t kmer2); + static bool compareMatches(const Match& a, const Match& b); + public: KmerMatcher(const LocalParameters & par, NcbiTaxonomy * taxonomy); @@ -133,7 +134,7 @@ inline TargetKmerInfo KmerMatcher::getKmerInfo(size_t bufferSize, FILE * kmerInfoFp, TargetKmerInfo * infoBuffer, - size_t & infoBufferIdx){ + size_t & infoBufferIdx){ if (unlikely(infoBufferIdx >= bufferSize)) { loadBuffer(kmerInfoFp, infoBuffer, infoBufferIdx, bufferSize, (int) (infoBufferIdx - bufferSize)); } @@ -173,25 +174,25 @@ inline uint16_t KmerMatcher::getHammings_reverse(uint64_t kmer1, uint64_t kmer2) return hammings; } -struct sortMatch { - bool operator() (const Match& a, const Match& b) const { - if (a.qInfo.sequenceID != b.qInfo.sequenceID) - return a.qInfo.sequenceID < b.qInfo.sequenceID; +// struct sortMatch { +// bool operator() (const Match& a, const Match& b) const { +// if (a.qInfo.sequenceID != b.qInfo.sequenceID) +// return a.qInfo.sequenceID < b.qInfo.sequenceID; - if (a.genusId != b.genusId) - return a.genusId < b.genusId; +// if (a.genusId != b.genusId) +// return a.genusId < b.genusId; - if (a.speciesId != b.speciesId) - return a.speciesId < b.speciesId; +// if (a.speciesId != b.speciesId) +// return a.speciesId < b.speciesId; - if (a.qInfo.frame != b.qInfo.frame) - return a.qInfo.frame < b.qInfo.frame; +// if (a.qInfo.frame != b.qInfo.frame) +// return a.qInfo.frame < b.qInfo.frame; - if (a.qInfo.pos != b.qInfo.pos) - return a.qInfo.pos < b.qInfo.pos; +// if (a.qInfo.pos != b.qInfo.pos) +// return a.qInfo.pos < b.qInfo.pos; - return a.hamming < b.hamming; - } -}; +// return a.hamming < b.hamming; +// } +// }; #endif //METABULI_KMERMATCHER_H diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index 4169ab25..ebc4ab2f 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -1,7 +1,7 @@ #ifndef ADCLASSIFIER2_LOCALPARAMETERS_H #define ADCLASSIFIER2_LOCALPARAMETERS_H -#include +#include "Parameters.h" const int 
CITATION_SPACEPHARER = CITATION_END; diff --git a/src/commons/QueryIndexer.cpp b/src/commons/QueryIndexer.cpp index ad0f7f86..07a7251f 100644 --- a/src/commons/QueryIndexer.cpp +++ b/src/commons/QueryIndexer.cpp @@ -29,56 +29,77 @@ void QueryIndexer::setAvailableRam() { void QueryIndexer::indexQueryFile() { // Read 1 - KSeqWrapper* kseq; - kseq = KSeqFactory(queryPath_1.c_str()); - size_t kmerCnt = 0; - size_t seqCnt = 0; - size_t start = 0; - while (kseq->ReadEntry()) { - readNum_1++; - const KSeqWrapper::KSeqEntry &e = kseq->entry; - totalReadLength += e.sequence.l; - size_t currentKmerCnt = LocalUtil::getQueryKmerNumber(e.sequence.l, spaceNum); - kmerCnt += currentKmerCnt; - seqCnt++; - if (bytesPerKmer * kmerCnt + ((size_t) 200 * seqCnt) > availableRam) { - querySplits.emplace_back(start, readNum_1, kmerCnt - currentKmerCnt); - kmerCnt = currentKmerCnt; - start = readNum_1; - seqCnt = 1; - } - } - querySplits.emplace_back(start, readNum_1, kmerCnt); - delete kseq; - - // Read 2 - if (seqMode == 2) { - kseq = KSeqFactory(queryPath_2.c_str()); - kmerCnt = 0; - seqCnt = 0; - start = 0; + if (seqMode == 1 || seqMode == 3) { + KSeqWrapper* kseq; + kseq = KSeqFactory(queryPath_1.c_str()); + size_t kmerCnt = 0; + size_t seqCnt = 0; + size_t start = 0; while (kseq->ReadEntry()) { - readNum_2++; + readNum_1++; const KSeqWrapper::KSeqEntry &e = kseq->entry; totalReadLength += e.sequence.l; size_t currentKmerCnt = LocalUtil::getQueryKmerNumber(e.sequence.l, spaceNum); kmerCnt += currentKmerCnt; seqCnt++; if (bytesPerKmer * kmerCnt + ((size_t) 200 * seqCnt) > availableRam) { - querySplits.emplace_back(start, readNum_2, kmerCnt - currentKmerCnt); + querySplits.emplace_back(start, readNum_1, kmerCnt - currentKmerCnt); kmerCnt = currentKmerCnt; - start = readNum_2; + start = readNum_1; seqCnt = 1; } } - querySplits.emplace_back(start, readNum_2, kmerCnt); + querySplits.emplace_back(start, readNum_1, kmerCnt); delete kseq; + } else { + KSeqWrapper* kseq_1 = KSeqFactory(queryPath_1.c_str()); + KSeqWrapper* kseq_2 = KSeqFactory(queryPath_2.c_str()); + size_t kmerCnt = 0; + size_t seqCnt_1 = 0; + size_t seqCnt_2 = 0; + size_t start = 0; + size_t currentKmerCnt; + bool end = false; + while(true) { + if (kseq_1->ReadEntry()) { + readNum_1++; + seqCnt_1++; + totalReadLength += kseq_1->entry.sequence.l; + currentKmerCnt = LocalUtil::getQueryKmerNumber(kseq_1->entry.sequence.l, spaceNum); + kmerCnt += currentKmerCnt; + } else { + end = true; + } + + if (kseq_2->ReadEntry()) { + readNum_2++; + seqCnt_2++; + totalReadLength += kseq_2->entry.sequence.l; + currentKmerCnt += LocalUtil::getQueryKmerNumber(kseq_2->entry.sequence.l, spaceNum); + kmerCnt += currentKmerCnt; + } else { + end = true; + } + + if (seqCnt_1 != seqCnt_2) { + Debug(Debug::ERROR) << "The number of reads in the two files are not equal." << "\n"; + EXIT(EXIT_FAILURE); + } - // Check if the number of reads in the two files are equal - if (readNum_1 != readNum_2) { - Debug(Debug::ERROR) << "The number of reads in the two files are not equal." 
<< "\n"; - EXIT(EXIT_FAILURE); + if (bytesPerKmer * kmerCnt + ((size_t) 200 * seqCnt_1) > availableRam) { + querySplits.emplace_back(start, seqCnt_1, kmerCnt - currentKmerCnt); + kmerCnt = currentKmerCnt; + start = seqCnt_1; + seqCnt_1 = 1; + } + + if (end) { + querySplits.emplace_back(start, seqCnt_1, kmerCnt); + break; + } } + delete kseq_1; + delete kseq_2; } } diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index c41ddea9..85ef5339 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -7,7 +7,7 @@ Taxonomer::Taxonomer(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxon for(size_t i = 0, j = 0; i < par.spaceMask.length(); i++){ mask[i] = par.spaceMask[i] - 48; spaceNum += (mask[i] == 0); - if(par.spaceMask[i]==1){ + if(mask[i] == 1){ unmaskedPos[j] = (int) i; j++; } From 7ee4d6700a95f4fbf40f05764b1d5156b87a1a45 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Mon, 21 Aug 2023 16:38:05 +0900 Subject: [PATCH 16/65] first implementation of a filtering module --- src/commons/Classifier.cpp | 5 -- src/commons/KmerMatcher.cpp | 2 +- src/commons/LocalParameters.cpp | 29 +++++++ src/commons/LocalParameters.h | 7 ++ src/commons/QueryFilter.cpp | 135 +++++++++++++++++++++++++++++++- src/commons/QueryFilter.h | 26 +++++- src/commons/QueryIndexer.cpp | 10 +-- src/commons/Reporter.cpp | 5 +- src/commons/Reporter.h | 10 +-- src/metabuli.cpp | 2 +- src/workflow/filter.cpp | 23 ++---- 11 files changed, 216 insertions(+), 38 deletions(-) diff --git a/src/commons/Classifier.cpp b/src/commons/Classifier.cpp index 3f157fd3..09a53dfa 100644 --- a/src/commons/Classifier.cpp +++ b/src/commons/Classifier.cpp @@ -43,11 +43,6 @@ void Classifier::startClassify(const LocalParameters &par) { cout << "Total number of sequences: " << numOfSeq << endl; cout << "Total read length: " << totalReadLength << "nt" << endl; - // Print queryReadSplit - for (size_t i = 0; i < queryReadSplit.size(); i++) { - cout << queryReadSplit[i].start << " " << queryReadSplit[i].end << " " << queryReadSplit[i].kmerCnt << endl; - } - QueryKmerBuffer kmerBuffer; Buffer matchBuffer; vector queryList; diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp index 8486538a..204187d4 100644 --- a/src/commons/KmerMatcher.cpp +++ b/src/commons/KmerMatcher.cpp @@ -160,7 +160,7 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI uint64_t currentQuery = UINT64_MAX; uint64_t currentQueryAA = UINT64_MAX; QueryKmerInfo currentQueryInfo; - + //target variables size_t diffIdxPos = 0; std::vector candidateTargetKmers; //vector for candidate target k-mer, some of which are selected after based on hamming distance diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index 47b2b689..b6d21b7b 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -223,6 +223,13 @@ LocalParameters::LocalParameters() : typeid(std::string), (void *) &printColumns, "^.*$") + PRINT_MODE(PRINT_MODE_ID, + "--print-mode", + "[1] Only filtered reads [2] Both filtered and removed reads", + "[1] Only filtered reads [2] Both filtered and removed reads", + typeid(int), + (void *) &printMode, + "[1-2]") { //add_to_library @@ -258,6 +265,28 @@ LocalParameters::LocalParameters() : classify.push_back(&PARAM_MASK_PROBABILTY); classify.push_back(&MATCH_PER_KMER); + // filter + filter.push_back(&PARAM_THREADS); + filter.push_back(&SEQ_MODE); + filter.push_back(&VIRUS_TAX_ID); + filter.push_back(&REDUCED_AA); + filter.push_back(&MIN_SCORE); + 
filter.push_back(&MIN_COVERAGE); + filter.push_back(&SPACED); + filter.push_back(&HAMMING_MARGIN); + filter.push_back(&MIN_SP_SCORE); + filter.push_back(&PARAM_V); + filter.push_back(&RAM_USAGE); + filter.push_back(&MIN_COVERED_POS); + filter.push_back(&PRINT_LOG); + filter.push_back(&MAX_GAP); + filter.push_back(&TAXONOMY_PATH); + filter.push_back(&MIN_CONS_CNT); + filter.push_back(&MIN_CONS_CNT_EUK); + filter.push_back(&PARAM_MASK_RESIDUES); + filter.push_back(&PARAM_MASK_PROBABILTY); + filter.push_back(&MATCH_PER_KMER); + filter.push_back(&PRINT_MODE); //updateTargetDB exclusiontest_hiv.push_back(&TEST_RANK); diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index ebc4ab2f..1a173deb 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -21,6 +21,7 @@ class LocalParameters : public Parameters { } std::vector classify; + std::vector filter; std::vector exclusiontest_hiv; std::vector seqHeader2TaxId; std::vector grade; @@ -71,6 +72,9 @@ class LocalParameters : public Parameters { PARAMETER(COVERAGE_COL) PARAMETER(PRINT_COLUMNS) + // Filter + PARAMETER(PRINT_MODE) + // Superkingdom taxonomy id int virusTaxId; int bacteriaTaxId; @@ -113,6 +117,9 @@ class LocalParameters : public Parameters { // Add to library bool assembly; + // Filter + int printMode; + private: LocalParameters(); diff --git a/src/commons/QueryFilter.cpp b/src/commons/QueryFilter.cpp index 863baea3..57f97fd0 100644 --- a/src/commons/QueryFilter.cpp +++ b/src/commons/QueryFilter.cpp @@ -1,13 +1,58 @@ #include "QueryFilter.h" QueryFilter::QueryFilter(LocalParameters & par) { + // Load parameters + dbDir = par.filenames[1 + (par.seqMode == 2)]; + matchPerKmer = par.matchPerKmer; + printMode = par.printMode; + + // Taxonomy + if (par.taxonomyPath == "DBDIR/taxonomy/") par.taxonomyPath = dbDir + "/taxonomy/"; + taxonomy = new NcbiTaxonomy(par.taxonomyPath + "/names.dmp", + par.taxonomyPath + "/nodes.dmp", + par.taxonomyPath + "/merged.dmp"); + + // Agents queryIndexer = new QueryIndexer(par); + kmerExtractor = new KmerExtractor(par); + if (par.reducedAA) { kmerMatcher = new ReducedKmerMatcher(par, taxonomy);} + else { kmerMatcher = new KmerMatcher(par, taxonomy);} + taxonomer = new Taxonomer(par, taxonomy); + reporter = new Reporter(par, taxonomy); setInputAndOutputFiles(par); + filter_kseq1 = KSeqFactory(in1.c_str()); + if (par.seqMode == 2) { filter_kseq2 = KSeqFactory(in2.c_str()); } + + isFiltered = new bool[queryIndexer->getReadNum_1()]; + memset(isFiltered, 0, sizeof(bool) * queryIndexer->getReadNum_1()); + readCounter = 0; + + // Open output files + f1_fp = fopen(f1.c_str(), "w"); + if (par.seqMode == 2) { f2_fp = fopen(f2.c_str(), "w"); } + if (printMode == 2) { + rm1_fp = fopen(rm1.c_str(), "w"); + if (par.seqMode == 2) { rm2_fp = fopen(rm2.c_str(), "w"); } + } } QueryFilter::~QueryFilter() { + delete taxonomy; delete queryIndexer; + delete kmerExtractor; + delete kmerMatcher; + delete taxonomer; + delete reporter; + delete filter_kseq1; + delete filter_kseq2; + delete[] isFiltered; + fclose(f1_fp); + if (par.seqMode == 2) { fclose(f2_fp); } + if (printMode == 2) { + fclose(rm1_fp); + if (par.seqMode == 2) { fclose(rm2_fp); } + } } void QueryFilter::setInputAndOutputFiles(const LocalParameters & par) { @@ -16,13 +61,37 @@ void QueryFilter::setInputAndOutputFiles(const LocalParameters & par) { string baseName = LocalUtil::getQueryBaseName(in1); // Set the output file names - out1 = baseName + "_filtered.fna.gz"; - reportFileName = baseName + "_filter_report.tsv"; + f1 
= baseName + "_filtered.fna.gz"; + rm1 = baseName + "_removed.fna.gz"; // For paired-end reads if (par.seqMode == 2) { in2 = par.filenames[1]; - out2 = LocalUtil::getQueryBaseName(in2) + "_filtered.fna.gz"; + f2 = LocalUtil::getQueryBaseName(in2) + "_filtered.fna.gz"; + rm2 = LocalUtil::getQueryBaseName(in2) + "_removed.fna.gz"; + } +} + +void QueryFilter::recordFilteredReads(const vectore & queryList) { + for (query:queryList){ + isFiltered[readCounter++] = query.isClassified; + } +} + +void QueryFilter::printFilteredReads() { + for (size_t i = 0; i < readCounter; i ++) { + // Read query reads + filter_kseq1->ReadEntry(); + if (par.seqMode == 2) { filter_kseq2->ReadEntry(); } + + // Print reads + if (isFiltered[i]) { // Print filtered reads + fprintf(f1_fp, ">%s\n%s\n", filter_kseq1->entry.name.s, filter_kseq1->entry.sequence.s); + if (par.seqMode == 2) { fprintf(f2_fp, ">%s\n%s\n", filter_kseq2->entry.name.s, filter_kseq2->entry.sequence.s); } + } else if (printMode == 2) { // Print removed reads + fprintf(rm1_fp, ">%s\n%s\n", filter_kseq1->entry.name.s, filter_kseq1->entry.sequence.s); + if (par.seqMode == 2) { fprintf(rm2_fp, ">%s\n%s\n", filter_kseq2->entry.name.s, filter_kseq2->entry.sequence.s); } + } } } @@ -41,6 +110,66 @@ void QueryFilter::filterReads(LocalParameters & par) { Buffer matchBuffer; vector queryList; + size_t numOfTatalQueryKmerCnt = 0; + size_t totalMatchCnt = 0; + size_t processedSeqCnt = 0; + reporter->openReadClassificationFile(); + +#ifdef OPENMP + omp_set_num_threads(par.threads); +#endif + + KSeqWrapper* kseq1 = KSeqFactory(in1.c_str()); + KSeqWrapper* kseq2 = nullptr; + if (par.seqMode == 2) { kseq2 = KSeqFactory(in2.c_str()); } + + for (size_t splitIdx = 0; splitIdx < queryReadSplit.size(); splitIdx++) { + // Allocate memory for query list + queryList.clear(); + queryList.resize(queryReadSplit[splitIdx].end - queryReadSplit[splitIdx].start); + + // Allocate memory for query k-mer list and match list + kmerBuffer.reallocateMemory(queryReadSplit[splitIdx].kmerCnt); + if (queryReadSplit.size() == 1) { + size_t remain = queryIndexer->getAvailableRam() - queryReadSplit[splitIdx].kmerCnt * sizeof(QueryKmer) - numOfSeq * 200; + matchBuffer.reallocateMemory(remain / sizeof(Match)); + } else { + matchBuffer.reallocateMemory(queryReadSplit[splitIdx].kmerCnt * matchPerKmer); + } + // Initialize query k-mer buffer and match buffer + kmerBuffer.startIndexOfReserve = 0; + matchBuffer.startIndexOfReserve = 0; + + // Extract query k-mer + kmerExtractor->extractQueryKmers(kmerBuffer, + queryList, + queryReadSplit[splitIdx], + par, + kseq1, + kseq2); + numOfTatalQueryKmerCnt += kmerBuffer.startIndexOfReserve; + + // Search matches between query and target k-mers + kmerMatcher->matchKmers(&kmerBuffer, &matchBuffer); + + // Classify queries based on the matches + taxonomer->assignTaxonomy(matchBuffer.buffer, matchBuffer.startIndexOfReserve, queryList, par); + processedSeqCnt += queryReadSplit[splitIdx].end - queryReadSplit[splitIdx].start; + cout << "The number of processed sequences: " << processedSeqCnt << " (" << (double) processedSeqCnt / (double) numOfSeq << ")" << endl; + + // Write classification results + reporter->writeReadClassification(queryList, true); + + recordFilteredReads(queryList); + } + printFilteredReads(); + reporter->writeReportFile(numOfSeq, taxonomer->getTaxCounts()); + reporter->closeReadClassificationFile(); + + // Memory deallocation + free(matchBuffer.buffer); + delete kseq1; + delete kseq2; } diff --git a/src/commons/QueryFilter.h 
b/src/commons/QueryFilter.h index 33fa7de1..962a3c02 100644 --- a/src/commons/QueryFilter.h +++ b/src/commons/QueryFilter.h @@ -4,15 +4,39 @@ #include "LocalUtil.h" #include "QueryIndexer.h" #include "ReducedKmerMatcher.h" +#include "KmerExtractor.h" +#include "Taxonomer.h" +#include "Reporter.h" + class QueryFilter { private: + // Parameters + std::string dbDir; + size_t matchPerKmer; + int printMode; + + // Agents QueryIndexer * queryIndexer; + KmerExtractor * kmerExtractor; KmerMatcher * kmerMatcher; + Taxonomer * taxonomer; + Reporter * reporter; - std::string in1, in2, out1, out2, reportFileName; // input and output file names + // Kseq + KSeqWrapper* filter_kseq1; + KSeqWrapper* filter_kseq2; + + std::string in1, in2, f1, f2, rm1, rm2; // input and output file names + bool * isFiltered; + size_t readCounter; + FILE * f1_fp, * f2_fp, * rm1_fp, * rm2_fp; void setInputAndOutputFiles(const LocalParameters & par); + void recordFilteredReads(const vector & queryList); + + void printFilteredReads(); + public: void filterReads(LocalParameters & par); explicit QueryFilter(LocalParameters & par); diff --git a/src/commons/QueryIndexer.cpp b/src/commons/QueryIndexer.cpp index 07a7251f..0651b479 100644 --- a/src/commons/QueryIndexer.cpp +++ b/src/commons/QueryIndexer.cpp @@ -55,7 +55,7 @@ void QueryIndexer::indexQueryFile() { KSeqWrapper* kseq_1 = KSeqFactory(queryPath_1.c_str()); KSeqWrapper* kseq_2 = KSeqFactory(queryPath_2.c_str()); size_t kmerCnt = 0; - size_t seqCnt_1 = 0; + size_t seqCnt_1 = 0; size_t seqCnt_2 = 0; size_t start = 0; size_t currentKmerCnt; @@ -81,20 +81,20 @@ void QueryIndexer::indexQueryFile() { end = true; } - if (seqCnt_1 != seqCnt_2) { + if (readNum_1 != readNum_2) { Debug(Debug::ERROR) << "The number of reads in the two files are not equal." 
<< "\n"; EXIT(EXIT_FAILURE); } if (bytesPerKmer * kmerCnt + ((size_t) 200 * seqCnt_1) > availableRam) { - querySplits.emplace_back(start, seqCnt_1, kmerCnt - currentKmerCnt); + querySplits.emplace_back(start, readNum_1, kmerCnt - currentKmerCnt); kmerCnt = currentKmerCnt; - start = seqCnt_1; + start = readNum_1; seqCnt_1 = 1; } if (end) { - querySplits.emplace_back(start, seqCnt_1, kmerCnt); + querySplits.emplace_back(start, readNum_1, kmerCnt); break; } } diff --git a/src/commons/Reporter.cpp b/src/commons/Reporter.cpp index 288aecdb..566e8c66 100644 --- a/src/commons/Reporter.cpp +++ b/src/commons/Reporter.cpp @@ -15,8 +15,11 @@ void Reporter::openReadClassificationFile() { readClassificationFile.open(outDir + "/" + jobId + "_classifications.tsv"); } -void Reporter::writeReadClassification(const vector & queryList) { +void Reporter::writeReadClassification(const vector & queryList, bool classifiedOnly) { for (size_t i = 0; i < queryList.size(); i++) { + if (classifiedOnly && !queryList[i].isClassified) { + continue; + } readClassificationFile << queryList[i].isClassified << "\t" << queryList[i].name << "\t" << queryList[i].classification << "\t" << queryList[i].queryLength + queryList[i].queryLength2 << "\t" diff --git a/src/commons/Reporter.h b/src/commons/Reporter.h index 4de0f32c..d64e567c 100644 --- a/src/commons/Reporter.h +++ b/src/commons/Reporter.h @@ -23,16 +23,16 @@ class Reporter { public: Reporter(const LocalParameters &par, NcbiTaxonomy *taxonomy); // Write report + void writeReportFile(int numOfQuery, unordered_map &taxCnt); + void writeReport(FILE *FP, const std::unordered_map &cladeCounts, + unsigned long totalReads, TaxID taxID = 0, int depth = 0); // Read by read classification results void openReadClassificationFile(); - void writeReadClassification(const vector & queryList); + void writeReadClassification(const vector & queryList, bool classifiedOnly = false); void closeReadClassificationFile(); - void writeReportFile(int numOfQuery, unordered_map &taxCnt); - - void writeReport(FILE *FP, const std::unordered_map &cladeCounts, - unsigned long totalReads, TaxID taxID = 0, int depth = 0); + unsigned int cladeCountVal(const std::unordered_map &map, TaxID key); diff --git a/src/metabuli.cpp b/src/metabuli.cpp index 8918c64e..60f09cf9 100644 --- a/src/metabuli.cpp +++ b/src/metabuli.cpp @@ -67,7 +67,7 @@ std::vector commands = { {"DB dir", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}, {"out dir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}, {"job ID", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}, - {"filter", classify, &localPar.classify, COMMAND_TAXONOMY, + {"filter", classify, &localPar.filter, COMMAND_TAXONOMY, "Filtering reads based on the classification result", nullptr, "Jaebeom Kim ", diff --git a/src/workflow/filter.cpp b/src/workflow/filter.cpp index fd02afcf..40bfc797 100644 --- a/src/workflow/filter.cpp +++ b/src/workflow/filter.cpp @@ -1,7 +1,4 @@ -#include "Classifier.h" -#include "Parameters.h" #include "LocalParameters.h" -#include "FileUtil.h" #include "QueryFilter.h" void setFilterDefaults(LocalParameters & par){ @@ -24,6 +21,7 @@ void setFilterDefaults(LocalParameters & par){ par.maskMode = 0; par.maskProb = 0.9; par.matchPerKmer = 4; + par.printMode = 1; } int filter(int argc, const char **argv, const Command& command) @@ -32,22 +30,15 @@ int filter(int argc, const char **argv, const Command& command) setFilterDefaults(par); par.parseParameters(argc, argv, command, true, 
Parameters::PARSE_ALLOW_EMPTY, 0);
 
-    if (par.seqMode == 2) {
-        if (!FileUtil::directoryExists(par.filenames[3].c_str())) {
-            FileUtil::makeDir(par.filenames[3].c_str());
-        }
-    } else {
-        if (!FileUtil::directoryExists(par.filenames[2].c_str())) {
-            FileUtil::makeDir(par.filenames[2].c_str());
-        }
-    }
-
 #ifdef OPENMP
     omp_set_num_threads(par.threads);
 #endif
 
-    cout << "Number of threads: " << par.threads << endl;
-
-
+    QueryFilter * queryFilter = new QueryFilter(par);
+    
+    queryFilter->filterReads(par);
+    
+    delete queryFilter;
+    
     return 0;
 }
\ No newline at end of file

From 1988259083dd7602b2bda6123c6dd5b4ba4e8659 Mon Sep 17 00:00:00 2001
From: Jaebeom Kim 
Date: Tue, 22 Aug 2023 14:52:44 +0900
Subject: [PATCH 17/65] filter against multiple DBs

---
 src/commons/Classifier.cpp      |   1 +
 src/commons/IndexCreator.h      |   5 -
 src/commons/KmerMatcher.cpp     |  47 ++++--
 src/commons/KmerMatcher.h       | 283 ++++++++++++++++----------------
 src/commons/LocalParameters.cpp |  12 +-
 src/commons/LocalParameters.h   |   2 +
 src/commons/Match.h             |  10 +-
 src/commons/QueryFilter.cpp     |  27 +--
 src/commons/QueryFilter.h       |   3 +
 src/metabuli.cpp                |  31 ++--
 src/workflow/add_to_library.cpp |   1 +
 src/workflow/filter.cpp         |   1 +
 12 files changed, 223 insertions(+), 200 deletions(-)

diff --git a/src/commons/Classifier.cpp b/src/commons/Classifier.cpp
index 09a53dfa..5b95e8f6 100644
--- a/src/commons/Classifier.cpp
+++ b/src/commons/Classifier.cpp
@@ -106,6 +106,7 @@ void Classifier::startClassify(const LocalParameters &par) {
 
         // Search matches between query and target k-mers
         kmerMatcher->matchKmers(&kmerBuffer, &matchBuffer);
+        kmerMatcher->sortMatches(&matchBuffer);
 
 
 //#ifdef OPENMP
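A note on this patch: the Classifier hunk above shows the new two-step flow, in which matchKmers() only collects matches and sorting happens in a separate sortMatches() call. Combined with the per-call db argument introduced in the KmerMatcher changes below, a caller can pool matches from several contaminant databases into one buffer and sort the pooled result once, which is how QueryFilter uses the API later in this patch. A hedged sketch of that call pattern (the helper name is hypothetical, and it assumes the match buffer is a Buffer<Match> as elsewhere in this codebase):

// Usage sketch, not project code: mirrors QueryFilter::filterReads() below.
#include <string>
#include <vector>

void matchAgainstContaminants(KmerMatcher * kmerMatcher,
                              QueryKmerBuffer * kmerBuffer,
                              Buffer<Match> * matchBuffer,
                              const std::vector<std::string> & contams) {
    // Pool matches from every contaminant database into one buffer.
    for (const auto & db : contams) {
        kmerMatcher->matchKmers(kmerBuffer, matchBuffer, db);
    }
    // Sort the pooled matches once, after all databases were searched.
    kmerMatcher->sortMatches(matchBuffer);
}

diff --git a/src/commons/IndexCreator.h b/src/commons/IndexCreator.h
index 087cc3b3..8635cdae 100644
--- a/src/commons/IndexCreator.h
+++ b/src/commons/IndexCreator.h
@@ -14,10 +14,7 @@
 #include "common.h"
 #include "NcbiTaxonomy.h"
 #include "FastSort.h"
-#include "Classifier.h"
 #include "LocalParameters.h"
-
-// For masking
 #include "NucleotideMatrix.h"
 #include "SubstitutionMatrix.h"
 #include "tantan.h"
@@ -28,8 +25,6 @@
 #endif
 
 
-
-
 struct TaxId2Fasta{
     TaxID species;
     TaxID taxid;
diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp
index 204187d4..3f9a1495 100644
--- a/src/commons/KmerMatcher.cpp
+++ b/src/commons/KmerMatcher.cpp
@@ -2,17 +2,13 @@
 
 KmerMatcher::KmerMatcher(const LocalParameters & par,
                          NcbiTaxonomy * taxonomy) {
+    // Parameters
     threads = par.threads;
-    std::string dbDir = par.filenames[1 + (par.seqMode == 2)];
-    targetDiffIdxFileName = dbDir + "/diffIdx";
-    targetInfoFileName = dbDir + "/info";
-    diffIdxSplitFileName = dbDir + "/split";
-
-    diffIdxSplits = mmapData(diffIdxSplitFileName.c_str(), 3);
-
+    dbDir = par.filenames[1 + (par.seqMode == 2)];
+    hammingMargin = par.hammingMargin;
+    
     MARKER = 16777215;
     MARKER = ~ MARKER;
-    hammingMargin = par.hammingMargin;
     totalMatchCnt = 0;
 
     // Load the taxonomy ID list
@@ -53,16 +49,33 @@ KmerMatcher::KmerMatcher(const LocalParameters & par,
     fclose(taxIdFile);
 }
 
+
 KmerMatcher::~KmerMatcher() {
-    munmap(diffIdxSplits.data, diffIdxSplits.fileSize + 1);
 }
 
-int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer, Buffer * matchBuffer) {
-    size_t queryKmerNum = queryKmerBuffer->startIndexOfReserve;
-    QueryKmer *queryKmerList = queryKmerBuffer->buffer;
+int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer,
+                            Buffer * matchBuffer,
+                            const string & db){
+    // Set database files
+    string targetDiffIdxFileName;
+    string targetInfoFileName;
+    string diffIdxSplitFileName;
+    if (db.empty()) {
+        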
targetDiffIdxFileName = dbDir + "/diffIdx"; + targetInfoFileName = dbDir + "/info"; + diffIdxSplitFileName = dbDir + "/split"; + } else { + targetDiffIdxFileName = dbDir + "/" + db + "/diffIdx"; + targetInfoFileName = dbDir + "/" + db + "/info"; + diffIdxSplitFileName = dbDir + "/" + db + "/split"; + } + MmapedData diffIdxSplits = mmapData(diffIdxSplitFileName.c_str(), 3); size_t numOfDiffIdx = FileUtil::getFileSize(targetDiffIdxFileName) / sizeof(uint16_t); + size_t queryKmerNum = queryKmerBuffer->startIndexOfReserve; + QueryKmer *queryKmerList = queryKmerBuffer->buffer; + std::cout << "Comparing query and reference metamers..." << std::endl; // Find the first index of garbage query k-mer (UINT64_MAX) and discard from there @@ -418,16 +431,17 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI omp_set_num_threads(threads); #endif - // Sort matches - time_t beforeSortMatches = time(nullptr); totalMatchCnt += matchBuffer->startIndexOfReserve; + return 1; +} + +void KmerMatcher::sortMatches(Buffer * matchBuffer) { + time_t beforeSortMatches = time(nullptr); std::cout << "Sorting matches ..." << std::endl; SORT_PARALLEL(matchBuffer->buffer, matchBuffer->buffer + matchBuffer->startIndexOfReserve, compareMatches); std::cout << "Time spent for sorting matches: " << double(time(nullptr) - beforeSortMatches) << std::endl; - - return 1; } void KmerMatcher::moveMatches(Match *dest, Match *src, int &matchNum) { @@ -472,6 +486,7 @@ void KmerMatcher::compareDna(uint64_t query, delete[] hammingSums; } + bool KmerMatcher::compareMatches(const Match& a, const Match& b) { if (a.qInfo.sequenceID != b.qInfo.sequenceID) return a.qInfo.sequenceID < b.qInfo.sequenceID; diff --git a/src/commons/KmerMatcher.h b/src/commons/KmerMatcher.h index c4b98643..05b61dc7 100644 --- a/src/commons/KmerMatcher.h +++ b/src/commons/KmerMatcher.h @@ -1,17 +1,17 @@ #ifndef METABULI_KMERMATCHER_H #define METABULI_KMERMATCHER_H +#include "BitManipulateMacros.h" +#include "FileUtil.h" #include "KmerBuffer.h" -#include "Match.h" -#include "common.h" #include "LocalParameters.h" -#include -#include "FileUtil.h" +#include "Match.h" #include "Mmap.h" -#include "BitManipulateMacros.h" #include "NcbiTaxonomy.h" +#include "common.h" #include "unordered_map" +#include -#define BufferSize 16'777'216 //16 * 1024 * 1024 // 16 M +#define BufferSize 16'777'216 // 16 * 1024 * 1024 // 16 M // Input // 1. Query K-mers @@ -24,154 +24,155 @@ using namespace std; class KmerMatcher { protected: - NcbiTaxonomy * taxonomy; - size_t threads; - std::string targetDiffIdxFileName, targetInfoFileName, diffIdxSplitFileName; - MmapedData diffIdxSplits; - uint64_t MARKER; - int bitsForCodon = 3; - uint8_t hammingMargin; - size_t totalMatchCnt; - uint8_t hammingLookup[8][8] = { - {0, 1, 1, 1, 2, 1, 3, 3}, - {1, 0, 1, 1, 2, 2, 3, 2}, - {1, 1, 0, 1, 2, 2, 2, 3}, - {1, 1, 1, 0, 1, 2, 3, 3}, - {2, 2, 2, 1, 0, 1, 4, 4}, - {1, 2, 2, 2, 1, 0, 4, 4}, - {3, 3, 2, 3, 4, 4, 0, 1}, - {3, 2, 3, 3, 4, 4, 1, 0}}; - unordered_map taxId2speciesId; - unordered_map taxId2genusId; - - - struct QueryKmerSplit { - QueryKmerSplit(size_t start, size_t end, size_t length, const DiffIdxSplit& diffIdxSplit) - : start(start), end(end), length(length), diffIdxSplit(diffIdxSplit) {} - size_t start; // start idx in query k-mer list - size_t end; // end idx in query k-mer list - size_t length; - DiffIdxSplit diffIdxSplit; // index in target k-mer list from where the search begins. 
- }; - - size_t AminoAcidPart(size_t kmer) const { return (kmer) & MARKER; } - - template - static void loadBuffer(FILE * fp, T * buffer, size_t & bufferIdx, size_t size){ - fread(buffer, sizeof(T), size, fp); - bufferIdx = 0; - } - - template - static void loadBuffer(FILE * fp, T * buffer, size_t & bufferIdx, size_t size, int cnt){ - fseek(fp, cnt * sizeof(T), SEEK_CUR); - fread(buffer, sizeof(T), size, fp); - bufferIdx = 0; - } - - static uint64_t getNextTargetKmer(uint64_t lookingTarget, - const uint16_t * diffIdxBuffer, - size_t & diffBufferIdx, - size_t & totalPos); - - - static TargetKmerInfo getKmerInfo(size_t bufferSize, - FILE *kmerInfoFp, - TargetKmerInfo *infoBuffer, - size_t &infoBufferIdx); - - void moveMatches(Match *dest, - Match *src, - int &matchNum); - - void compareDna(uint64_t query, - std::vector &targetKmersToCompare, - std::vector &selectedMatches, - std::vector &selectedHammingSum, - std::vector &rightEndHammings, - uint8_t frame); - - virtual uint8_t getHammingDistanceSum(uint64_t kmer1, uint64_t kmer2); - - virtual uint16_t getHammings(uint64_t kmer1, uint64_t kmer2); - - virtual uint16_t getHammings_reverse(uint64_t kmer1, uint64_t kmer2); - - static bool compareMatches(const Match& a, const Match& b); + NcbiTaxonomy *taxonomy; + size_t threads; + std::string dbDir; + // string targetDiffIdxFileName, targetInfoFileName, diffIdxSplitFileName; + // MmapedData diffIdxSplits; + uint64_t MARKER; + int bitsForCodon = 3; + uint8_t hammingMargin; + size_t totalMatchCnt; + uint8_t hammingLookup[8][8] = { + {0, 1, 1, 1, 2, 1, 3, 3}, {1, 0, 1, 1, 2, 2, 3, 2}, + {1, 1, 0, 1, 2, 2, 2, 3}, {1, 1, 1, 0, 1, 2, 3, 3}, + {2, 2, 2, 1, 0, 1, 4, 4}, {1, 2, 2, 2, 1, 0, 4, 4}, + {3, 3, 2, 3, 4, 4, 0, 1}, {3, 2, 3, 3, 4, 4, 1, 0}}; + unordered_map taxId2speciesId; + unordered_map taxId2genusId; + + struct QueryKmerSplit { + QueryKmerSplit(size_t start, size_t end, size_t length, + const DiffIdxSplit &diffIdxSplit) + : start(start), end(end), length(length), diffIdxSplit(diffIdxSplit) {} + size_t start; // start idx in query k-mer list + size_t end; // end idx in query k-mer list + size_t length; + DiffIdxSplit diffIdxSplit; // index in target k-mer list from where the + // search begins. 
+ }; + + size_t AminoAcidPart(size_t kmer) const { return (kmer)&MARKER; } + + template + static void loadBuffer(FILE *fp, T *buffer, size_t &bufferIdx, size_t size) { + fread(buffer, sizeof(T), size, fp); + bufferIdx = 0; + } + + template + static void loadBuffer(FILE *fp, T *buffer, size_t &bufferIdx, size_t size, + int cnt) { + fseek(fp, cnt * sizeof(T), SEEK_CUR); + fread(buffer, sizeof(T), size, fp); + bufferIdx = 0; + } + + static uint64_t getNextTargetKmer(uint64_t lookingTarget, + const uint16_t *diffIdxBuffer, + size_t &diffBufferIdx, size_t &totalPos); + + static TargetKmerInfo getKmerInfo(size_t bufferSize, FILE *kmerInfoFp, + TargetKmerInfo *infoBuffer, + size_t &infoBufferIdx); + + void moveMatches(Match *dest, Match *src, int &matchNum); + + void compareDna(uint64_t query, std::vector &targetKmersToCompare, + std::vector &selectedMatches, + std::vector &selectedHammingSum, + std::vector &rightEndHammings, uint8_t frame); + + virtual uint8_t getHammingDistanceSum(uint64_t kmer1, uint64_t kmer2); + + virtual uint16_t getHammings(uint64_t kmer1, uint64_t kmer2); + + virtual uint16_t getHammings_reverse(uint64_t kmer1, uint64_t kmer2); + + static bool compareMatches(const Match &a, const Match &b); public: - KmerMatcher(const LocalParameters & par, - NcbiTaxonomy * taxonomy); - - virtual ~KmerMatcher(); - - int matchKmers(QueryKmerBuffer * queryKmerBuffer, Buffer * matchBuffer); - + KmerMatcher(const LocalParameters &par, NcbiTaxonomy *taxonomy); + + virtual ~KmerMatcher(); + + int matchKmers(QueryKmerBuffer *queryKmerBuffer, Buffer *matchBuffer, + const string &db = string()); + + void sortMatches(Buffer *matchBuffer); }; -inline -uint64_t KmerMatcher::getNextTargetKmer(uint64_t lookingTarget, - const uint16_t *diffIdxBuffer, - size_t &diffBufferIdx, - size_t &totalPos) { - uint16_t fragment; - uint16_t check = 32768; // 2^15 - uint64_t diffIn64bit = 0; +inline uint64_t KmerMatcher::getNextTargetKmer(uint64_t lookingTarget, + const uint16_t *diffIdxBuffer, + size_t &diffBufferIdx, + size_t &totalPos) { + uint16_t fragment; + uint16_t check = 32768; // 2^15 + uint64_t diffIn64bit = 0; + fragment = diffIdxBuffer[diffBufferIdx++]; + totalPos++; + while (!(fragment & check)) { // 27 % + diffIn64bit |= fragment; + diffIn64bit <<= 15u; fragment = diffIdxBuffer[diffBufferIdx++]; totalPos++; - while (!(fragment & check)) { // 27 % - diffIn64bit |= fragment; - diffIn64bit <<= 15u; - fragment = diffIdxBuffer[diffBufferIdx++]; - totalPos++; - } - fragment &= ~check; // not; 8.47 % - diffIn64bit |= fragment; // or : 23.6% - return diffIn64bit + lookingTarget; + } + fragment &= ~check; // not; 8.47 % + diffIn64bit |= fragment; // or : 23.6% + return diffIn64bit + lookingTarget; } -inline -TargetKmerInfo KmerMatcher::getKmerInfo(size_t bufferSize, - FILE * kmerInfoFp, - TargetKmerInfo * infoBuffer, - size_t & infoBufferIdx){ - if (unlikely(infoBufferIdx >= bufferSize)) { - loadBuffer(kmerInfoFp, infoBuffer, infoBufferIdx, bufferSize, (int) (infoBufferIdx - bufferSize)); - } - return infoBuffer[infoBufferIdx]; +inline TargetKmerInfo KmerMatcher::getKmerInfo(size_t bufferSize, + FILE *kmerInfoFp, + TargetKmerInfo *infoBuffer, + size_t &infoBufferIdx) { + if (unlikely(infoBufferIdx >= bufferSize)) { + loadBuffer(kmerInfoFp, infoBuffer, infoBufferIdx, bufferSize, + (int)(infoBufferIdx - bufferSize)); + } + return infoBuffer[infoBufferIdx]; } -inline uint8_t KmerMatcher::getHammingDistanceSum(uint64_t kmer1, uint64_t kmer2) {//12345678 - uint8_t hammingSum = 0; - hammingSum += 
hammingLookup[GET_3_BITS(kmer1)][GET_3_BITS(kmer2)]; - hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 3U)][GET_3_BITS(kmer2 >> 3U)]; - hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 6U)][GET_3_BITS(kmer2 >> 6U)]; - hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 9U)][GET_3_BITS(kmer2 >> 9U)]; - hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 12U)][GET_3_BITS(kmer2 >> 12U)]; - hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 15U)][GET_3_BITS(kmer2 >> 15U)]; - hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 18U)][GET_3_BITS(kmer2 >> 18U)]; - hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 21U)][GET_3_BITS(kmer2 >> 21U)]; - return hammingSum; +inline uint8_t KmerMatcher::getHammingDistanceSum(uint64_t kmer1, + uint64_t kmer2) { // 12345678 + uint8_t hammingSum = 0; + hammingSum += hammingLookup[GET_3_BITS(kmer1)][GET_3_BITS(kmer2)]; + hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 3U)][GET_3_BITS(kmer2 >> 3U)]; + hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 6U)][GET_3_BITS(kmer2 >> 6U)]; + hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 9U)][GET_3_BITS(kmer2 >> 9U)]; + hammingSum += + hammingLookup[GET_3_BITS(kmer1 >> 12U)][GET_3_BITS(kmer2 >> 12U)]; + hammingSum += + hammingLookup[GET_3_BITS(kmer1 >> 15U)][GET_3_BITS(kmer2 >> 15U)]; + hammingSum += + hammingLookup[GET_3_BITS(kmer1 >> 18U)][GET_3_BITS(kmer2 >> 18U)]; + hammingSum += + hammingLookup[GET_3_BITS(kmer1 >> 21U)][GET_3_BITS(kmer2 >> 21U)]; + return hammingSum; } -inline uint16_t KmerMatcher::getHammings(uint64_t kmer1, uint64_t kmer2) { //hammings 87654321 - uint16_t hammings = 0; - for (int i = 0; i < 8; i++) { - hammings |= hammingLookup[GET_3_BITS(kmer1)][GET_3_BITS(kmer2)] << 2U * i; - kmer1 >>= bitsForCodon; - kmer2 >>= bitsForCodon; - } - return hammings; +inline uint16_t KmerMatcher::getHammings(uint64_t kmer1, + uint64_t kmer2) { // hammings 87654321 + uint16_t hammings = 0; + for (int i = 0; i < 8; i++) { + hammings |= hammingLookup[GET_3_BITS(kmer1)][GET_3_BITS(kmer2)] << 2U * i; + kmer1 >>= bitsForCodon; + kmer2 >>= bitsForCodon; + } + return hammings; } -inline uint16_t KmerMatcher::getHammings_reverse(uint64_t kmer1, uint64_t kmer2) { //hammings 87654321 - uint16_t hammings = 0; - for (int i = 0; i < 8; i++) { - hammings |= hammingLookup[GET_3_BITS(kmer1)][GET_3_BITS(kmer2)] << 2U * (7-i); - kmer1 >>= bitsForCodon; - kmer2 >>= bitsForCodon; - } - return hammings; +inline uint16_t +KmerMatcher::getHammings_reverse(uint64_t kmer1, + uint64_t kmer2) { // hammings 87654321 + uint16_t hammings = 0; + for (int i = 0; i < 8; i++) { + hammings |= hammingLookup[GET_3_BITS(kmer1)][GET_3_BITS(kmer2)] + << 2U * (7 - i); + kmer1 >>= bitsForCodon; + kmer2 >>= bitsForCodon; + } + return hammings; } // struct sortMatch { @@ -195,4 +196,4 @@ inline uint16_t KmerMatcher::getHammings_reverse(uint64_t kmer1, uint64_t kmer2) // } // }; -#endif //METABULI_KMERMATCHER_H +#endif // METABULI_KMERMATCHER_H diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index b6d21b7b..c5985ccf 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -222,14 +222,21 @@ LocalParameters::LocalParameters() : "CSV of column numbers to be printed", typeid(std::string), (void *) &printColumns, - "^.*$") + "^.*$"), PRINT_MODE(PRINT_MODE_ID, "--print-mode", "[1] Only filtered reads [2] Both filtered and removed reads", "[1] Only filtered reads [2] Both filtered and removed reads", typeid(int), (void *) &printMode, - "[1-2]") + "[1-2]"), + CONTAM_LIST(CONTAM_LIST_ID, + "--contam-list", + "List of 
contaminants to be filtered", + "List of taxids to be filtered", + typeid(std::string), + (void *) &contamList, + "^.*$") { //add_to_library @@ -287,6 +294,7 @@ LocalParameters::LocalParameters() : filter.push_back(&PARAM_MASK_PROBABILTY); filter.push_back(&MATCH_PER_KMER); filter.push_back(&PRINT_MODE); + filter.push_back(&CONTAM_LIST); //updateTargetDB exclusiontest_hiv.push_back(&TEST_RANK); diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index 1a173deb..2a92115a 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -74,6 +74,7 @@ class LocalParameters : public Parameters { // Filter PARAMETER(PRINT_MODE) + PARAMETER(CONTAM_LIST) // Superkingdom taxonomy id int virusTaxId; @@ -119,6 +120,7 @@ class LocalParameters : public Parameters { // Filter int printMode; + std::string contamList; private: LocalParameters(); diff --git a/src/commons/Match.h b/src/commons/Match.h index da8dcdb4..436eb0bb 100644 --- a/src/commons/Match.h +++ b/src/commons/Match.h @@ -4,14 +4,6 @@ #include "Kmer.h" #include -//struct Match_qInfo { -// explicit Match_qInfo(uint32_t position = 0, uint32_t queryId = 0, uint8_t frame = 0) -// : position(position), queryId(queryId), frame(frame) {} -// uint64_t position : 32; -// uint64_t queryId : 29; -// uint64_t frame : 3; // 0-5 -//}; - struct Match { // 24 byte Match(){} Match(QueryKmerInfo qInfo, @@ -25,7 +17,7 @@ struct Match { // 24 byte rightEndHamming(eachHamming), hamming(hamming), redundancy(redundancy) { } QueryKmerInfo qInfo; // 8 - TaxID targetId; // 4 + TaxID targetId; // 4 taxonomy id infact TaxID genusId; // 4 TaxID speciesId; // 4 uint16_t rightEndHamming; // 2 diff --git a/src/commons/QueryFilter.cpp b/src/commons/QueryFilter.cpp index 57f97fd0..c3ed99b3 100644 --- a/src/commons/QueryFilter.cpp +++ b/src/commons/QueryFilter.cpp @@ -5,7 +5,9 @@ QueryFilter::QueryFilter(LocalParameters & par) { dbDir = par.filenames[1 + (par.seqMode == 2)]; matchPerKmer = par.matchPerKmer; printMode = par.printMode; - + seqMode = par.seqMode; + contams = Util::split(par.contamList, ","); + // Taxonomy if (par.taxonomyPath == "DBDIR/taxonomy/") par.taxonomyPath = dbDir + "/taxonomy/"; taxonomy = new NcbiTaxonomy(par.taxonomyPath + "/names.dmp", @@ -48,10 +50,10 @@ QueryFilter::~QueryFilter() { delete filter_kseq2; delete[] isFiltered; fclose(f1_fp); - if (par.seqMode == 2) { fclose(f2_fp); } + if (seqMode == 2) { fclose(f2_fp); } if (printMode == 2) { fclose(rm1_fp); - if (par.seqMode == 2) { fclose(rm2_fp); } + if (seqMode == 2) { fclose(rm2_fp); } } } @@ -65,15 +67,15 @@ void QueryFilter::setInputAndOutputFiles(const LocalParameters & par) { rm1 = baseName + "_removed.fna.gz"; // For paired-end reads - if (par.seqMode == 2) { + if (seqMode == 2) { in2 = par.filenames[1]; f2 = LocalUtil::getQueryBaseName(in2) + "_filtered.fna.gz"; rm2 = LocalUtil::getQueryBaseName(in2) + "_removed.fna.gz"; } } -void QueryFilter::recordFilteredReads(const vectore & queryList) { - for (query:queryList){ +void QueryFilter::recordFilteredReads(const vector & queryList) { + for (auto query : queryList) { isFiltered[readCounter++] = query.isClassified; } } @@ -82,15 +84,15 @@ void QueryFilter::printFilteredReads() { for (size_t i = 0; i < readCounter; i ++) { // Read query reads filter_kseq1->ReadEntry(); - if (par.seqMode == 2) { filter_kseq2->ReadEntry(); } + if (seqMode == 2) { filter_kseq2->ReadEntry(); } // Print reads if (isFiltered[i]) { // Print filtered reads fprintf(f1_fp, ">%s\n%s\n", filter_kseq1->entry.name.s, 
filter_kseq1->entry.sequence.s); - if (par.seqMode == 2) { fprintf(f2_fp, ">%s\n%s\n", filter_kseq2->entry.name.s, filter_kseq2->entry.sequence.s); } + if (seqMode == 2) { fprintf(f2_fp, ">%s\n%s\n", filter_kseq2->entry.name.s, filter_kseq2->entry.sequence.s); } } else if (printMode == 2) { // Print removed reads fprintf(rm1_fp, ">%s\n%s\n", filter_kseq1->entry.name.s, filter_kseq1->entry.sequence.s); - if (par.seqMode == 2) { fprintf(rm2_fp, ">%s\n%s\n", filter_kseq2->entry.name.s, filter_kseq2->entry.sequence.s); } + if (seqMode == 2) { fprintf(rm2_fp, ">%s\n%s\n", filter_kseq2->entry.name.s, filter_kseq2->entry.sequence.s); } } } } @@ -151,7 +153,10 @@ void QueryFilter::filterReads(LocalParameters & par) { numOfTatalQueryKmerCnt += kmerBuffer.startIndexOfReserve; // Search matches between query and target k-mers - kmerMatcher->matchKmers(&kmerBuffer, &matchBuffer); + for (auto db : contams) { + kmerMatcher->matchKmers(&kmerBuffer, &matchBuffer, db); + } + kmerMatcher->sortMatches(&matchBuffer); // Classify queries based on the matches taxonomer->assignTaxonomy(matchBuffer.buffer, matchBuffer.startIndexOfReserve, queryList, par); @@ -166,7 +171,7 @@ void QueryFilter::filterReads(LocalParameters & par) { printFilteredReads(); reporter->writeReportFile(numOfSeq, taxonomer->getTaxCounts()); reporter->closeReadClassificationFile(); - + // Memory deallocation free(matchBuffer.buffer); delete kseq1; diff --git a/src/commons/QueryFilter.h b/src/commons/QueryFilter.h index 962a3c02..3dccb7e6 100644 --- a/src/commons/QueryFilter.h +++ b/src/commons/QueryFilter.h @@ -14,6 +14,8 @@ class QueryFilter { std::string dbDir; size_t matchPerKmer; int printMode; + int seqMode; + std::vector contams; // Agents QueryIndexer * queryIndexer; @@ -21,6 +23,7 @@ class QueryFilter { KmerMatcher * kmerMatcher; Taxonomer * taxonomer; Reporter * reporter; + NcbiTaxonomy * taxonomy; // Kseq KSeqWrapper* filter_kseq1; diff --git a/src/metabuli.cpp b/src/metabuli.cpp index 60f09cf9..54c447b5 100644 --- a/src/metabuli.cpp +++ b/src/metabuli.cpp @@ -41,7 +41,7 @@ std::vector commands = { {{"Directory where the DB will be generated", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::empty}, {"A list of FASTA files", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}, {"Mapping file (accession to tax ID)", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}, - {"database-report", databaseReport, &localPar.databaseReport, COMMAND_DATABASE_CREATION, + {"database-report", databaseReport, &localPar.databaseReport, COMMAND_DB, "It generates a report of taxa in a database.", nullptr, "Jaebeom Kim ", @@ -57,24 +57,24 @@ std::vector commands = { {{"DB directory to be updated", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::empty}, {"A list of FASTA files", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}, {"Mapping file (accession to tax ID)", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}, - {"classify", classify, &localPar.classify, COMMAND_TAXONOMY, - "Assigning taxonomy label to query reads", + {"classify", classify, &localPar.classify, COMMAND_MAIN, + "Assigning taxonomy label to query reads", nullptr, - "Jaebeom Kim ", - " ", - CITATION_SPACEPHARER, - {{"FASTA", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfile}, - {"DB dir", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}, - {"out dir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, 
&DbValidator::directory}, - {"job ID", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}, - {"filter", classify, &localPar.filter, COMMAND_TAXONOMY, + "Jaebeom Kim ", + " ", + CITATION_SPACEPHARER, + {{"FASTA", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfile}, + {"DB dir", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}, + {"out dir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}, + {"job ID", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}, + {"filter", classify, &localPar.filter, COMMAND_MAIN, "Filtering reads based on the classification result", nullptr, "Jaebeom Kim ", " ", CITATION_SPACEPHARER, {{"READ FILE", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfile}, - {"FILTER DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}}}, + {"FILTER DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}}}, {"grade", grade, &localPar.grade, COMMAND_EXPERT, "Grade the classification result (only for benchmarking)", nullptr, @@ -93,16 +93,15 @@ std::vector commands = { {{"read-classification", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}, {"Mapping file (accession to tax ID)", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}, {"add-to-library", addToLibrary, &localPar.addToLibrary, COMMAND_DATABASE_CREATION, - "It bins sequences into distinct files according to their species referring their accession number.\n " - "It requires a mapping file (accession to tax ID) and NCBI style tax dump files in a taxonomy directory.", - nullptr, + "It bins sequences into files according to their species.", + nullptr, "Jaebeom Kim ", " ", CITATION_SPACEPHARER, {{"List of absolute paths of files to be added. One path per line.", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}, {"NCBI style accession2taxid file. 
It should be consistent to tax dump files.", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}, {"DB directory", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}}}, - {"apply-threshold", applyThreshold, &localPar.applyThreshold, COMMAND_MAIN, + {"apply-threshold", applyThreshold, &localPar.applyThreshold, COMMAND_EXPERT, "Assigning taxonomy label to query reads", nullptr, "Jaebeom Kim ", diff --git a/src/workflow/add_to_library.cpp b/src/workflow/add_to_library.cpp index 0d533eec..4f8bfacc 100644 --- a/src/workflow/add_to_library.cpp +++ b/src/workflow/add_to_library.cpp @@ -6,6 +6,7 @@ #include #include "IndexCreator.h" #include "FileUtil.h" +#include using namespace std; diff --git a/src/workflow/filter.cpp b/src/workflow/filter.cpp index 40bfc797..e4328cb6 100644 --- a/src/workflow/filter.cpp +++ b/src/workflow/filter.cpp @@ -22,6 +22,7 @@ void setFilterDefaults(LocalParameters & par){ par.maskProb = 0.9; par.matchPerKmer = 4; par.printMode = 1; + par.contamList = ""; // TODO: set default } int filter(int argc, const char **argv, const Command& command) From 6c95cd8bffc9a1f12898b7139e8d2aaf7d314a4f Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Tue, 22 Aug 2023 17:22:04 +0900 Subject: [PATCH 18/65] binary taxonomy DB writing and reading --- lib/mmseqs/src/taxonomy/NcbiTaxonomy.h | 4 + src/commons/Classifier.cpp | 12 +-- src/commons/IndexCreator.cpp | 118 ++++++------------------- src/commons/IndexCreator.h | 26 +++--- src/commons/KmerMatcher.h | 3 + src/commons/QueryFilter.cpp | 9 +- src/commons/common.cpp | 85 ++++++++++++++---- src/commons/common.h | 2 + src/workflow/build.cpp | 23 ++--- src/workflow/classify.cpp | 2 +- src/workflow/filter.cpp | 2 +- 11 files changed, 140 insertions(+), 146 deletions(-) diff --git a/lib/mmseqs/src/taxonomy/NcbiTaxonomy.h b/lib/mmseqs/src/taxonomy/NcbiTaxonomy.h index 6e69dc81..204055c9 100644 --- a/lib/mmseqs/src/taxonomy/NcbiTaxonomy.h +++ b/lib/mmseqs/src/taxonomy/NcbiTaxonomy.h @@ -132,6 +132,10 @@ class NcbiTaxonomy { TaxID getTaxIdAtRank(int taxId, const std::string & rank); void createTaxIdListAtRank(std::vector & taxIdList, std::vector & taxIdListAtRank, const std::string & rank); + void setMmapData(char* data, size_t size) { + mmapData = data; + mmapSize = size; + } private: size_t loadNodes(std::vector &tmpNodes, const std::string &nodesFile); diff --git a/src/commons/Classifier.cpp b/src/commons/Classifier.cpp index 5b95e8f6..c426f889 100644 --- a/src/commons/Classifier.cpp +++ b/src/commons/Classifier.cpp @@ -6,10 +6,11 @@ Classifier::Classifier(LocalParameters & par) { matchPerKmer = par.matchPerKmer; // Taxonomy - if (par.taxonomyPath == "DBDIR/taxonomy/") par.taxonomyPath = dbDir + "/taxonomy/"; - taxonomy = new NcbiTaxonomy(par.taxonomyPath + "/names.dmp", - par.taxonomyPath + "/nodes.dmp", - par.taxonomyPath + "/merged.dmp"); + taxonomy = loadTaxonomy(dbDir, par.taxonomyPath); + // if (par.taxonomyPath == "DBDIR/taxonomy/") par.taxonomyPath = dbDir + "/taxonomy/"; + // taxonomy = new NcbiTaxonomy(par.taxonomyPath + "/names.dmp", + // par.taxonomyPath + "/nodes.dmp", + // par.taxonomyPath + "/merged.dmp"); // Agents queryIndexer = new QueryIndexer(par); @@ -48,7 +49,6 @@ void Classifier::startClassify(const LocalParameters &par) { vector queryList; size_t numOfTatalQueryKmerCnt = 0; - size_t totalMatchCnt = 0; size_t processedSeqCnt = 0; reporter->openReadClassificationFile(); @@ -134,7 +134,7 @@ void Classifier::startClassify(const LocalParameters &par) { } cout << "Number of query k-mers: " 
diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp
index be7c0f77..834f60a1 100644
--- a/src/commons/IndexCreator.cpp
+++ b/src/commons/IndexCreator.cpp
@@ -2,22 +2,33 @@
 #include
 
-IndexCreator::IndexCreator(const LocalParameters & par)
-{
-    dbDir = par.filenames[0];
-    fnaListFileName = par.filenames[1];
-    taxonomyDir = par.filenames[0] + "/taxonomy";
+IndexCreator::IndexCreator(const LocalParameters & par) {
+    // Parameters
     threadNum = par.threads;
     bufferSize = par.bufferSize;
+
+    // Input files
+    dbDir = par.filenames[0];
+    if (par.taxonomyPath.empty()) {
+        taxonomyDir = dbDir + "/taxonomy/";
+    } else {
+        taxonomyDir = par.taxonomyPath + "/";
+    }
+    cout << "Taxonomy path: " << par.taxonomyPath << endl;
+    fnaListFileName = par.filenames[1];
+    acc2taxidFileName = par.filenames[2];
+
+    // Output files
+    taxidListFileName = dbDir + "/taxID_list";
+    taxonomyBinaryFileName = dbDir + "/taxonomyDB";
+    versionFileName = dbDir + "/db.version";
 
     // Load taxonomy
     taxonomy = new NcbiTaxonomy(taxonomyDir + "/names.dmp",
                                 taxonomyDir + "/nodes.dmp",
                                 taxonomyDir + "/merged.dmp");
 
-    // ======================================================= //
-
     if (par.reducedAA == 1){
         MARKER = 0Xffffffff;
         MARKER = ~ MARKER;
@@ -30,27 +41,6 @@
     subMat = new NucleotideMatrix(par.scoringMatrixFile.values.nucleotide().c_str(), 1.0, 0.0);
 }
 
-IndexCreator::IndexCreator(const LocalParameters &par, string dbDir, string fnaListFileName, string acc2taxidFile)
-        : dbDir(std::move(dbDir)), fnaListFileName(std::move(fnaListFileName)),
-          taxonomyDir(par.taxonomyPath), acc2taxidFileName(std::move(acc2taxidFile))
-{
-    // Load taxonomy
-    taxonomy = new NcbiTaxonomy(this->taxonomyDir + "/names.dmp",
-                                this->taxonomyDir + "/nodes.dmp",
-                                this->taxonomyDir + "/merged.dmp");
-
-    if (par.reducedAA == 1){
-        MARKER = 0Xffffffff;
-        MARKER = ~ MARKER;
-    } else {
-        MARKER = 16777215;
-        MARKER = ~ MARKER;
-    }
-    tinfo_path = par.tinfoPath;
-
-    // For masking low complexity regions
-    subMat = new NucleotideMatrix(par.scoringMatrixFile.values.nucleotide().c_str(), 1.0, 0.0);
-}
 
 IndexCreator::~IndexCreator() {
     delete taxonomy;
@@ -64,7 +54,6 @@ void IndexCreator::createIndex(const LocalParameters &par) {
     cout << "Made blocks for each thread" << endl;
 
     // Write taxonomy id list
-    string taxidListFileName = dbDir + "/taxID_list";
     FILE * taxidListFile = fopen(taxidListFileName.c_str(), "w");
     for (auto & taxid : taxIdList) {
         fprintf(taxidListFile, "%d\n", taxid);
@@ -108,6 +97,7 @@ void IndexCreator::createIndex(const LocalParameters &par) {
         delete[] uniqKmerIdx;
     }
     delete[] splitChecker;
+    writeTaxonomyDB();
 }
 
 void IndexCreator::updateIndex(const LocalParameters &par) {
@@ -117,7 +107,7 @@ void IndexCreator::updateIndex(const LocalParameters &par) {
 
     // Train Prodigal for each species
     time_t prodigalStart = time(nullptr);
-    trainProdigal();
+    // trainProdigal();
     time_t prodigalEnd = time(nullptr);
     cout << "Prodigal training time: " << prodigalEnd - prodigalStart << " seconds" << endl;
 
@@ -826,64 +816,14 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer,
     return 0;
 }
 
-
-void IndexCreator::trainProdigal() {
-    // Train prodigal for each FASTA.
-#pragma omp parallel default(none), shared(cerr, fastaList, tinfo_path)
-    {
-        ProdigalWrapper prodigal;
-        kseq_buffer_t buffer;
-        kseq_t *seq;
-        size_t lengthOfTrainingSeq;
-#pragma omp for schedule(dynamic, 1)
-        for (size_t i = 0; i < fastaList.size(); i++) {
-            FASTA &currentFasta = fastaList[i];
-            TaxID currentSpecies = currentFasta.speciesID;
-            string fileName = tinfo_path + to_string(currentSpecies) + ".tinfo";
-
-            // Skip if the training file for current species already exists.
-            if (fileExist(fileName)) {
-                cerr << "Training file for " << currentSpecies << " already exists. Skip." << endl;
-                continue;
-            }
-
-            // Load sequence for training.
-            struct MmapedData fastaFile = mmapData(currentFasta.path.c_str());
-            buffer = {const_cast(&fastaFile.data[currentFasta.sequences[currentFasta.trainingSeqIdx].start]),
-                      static_cast(currentFasta.sequences[currentFasta.trainingSeqIdx].length)};
-            seq = kseq_init(&buffer);
-            kseq_read(seq);
-
-            // Train prodigal.
-            prodigal.is_meta = 0;
-            lengthOfTrainingSeq = seq->seq.l;
-            if (lengthOfTrainingSeq < 100'000) {
-                prodigal.is_meta = 1;
-                prodigal.trainMeta(seq->seq.s);
-            } else {
-                prodigal.trainASpecies(seq->seq.s);
-            }
-
-            // Write training result into a file for later use.
-            _training *tinfo = prodigal.getTrainingInfo();
-            write_training_file(const_cast(fileName.c_str()), tinfo);
-
-            kseq_destroy(seq);
-            munmap(fastaFile.data, fastaFile.fileSize + 1);
-        }
+void IndexCreator::writeTaxonomyDB() {
+    std::pair serialized = NcbiTaxonomy::serialize(*taxonomy);
+    FILE *handle = fopen(taxonomyBinaryFileName.c_str(), "w");
+    if (handle == NULL) {
+        Debug(Debug::ERROR) << "Could not open " << taxonomyBinaryFileName << " for writing\n";
+        EXIT(EXIT_FAILURE);
     }
-// // TODO: Write species ID of newly trained species into a file.
-// // Write trained species into a file.
-// for (int i = 0; i < threadNum; i++) { -// for (auto &species : newSpeciesList[i]) { -// trainedSpecies.push_back(species); -// } -// } -// FILE *fp = fopen((tinfo_path + "/species-list.txt").c_str(), "w"); -// for (int trainedSpecie: trainedSpecies) { -// fprintf(fp, "%d\n", trainedSpecie); -// } -// fclose(fp); + fwrite(serialized.first, serialized.second, sizeof(char), handle); + fclose(handle); + free(serialized.first); } - - diff --git a/src/commons/IndexCreator.h b/src/commons/IndexCreator.h index 8635cdae..39336e9a 100644 --- a/src/commons/IndexCreator.h +++ b/src/commons/IndexCreator.h @@ -37,20 +37,24 @@ using namespace std; class IndexCreator{ private: uint64_t MARKER; - string tinfo_path; - string tinfo_list; - vector trainedSpecies; - unordered_map trainingInfo; - int threadNum; BaseMatrix *subMat; - // parameters + // Parameters + int threadNum; + size_t bufferSize; + + // Inputs NcbiTaxonomy * taxonomy; string dbDir; string fnaListFileName; string taxonomyDir; string acc2taxidFileName; - size_t bufferSize; + + + // Outputs + string taxidListFileName; + string taxonomyBinaryFileName; + string versionFileName; struct FASTA { string path; @@ -93,13 +97,14 @@ class IndexCreator{ size_t numOfFlush=0; - void trainProdigal(); - -// void writeTargetFiles(TargetKmer * kmerBuffer, size_t & kmerNum, const char * outputFileName,const vector & taxIdList); void writeTargetFiles(TargetKmer * kmerBuffer, size_t & kmerNum, const LocalParameters & par, const size_t * uniqeKmerIdx, size_t & uniqKmerCnt); void writeTargetFilesAndSplits(TargetKmer * kmerBuffer, size_t & kmerNum, const LocalParameters & par, const size_t * uniqeKmerIdx, size_t & uniqKmerCnt); + void writeDiffIdx(uint16_t *buffer, FILE* handleKmerTable, uint16_t *toWrite, size_t size, size_t & localBufIdx ); + + void writeTaxonomyDB(); + static bool compareForDiffIdx(const TargetKmer & a, const TargetKmer & b); // void maskLowComplexityRegions(char * seq, char * maskedSeq, ProbabilityMatrix & probMat, @@ -146,7 +151,6 @@ class IndexCreator{ unordered_map & foundAcc2taxid); static void getSeqSegmentsWithHead(vector & seqSegments, const char * seqFileName); IndexCreator(const LocalParameters & par); - IndexCreator(const LocalParameters & par, string dbDir, string fnaListFileName, string acc2taxidFile); IndexCreator() {taxonomy = nullptr;} ~IndexCreator(); int getNumOfFlush(); diff --git a/src/commons/KmerMatcher.h b/src/commons/KmerMatcher.h index 05b61dc7..36b5d8b3 100644 --- a/src/commons/KmerMatcher.h +++ b/src/commons/KmerMatcher.h @@ -100,6 +100,9 @@ class KmerMatcher { const string &db = string()); void sortMatches(Buffer *matchBuffer); + + // Getters + size_t getTotalMatchCnt() const { return totalMatchCnt; } }; inline uint64_t KmerMatcher::getNextTargetKmer(uint64_t lookingTarget, diff --git a/src/commons/QueryFilter.cpp b/src/commons/QueryFilter.cpp index c3ed99b3..b22bc3fe 100644 --- a/src/commons/QueryFilter.cpp +++ b/src/commons/QueryFilter.cpp @@ -9,10 +9,11 @@ QueryFilter::QueryFilter(LocalParameters & par) { contams = Util::split(par.contamList, ","); // Taxonomy - if (par.taxonomyPath == "DBDIR/taxonomy/") par.taxonomyPath = dbDir + "/taxonomy/"; - taxonomy = new NcbiTaxonomy(par.taxonomyPath + "/names.dmp", - par.taxonomyPath + "/nodes.dmp", - par.taxonomyPath + "/merged.dmp"); + taxonomy = loadTaxonomy(dbDir, par.taxonomyPath); + // if (par.taxonomyPath == "DBDIR/taxonomy/") par.taxonomyPath = dbDir + "/taxonomy/"; + // taxonomy = new NcbiTaxonomy(par.taxonomyPath + "/names.dmp", + // par.taxonomyPath + 
"/nodes.dmp", + // par.taxonomyPath + "/merged.dmp"); // Agents queryIndexer = new QueryIndexer(par); diff --git a/src/commons/common.cpp b/src/commons/common.cpp index 6fbc84f5..57b5bc79 100644 --- a/src/commons/common.cpp +++ b/src/commons/common.cpp @@ -1,25 +1,74 @@ #include "common.h" -#include +#include "FileUtil.h" +#include "NcbiTaxonomy.h" #include +#include #include +// #include "MathUtil.h" +#include "Debug.h" +#include "Util.h" +#include "sys/mman.h" -void process_mem_usage(double& vm_usage, double& resident_set) -{ - vm_usage = 0.0; - resident_set = 0.0; +// #include +// #include +// #include - // the two fields we want - unsigned long vsize; - long rss; - { - std::string ignore; - std::ifstream ifs("/proc/self/stat", std::ios_base::in); - ifs >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore - >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore - >> ignore >> ignore >> vsize >> rss; - } +void process_mem_usage(double &vm_usage, double &resident_set) { + vm_usage = 0.0; + resident_set = 0.0; - long page_size_kb = sysconf(_SC_PAGE_SIZE) / 1024; // in case x86-64 is configured to use 2MB pages - vm_usage = vsize / 1024.0; - resident_set = rss * page_size_kb; + // the two fields we want + unsigned long vsize; + long rss; + { + std::string ignore; + std::ifstream ifs("/proc/self/stat", std::ios_base::in); + ifs >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> + ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> + ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> + ignore >> vsize >> rss; + } + + long page_size_kb = sysconf(_SC_PAGE_SIZE) / + 1024; // in case x86-64 is configured to use 2MB pages + vm_usage = vsize / 1024.0; + resident_set = rss * page_size_kb; } + +// Mostly copied from lib/mmseqs/src/taxonomy/NcbiTaxonomy.cpp +NcbiTaxonomy *loadTaxonomy(const std::string &dbDir, + const std::string &taxonomyDir) { + std::string binFile = dbDir + "/taxonomyDB"; + if (fileExist(binFile)) { + FILE *handle = fopen(binFile.c_str(), "r"); + struct stat sb; + if (fstat(fileno(handle), &sb) < 0) { + Debug(Debug::ERROR) << "Failed to fstat file " << binFile << "\n"; + EXIT(EXIT_FAILURE); + } + char *data = (char *)mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, + fileno(handle), 0); + if (data == MAP_FAILED) { + Debug(Debug::ERROR) << "Failed to mmap file " << binFile << " with error " + << errno << "\n"; + EXIT(EXIT_FAILURE); + } + fclose(handle); + NcbiTaxonomy *t = NcbiTaxonomy::unserialize(data); + if (t != NULL) { + t->setMmapData(data, sb.st_size); + return t; + } else { + Debug(Debug::WARNING) << "Outdated taxonomy information, please recreate " + "with createtaxdb.\n"; + } + } else if (taxonomyDir != "") { + return new NcbiTaxonomy(taxonomyDir + "/names.dmp", + taxonomyDir + "/nodes.dmp", + taxonomyDir + "/merged.dmp"); + } + + return new NcbiTaxonomy(dbDir + "/taxonomy/names.dmp", + dbDir + "/taxonomy/nodes.dmp", + dbDir + "/taxonomy/merged.dmp"); +} \ No newline at end of file diff --git a/src/commons/common.h b/src/commons/common.h index 9b499da2..4e2fcbcd 100644 --- a/src/commons/common.h +++ b/src/commons/common.h @@ -81,4 +81,6 @@ inline bool fileExist(const std::string& name) { void process_mem_usage(double& vm_usage, double& resident_set); +NcbiTaxonomy * loadTaxonomy(const std::string & dbDir, const std::string & taxonomyDir = ""); + #endif //ADCLASSIFIER2_COMMON_H diff --git a/src/workflow/build.cpp 
b/src/workflow/build.cpp index 80addbb0..f78311cc 100644 --- a/src/workflow/build.cpp +++ b/src/workflow/build.cpp @@ -9,7 +9,7 @@ void setDefaults_build(LocalParameters & par){ par.spaceMask = "11111111"; par.taxonomyPath = "" ; par.splitNum = 4096; - par.maskProb = 0.5; + par.maskProb = 0.9; par.maskMode = 0; par.bufferSize = 1'000'000'000; } @@ -19,23 +19,14 @@ int build(int argc, const char **argv, const Command &command){ LocalParameters &par = LocalParameters::getLocalInstance(); setDefaults_build(par); par.parseParameters(argc, argv, command, true, Parameters::PARSE_ALLOW_EMPTY, 0); - string dbDirectory = par.filenames[0]; - string fastaListPath = par.filenames[1]; - string mappingFile = par.filenames[2]; - if (par.taxonomyPath.empty()) { - par.taxonomyPath = dbDirectory + "/taxonomy/"; - } else { - par.taxonomyPath = par.taxonomyPath + "/"; - } - + // If dbDirectory does not exist, create it - if (!FileUtil::directoryExists(dbDirectory.c_str())) { - FileUtil::makeDir(dbDirectory.c_str()); + if (!FileUtil::directoryExists(par.filenames[0].c_str())) { + FileUtil::makeDir(par.filenames[0].c_str()); } - cout << "Taxonomy path: " << par.taxonomyPath << endl; - - IndexCreator idxCre(par, dbDirectory, fastaListPath, mappingFile); + // Create index + IndexCreator idxCre(par); idxCre.createIndex(par); if(idxCre.getNumOfFlush() == 1) { @@ -43,7 +34,7 @@ int build(int argc, const char **argv, const Command &command){ return 0; } - //Merge files + // Merge index files cout << "Merge reference DB files ... " << endl; int numOfSplits = idxCre.getNumOfFlush(); FileMerger merger(par); diff --git a/src/workflow/classify.cpp b/src/workflow/classify.cpp index 21a45f17..36062932 100644 --- a/src/workflow/classify.cpp +++ b/src/workflow/classify.cpp @@ -16,7 +16,7 @@ void setClassifyDefaults(LocalParameters & par){ par.minCoveredPos = 4; par.printLog = 0; par.maxGap = 0; - par.taxonomyPath = "DBDIR/taxonomy/" ; + par.taxonomyPath = "" ; par.minConsCnt = 4; par.minConsCntEuk = 9; par.eukaryotaTaxId = 2759; diff --git a/src/workflow/filter.cpp b/src/workflow/filter.cpp index e4328cb6..d3d3a08e 100644 --- a/src/workflow/filter.cpp +++ b/src/workflow/filter.cpp @@ -14,7 +14,7 @@ void setFilterDefaults(LocalParameters & par){ par.minCoveredPos = 4; par.printLog = 0; par.maxGap = 0; - par.taxonomyPath = "DBDIR/taxonomy/" ; + par.taxonomyPath = "" ; par.minConsCnt = 4; par.minConsCntEuk = 9; par.eukaryotaTaxId = 2759; From df1316b6a2cce95b9a62628a550611e7f34c433d Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Tue, 22 Aug 2023 17:22:26 +0900 Subject: [PATCH 19/65] binary taxonomy DB writing and reading --- src/workflow/add_to_library.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/workflow/add_to_library.cpp b/src/workflow/add_to_library.cpp index 4f8bfacc..de51bd8f 100644 --- a/src/workflow/add_to_library.cpp +++ b/src/workflow/add_to_library.cpp @@ -16,7 +16,6 @@ void setDefaults_addToLibrary(LocalParameters & par){ } // Group sequences by species -// int addToLibrary(int argc, const char **argv, const Command &command){ LocalParameters &par = LocalParameters::getLocalInstance(); setDefaults_addToLibrary(par); From f72f541ca8ceb0ab5adff54889ff55451e6e1ed2 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Thu, 24 Aug 2023 10:44:43 +0900 Subject: [PATCH 20/65] first running code for filtering module --- lib/prodigal/bitmap.cpp | 18 ++++ lib/prodigal/bitmap.h | 17 +--- src/commons/Classifier.cpp | 9 +- src/commons/IndexCreator.cpp | 43 ++++++-- src/commons/IndexCreator.h | 6 +- 
src/commons/KmerExtractor.h | 2 - src/commons/KmerMatcher.cpp | 168 +++++++++++++++++++++++++------- src/commons/KmerMatcher.h | 2 + src/commons/QueryFilter.cpp | 47 ++++++--- src/commons/QueryFilter.h | 6 +- src/commons/Reporter.cpp | 42 +++++--- src/commons/Reporter.h | 15 ++- src/commons/common.cpp | 23 +++++ src/commons/common.h | 3 + src/metabuli.cpp | 2 +- src/workflow/add_to_library.cpp | 11 +-- src/workflow/build.cpp | 2 +- src/workflow/classify.cpp | 5 +- src/workflow/filter.cpp | 16 +-- 19 files changed, 323 insertions(+), 114 deletions(-) diff --git a/lib/prodigal/bitmap.cpp b/lib/prodigal/bitmap.cpp index 76abc9d8..b000cb6e 100644 --- a/lib/prodigal/bitmap.cpp +++ b/lib/prodigal/bitmap.cpp @@ -21,4 +21,22 @@ #include "bitmap.h" /* Test a bit, 0 = not set, 1 = set */ + unsigned char test(unsigned char *bm, int ndx) { + return ( bm[ndx>>3] & (1 << (ndx&0x07))?1:0 ); + } + +/* Clear a bit (set it to 0) */ + void clear(unsigned char *bm, int ndx) { + bm[ndx>>3] &= ~(1 << (ndx&0x07)); + } + +/* Set a bit to 1 */ + void set(unsigned char *bm, int ndx) { + bm[ndx>>3] |= (1 << (ndx&0x07)); + } + +/* Flip a bit's value 0->1 or 1->0 */ + void toggle(unsigned char *bm, int ndx) { + bm[ndx>>3] ^= (1 << (ndx&0x07)); + } diff --git a/lib/prodigal/bitmap.h b/lib/prodigal/bitmap.h index 74eb4370..4253f071 100644 --- a/lib/prodigal/bitmap.h +++ b/lib/prodigal/bitmap.h @@ -21,23 +21,16 @@ #ifndef BITMAP_H_ #define BITMAP_H_ - unsigned char static test(unsigned char *bm, int ndx) { - return ( bm[ndx>>3] & (1 << (ndx&0x07))?1:0 ); - } +/* Test a bit, 0 = not set, 1 = set */ +unsigned char test(unsigned char *bm, int ndx); /* Clear a bit (set it to 0) */ - void static clear(unsigned char *bm, int ndx) { - bm[ndx>>3] &= ~(1 << (ndx&0x07)); - } +void clear(unsigned char *bm, int ndx); /* Set a bit to 1 */ - void static set(unsigned char *bm, int ndx) { - bm[ndx>>3] |= (1 << (ndx&0x07)); - } +void set(unsigned char *bm, int ndx); /* Flip a bit's value 0->1 or 1->0 */ - void static toggle(unsigned char *bm, int ndx) { - bm[ndx>>3] ^= (1 << (ndx&0x07)); - } +void toggle(unsigned char *bm, int ndx); #endif diff --git a/src/commons/Classifier.cpp b/src/commons/Classifier.cpp index c426f889..9258a194 100644 --- a/src/commons/Classifier.cpp +++ b/src/commons/Classifier.cpp @@ -1,16 +1,15 @@ #include "Classifier.h" +#include "FileUtil.h" +#include "common.h" Classifier::Classifier(LocalParameters & par) { // Load parameters dbDir = par.filenames[1 + (par.seqMode == 2)]; matchPerKmer = par.matchPerKmer; - + loadDbParameters(par); + // Taxonomy taxonomy = loadTaxonomy(dbDir, par.taxonomyPath); - // if (par.taxonomyPath == "DBDIR/taxonomy/") par.taxonomyPath = dbDir + "/taxonomy/"; - // taxonomy = new NcbiTaxonomy(par.taxonomyPath + "/names.dmp", - // par.taxonomyPath + "/nodes.dmp", - // par.taxonomyPath + "/merged.dmp"); // Agents queryIndexer = new QueryIndexer(par); diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp index 834f60a1..7f611b93 100644 --- a/src/commons/IndexCreator.cpp +++ b/src/commons/IndexCreator.cpp @@ -1,11 +1,13 @@ #include "IndexCreator.h" - +#include #include IndexCreator::IndexCreator(const LocalParameters & par) { // Parameters threadNum = par.threads; bufferSize = par.bufferSize; + reducedAA = par.reducedAA; + spaceMask = par.spaceMask; // Input files dbDir = par.filenames[0]; @@ -18,11 +20,11 @@ IndexCreator::IndexCreator(const LocalParameters & par) { fnaListFileName = par.filenames[1]; acc2taxidFileName = par.filenames[2]; - // Output files taxidListFileName 
= dbDir + "/taxID_list"; taxonomyBinaryFileName = dbDir + "/taxonomyDB"; versionFileName = dbDir + "/db.version"; + paramterFileName = dbDir + "/db.parameters"; // Load taxonomy taxonomy = new NcbiTaxonomy(taxonomyDir + "/names.dmp", @@ -53,6 +55,11 @@ void IndexCreator::createIndex(const LocalParameters &par) { makeBlocksForParallelProcessing(); cout << "Made blocks for each thread" << endl; + // Print fnaSplits + for (auto & fnaSplit : fnaSplits) { + cout << fnaSplit.offset << " " << fnaSplit.cnt << " " << fnaSplit.speciesID << " " << fnaSplit.file_idx << " " << fnaSplit.training << endl; + } + // Write taxonomy id list FILE * taxidListFile = fopen(taxidListFileName.c_str(), "w"); for (auto & taxid : taxIdList) { @@ -98,6 +105,7 @@ void IndexCreator::createIndex(const LocalParameters &par) { } delete[] splitChecker; writeTaxonomyDB(); + writeDbParameters(); } void IndexCreator::updateIndex(const LocalParameters &par) { @@ -207,22 +215,34 @@ void IndexCreator::splitFastaForProdigalTraining(int file_idx, TaxID speciesID) bool stored = false; while(seqIdx < fastaList[file_idx].sequences.size()){ stored = false; + + // Skip if(speciesID == 0) { seqIdx++; continue;} + // Length currLength = fastaList[file_idx].sequences[seqIdx].length; if (currLength > maxLength){ maxLength = currLength; seqForTraining = seqIdx; } lengthSum += currLength; + cnt ++; + // Check the size of current split if(lengthSum > 100'000'000 || cnt > 300 || (cnt > 100 && lengthSum > 50'000'000)){ - tempSplits.emplace_back(0, offset, cnt - 1, speciesID, file_idx); - offset += cnt - 1; + tempSplits.emplace_back(0, offset, cnt, speciesID, file_idx); + offset += cnt; lengthSum = 0; - cnt = 1; + cnt = 0; stored = true; } + // if(lengthSum > 100'000'000 || cnt > 300 || (cnt > 100 && lengthSum > 50'000'000)){ + // tempSplits.emplace_back(0, offset, cnt - 1, speciesID, file_idx); + // offset += cnt - 1; + // lengthSum = 0; + // cnt = 1; + // stored = true; + // } seqIdx ++; } if(!stored){ @@ -801,7 +821,7 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer, cout << omp_get_thread_num() << " Processed " << i << "th splits (" << processedSplitCnt << ")" << endl; #endif munmap(fastaFile.data, fastaFile.fileSize + 1); - }else { + } else { // Withdraw the reservation if the buffer is full. cout << "Buffer is full. Withdraw the reservation." 
<< endl; checker[i] = false; @@ -827,3 +847,14 @@ void IndexCreator::writeTaxonomyDB() { fclose(handle); free(serialized.first); } + +void IndexCreator::writeDbParameters() { + FILE *handle = fopen(paramterFileName.c_str(), "w"); + if (handle == NULL) { + Debug(Debug::ERROR) << "Could not open " << paramterFileName << " for writing\n"; + EXIT(EXIT_FAILURE); + } + fprintf(handle, "Reduced_alphabet\t%d\n", reducedAA); + fprintf(handle, "Spaced_kmer_mask\t%s\n", spaceMask.c_str()); + fclose(handle); +} diff --git a/src/commons/IndexCreator.h b/src/commons/IndexCreator.h index 39336e9a..8f343717 100644 --- a/src/commons/IndexCreator.h +++ b/src/commons/IndexCreator.h @@ -42,6 +42,8 @@ class IndexCreator{ // Parameters int threadNum; size_t bufferSize; + int reducedAA; + string spaceMask; // Inputs NcbiTaxonomy * taxonomy; @@ -50,11 +52,11 @@ class IndexCreator{ string taxonomyDir; string acc2taxidFileName; - // Outputs string taxidListFileName; string taxonomyBinaryFileName; string versionFileName; + string paramterFileName; struct FASTA { string path; @@ -105,6 +107,8 @@ class IndexCreator{ void writeTaxonomyDB(); + void writeDbParameters(); + static bool compareForDiffIdx(const TargetKmer & a, const TargetKmer & b); // void maskLowComplexityRegions(char * seq, char * maskedSeq, ProbabilityMatrix & probMat, diff --git a/src/commons/KmerExtractor.h b/src/commons/KmerExtractor.h index 2e7f2977..14226262 100644 --- a/src/commons/KmerExtractor.h +++ b/src/commons/KmerExtractor.h @@ -29,8 +29,6 @@ class KmerExtractor { const QuerySplit & currentSplit, const LocalParameters &par); - - public: explicit KmerExtractor(const LocalParameters & par); ~KmerExtractor(); diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp index 3f9a1495..5b352c5f 100644 --- a/src/commons/KmerMatcher.cpp +++ b/src/commons/KmerMatcher.cpp @@ -1,4 +1,7 @@ #include "KmerMatcher.h" +#include "Kmer.h" +#include "Mmap.h" +#include KmerMatcher::KmerMatcher(const LocalParameters & par, NcbiTaxonomy * taxonomy) { @@ -11,48 +14,132 @@ KmerMatcher::KmerMatcher(const LocalParameters & par, MARKER = ~ MARKER; totalMatchCnt = 0; - // Load the taxonomy ID list - FILE * taxIdFile; - if((taxIdFile = fopen((dbDir + "/taxID_list").c_str(),"r")) == NULL){ - std::cout<<"Cannot open the taxID list file."<taxonNode(taxId); - if (taxId == taxon->taxId) { - TaxID speciesTaxID = taxonomy->getTaxIdAtRank(taxId, "species"); - TaxID genusTaxID = taxonomy->getTaxIdAtRank(taxId, "genus"); - while (taxon->taxId != speciesTaxID) { - taxId2speciesId[taxon->taxId] = speciesTaxID; - taxId2genusId[taxon->taxId] = genusTaxID; - taxon = taxonomy->taxonNode(taxon->parentTaxId); - } - taxId2speciesId[speciesTaxID] = speciesTaxID; - taxId2genusId[speciesTaxID] = genusTaxID; - } else { - TaxID speciesTaxID = taxonomy->getTaxIdAtRank(taxId, "species"); - TaxID genusTaxID = taxonomy->getTaxIdAtRank(taxId, "genus"); - while (taxon->taxId != speciesTaxID) { - taxId2speciesId[taxon->taxId] = speciesTaxID; - taxId2genusId[taxon->taxId] = genusTaxID; - taxon = taxonomy->taxonNode(taxon->parentTaxId); - } - taxId2speciesId[speciesTaxID] = speciesTaxID; - taxId2genusId[speciesTaxID] = genusTaxID; - taxId2speciesId[taxId] = speciesTaxID; - taxId2genusId[taxId] = genusTaxID; - } - } - fclose(taxIdFile); + this->taxonomy = taxonomy; + loadTaxIdList(par); + + // // Load the taxonomy ID list + // FILE * taxIdFile; + // if((taxIdFile = fopen((dbDir + "/taxID_list").c_str(),"r")) == NULL){ + // std::cout<<"Cannot open the taxID list file."<taxonNode(taxId); + // 
if (taxId == taxon->taxId) { + // TaxID speciesTaxID = taxonomy->getTaxIdAtRank(taxId, "species"); + // TaxID genusTaxID = taxonomy->getTaxIdAtRank(taxId, "genus"); + // while (taxon->taxId != speciesTaxID) { + // taxId2speciesId[taxon->taxId] = speciesTaxID; + // taxId2genusId[taxon->taxId] = genusTaxID; + // taxon = taxonomy->taxonNode(taxon->parentTaxId); + // } + // taxId2speciesId[speciesTaxID] = speciesTaxID; + // taxId2genusId[speciesTaxID] = genusTaxID; + // } else { + // TaxID speciesTaxID = taxonomy->getTaxIdAtRank(taxId, "species"); + // TaxID genusTaxID = taxonomy->getTaxIdAtRank(taxId, "genus"); + // while (taxon->taxId != speciesTaxID) { + // taxId2speciesId[taxon->taxId] = speciesTaxID; + // taxId2genusId[taxon->taxId] = genusTaxID; + // taxon = taxonomy->taxonNode(taxon->parentTaxId); + // } + // taxId2speciesId[speciesTaxID] = speciesTaxID; + // taxId2genusId[speciesTaxID] = genusTaxID; + // taxId2speciesId[taxId] = speciesTaxID; + // taxId2genusId[taxId] = genusTaxID; + // } + // } + // fclose(taxIdFile); } KmerMatcher::~KmerMatcher() { } +void KmerMatcher::loadTaxIdList(const LocalParameters & par) { + if (par.contamList != "") { + vector contams = Util::split(par.contamList, ","); + for (auto &contam : contams) { + FILE *taxIdFile; + cout << dbDir + "/" + contam + "/taxID_list" << endl; + if ((taxIdFile = fopen((dbDir + "/" + contam + "/taxID_list").c_str(), "r")) == NULL) { + std::cout << "Cannot open the taxID list file." << std::endl; + return; + } + char taxID[100]; + while (feof(taxIdFile) == 0) { + fscanf(taxIdFile, "%s", taxID); + TaxID taxId = atol(taxID); + TaxonNode const *taxon = taxonomy->taxonNode(taxId); + if (taxId == taxon->taxId) { + TaxID speciesTaxID = taxonomy->getTaxIdAtRank(taxId, "species"); + TaxID genusTaxID = taxonomy->getTaxIdAtRank(taxId, "genus"); + while (taxon->taxId != speciesTaxID) { + taxId2speciesId[taxon->taxId] = speciesTaxID; + taxId2genusId[taxon->taxId] = genusTaxID; + taxon = taxonomy->taxonNode(taxon->parentTaxId); + } + taxId2speciesId[speciesTaxID] = speciesTaxID; + taxId2genusId[speciesTaxID] = genusTaxID; + } else { + TaxID speciesTaxID = taxonomy->getTaxIdAtRank(taxId, "species"); + TaxID genusTaxID = taxonomy->getTaxIdAtRank(taxId, "genus"); + while (taxon->taxId != speciesTaxID) { + taxId2speciesId[taxon->taxId] = speciesTaxID; + taxId2genusId[taxon->taxId] = genusTaxID; + taxon = taxonomy->taxonNode(taxon->parentTaxId); + } + taxId2speciesId[speciesTaxID] = speciesTaxID; + taxId2genusId[speciesTaxID] = genusTaxID; + taxId2speciesId[taxId] = speciesTaxID; + taxId2genusId[taxId] = genusTaxID; + } + } + fclose(taxIdFile); + } + } else { + FILE *taxIdFile; + if ((taxIdFile = fopen((dbDir + "/taxID_list").c_str(), "r")) == NULL) { + std::cout << "Cannot open the taxID list file." 
<< std::endl; + return; + } + char taxID[100]; + while (feof(taxIdFile) == 0) { + fscanf(taxIdFile, "%s", taxID); + TaxID taxId = atol(taxID); + TaxonNode const *taxon = taxonomy->taxonNode(taxId); + if (taxId == taxon->taxId) { + TaxID speciesTaxID = taxonomy->getTaxIdAtRank(taxId, "species"); + TaxID genusTaxID = taxonomy->getTaxIdAtRank(taxId, "genus"); + while (taxon->taxId != speciesTaxID) { + taxId2speciesId[taxon->taxId] = speciesTaxID; + taxId2genusId[taxon->taxId] = genusTaxID; + taxon = taxonomy->taxonNode(taxon->parentTaxId); + } + taxId2speciesId[speciesTaxID] = speciesTaxID; + taxId2genusId[speciesTaxID] = genusTaxID; + } else { + TaxID speciesTaxID = taxonomy->getTaxIdAtRank(taxId, "species"); + TaxID genusTaxID = taxonomy->getTaxIdAtRank(taxId, "genus"); + while (taxon->taxId != speciesTaxID) { + taxId2speciesId[taxon->taxId] = speciesTaxID; + taxId2genusId[taxon->taxId] = genusTaxID; + taxon = taxonomy->taxonNode(taxon->parentTaxId); + } + taxId2speciesId[speciesTaxID] = speciesTaxID; + taxId2genusId[speciesTaxID] = genusTaxID; + taxId2speciesId[taxId] = speciesTaxID; + taxId2genusId[taxId] = genusTaxID; + } + } + fclose(taxIdFile); + } + cout << "Taxonomy ID list is loaded." << endl; +} + int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer, Buffer * matchBuffer, @@ -73,6 +160,15 @@ int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer, MmapedData diffIdxSplits = mmapData(diffIdxSplitFileName.c_str(), 3); size_t numOfDiffIdx = FileUtil::getFileSize(targetDiffIdxFileName) / sizeof(uint16_t); + MmapedData tempInfos = mmapData(targetInfoFileName.c_str(), 3); + size_t numOfInfos = tempInfos.fileSize / sizeof(TargetKmerInfo); + + // Print kmer infos + for (size_t i = 0; i < numOfInfos; i++) { + cout << (int) tempInfos.data[i].sequenceID << " " << (int) tempInfos.data[i].redundancy << endl; + } + + size_t queryKmerNum = queryKmerBuffer->startIndexOfReserve; QueryKmer *queryKmerList = queryKmerBuffer->buffer; diff --git a/src/commons/KmerMatcher.h b/src/commons/KmerMatcher.h index 36b5d8b3..cdf23e30 100644 --- a/src/commons/KmerMatcher.h +++ b/src/commons/KmerMatcher.h @@ -91,6 +91,8 @@ class KmerMatcher { static bool compareMatches(const Match &a, const Match &b); + void loadTaxIdList(const LocalParameters & par); + public: KmerMatcher(const LocalParameters &par, NcbiTaxonomy *taxonomy); diff --git a/src/commons/QueryFilter.cpp b/src/commons/QueryFilter.cpp index b22bc3fe..100cd7f2 100644 --- a/src/commons/QueryFilter.cpp +++ b/src/commons/QueryFilter.cpp @@ -1,4 +1,5 @@ #include "QueryFilter.h" +#include "common.h" QueryFilter::QueryFilter(LocalParameters & par) { // Load parameters @@ -7,14 +8,11 @@ QueryFilter::QueryFilter(LocalParameters & par) { printMode = par.printMode; seqMode = par.seqMode; contams = Util::split(par.contamList, ","); + loadDbParameters(par); // Taxonomy taxonomy = loadTaxonomy(dbDir, par.taxonomyPath); - // if (par.taxonomyPath == "DBDIR/taxonomy/") par.taxonomyPath = dbDir + "/taxonomy/"; - // taxonomy = new NcbiTaxonomy(par.taxonomyPath + "/names.dmp", - // par.taxonomyPath + "/nodes.dmp", - // par.taxonomyPath + "/merged.dmp"); - + // Agents queryIndexer = new QueryIndexer(par); kmerExtractor = new KmerExtractor(par); @@ -22,13 +20,19 @@ QueryFilter::QueryFilter(LocalParameters & par) { else { kmerMatcher = new KmerMatcher(par, taxonomy);} taxonomer = new Taxonomer(par, taxonomy); reporter = new Reporter(par, taxonomy); - setInputAndOutputFiles(par); + reporter->setReadClassificationFileName(readClassificationFileName); + 
reporter->setReportFileName(reportFileName); + cout << "Filtered reads: " << f1 << endl; + if (par.seqMode == 2) { cout << "Filtered reads: " << f2 << endl; } + if (printMode == 2) { + cout << "Removed reads: " << rm1 << endl; + if (par.seqMode == 2) { cout << "Removed reads: " << rm2 << endl; } + } + filter_kseq1 = KSeqFactory(in1.c_str()); if (par.seqMode == 2) { filter_kseq2 = KSeqFactory(in2.c_str()); } - isFiltered = new bool[queryIndexer->getReadNum_1()]; - memset(isFiltered, 0, sizeof(bool) * queryIndexer->getReadNum_1()); readCounter = 0; // Open output files @@ -38,6 +42,8 @@ QueryFilter::QueryFilter(LocalParameters & par) { rm1_fp = fopen(rm1.c_str(), "w"); if (par.seqMode == 2) { rm2_fp = fopen(rm2.c_str(), "w"); } } + + } QueryFilter::~QueryFilter() { @@ -59,19 +65,22 @@ QueryFilter::~QueryFilter() { } void QueryFilter::setInputAndOutputFiles(const LocalParameters & par) { + cout << "Setting output file names" << endl; // Get the base name of in1 in1 = par.filenames[0]; string baseName = LocalUtil::getQueryBaseName(in1); // Set the output file names - f1 = baseName + "_filtered.fna.gz"; - rm1 = baseName + "_removed.fna.gz"; + f1 = baseName + "_filtered.fna"; + rm1 = baseName + "_removed.fna"; + reportFileName = baseName + "_report.tsv"; + readClassificationFileName = baseName + "_classifications.tsv"; // For paired-end reads if (seqMode == 2) { in2 = par.filenames[1]; - f2 = LocalUtil::getQueryBaseName(in2) + "_filtered.fna.gz"; - rm2 = LocalUtil::getQueryBaseName(in2) + "_removed.fna.gz"; + f2 = LocalUtil::getQueryBaseName(in2) + "_filtered.fna"; + rm2 = LocalUtil::getQueryBaseName(in2) + "_removed.fna"; } } @@ -88,7 +97,7 @@ void QueryFilter::printFilteredReads() { if (seqMode == 2) { filter_kseq2->ReadEntry(); } // Print reads - if (isFiltered[i]) { // Print filtered reads + if (!isFiltered[i]) { // Print filtered reads fprintf(f1_fp, ">%s\n%s\n", filter_kseq1->entry.name.s, filter_kseq1->entry.sequence.s); if (seqMode == 2) { fprintf(f2_fp, ">%s\n%s\n", filter_kseq2->entry.name.s, filter_kseq2->entry.sequence.s); } } else if (printMode == 2) { // Print removed reads @@ -105,16 +114,21 @@ void QueryFilter::filterReads(LocalParameters & par) { size_t numOfSeq = queryIndexer->getReadNum_1(); size_t totalReadLength = queryIndexer->getTotalReadLength(); const vector & queryReadSplit = queryIndexer->getQuerySplits(); + // print queryReadSplit + // for (size_t i = 0; i < queryReadSplit.size(); i++) { + // cout << queryReadSplit[i].start << " " << queryReadSplit[i].end << " " << queryReadSplit[i].kmerCnt << endl; + // } cout << "Done" << endl; cout << "Total number of sequences: " << numOfSeq << endl; cout << "Total read length: " << totalReadLength << "nt" << endl; + isFiltered = new bool[queryIndexer->getReadNum_1()]; + memset(isFiltered, 0, sizeof(bool) * queryIndexer->getReadNum_1()); QueryKmerBuffer kmerBuffer; Buffer matchBuffer; vector queryList; size_t numOfTatalQueryKmerCnt = 0; - size_t totalMatchCnt = 0; size_t processedSeqCnt = 0; reporter->openReadClassificationFile(); @@ -169,8 +183,11 @@ void QueryFilter::filterReads(LocalParameters & par) { recordFilteredReads(queryList); } + + cout << "Number of query k-mers: " << numOfTatalQueryKmerCnt << endl; + cout << "The number of matches: " << kmerMatcher->getTotalMatchCnt() << endl; printFilteredReads(); - reporter->writeReportFile(numOfSeq, taxonomer->getTaxCounts()); + reporter->writeReportFile(numOfSeq, taxonomer->getTaxCounts(), false); reporter->closeReadClassificationFile(); // Memory deallocation diff --git 
a/src/commons/QueryFilter.h b/src/commons/QueryFilter.h index 3dccb7e6..a53455c4 100644 --- a/src/commons/QueryFilter.h +++ b/src/commons/QueryFilter.h @@ -29,7 +29,11 @@ class QueryFilter { KSeqWrapper* filter_kseq1; KSeqWrapper* filter_kseq2; - std::string in1, in2, f1, f2, rm1, rm2; // input and output file names + std::string in1, in2; + std::string f1, f2, rm1, rm2; // input and output file names + std::string readClassificationFileName; + std::string reportFileName; + bool * isFiltered; size_t readCounter; FILE * f1_fp, * f2_fp, * rm1_fp, * rm2_fp; diff --git a/src/commons/Reporter.cpp b/src/commons/Reporter.cpp index 566e8c66..0c6b9cd1 100644 --- a/src/commons/Reporter.cpp +++ b/src/commons/Reporter.cpp @@ -1,18 +1,26 @@ #include "Reporter.h" #include "taxonomyreport.cpp" -Reporter::Reporter(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxonomy(taxonomy){ - if (par.seqMode == 2) { - outDir = par.filenames[3]; - jobId = par.filenames[4]; - } else { - outDir = par.filenames[2]; - jobId = par.filenames[3]; +Reporter::Reporter(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxonomy(taxonomy) { + if (par.contamList == "") { // classify module + if (par.seqMode == 2) { + outDir = par.filenames[3]; + jobId = par.filenames[4]; + } else { + outDir = par.filenames[2]; + jobId = par.filenames[3]; + } + // Output file names + reportFileName = outDir + + "/" + jobId + "_report.tsv"; + readClassificationFileName = outDir + "/" + jobId + "_classifications.tsv"; } + + + } void Reporter::openReadClassificationFile() { - readClassificationFile.open(outDir + "/" + jobId + "_classifications.tsv"); + readClassificationFile.open(readClassificationFileName); } void Reporter::writeReadClassification(const vector & queryList, bool classifiedOnly) { @@ -38,20 +46,22 @@ void Reporter::closeReadClassificationFile() { readClassificationFile.close(); } -void Reporter::writeReportFile(int numOfQuery, unordered_map &taxCnt) { +void Reporter::writeReportFile(int numOfQuery, unordered_map &taxCnt, bool krona) { unordered_map cladeCounts = taxonomy->getCladeCounts(taxCnt); FILE *fp; - fp = fopen((outDir + + "/" + jobId + "_report.tsv").c_str(), "w"); + fp = fopen((reportFileName).c_str(), "w"); writeReport(fp, cladeCounts, numOfQuery); fclose(fp); // Write Krona chart - FILE *kronaFile = fopen((outDir + "/" + jobId + "_krona.html").c_str(), "w"); - fwrite(krona_prelude_html, krona_prelude_html_len, sizeof(char), kronaFile); - fprintf(kronaFile, "%zu", numOfQuery); - kronaReport(kronaFile, *taxonomy, cladeCounts, numOfQuery); - fprintf(kronaFile, ""); - + if (krona) { + FILE *kronaFile = fopen((outDir + "/" + jobId + "_krona.html").c_str(), "w"); + fwrite(krona_prelude_html, krona_prelude_html_len, sizeof(char), kronaFile); + fprintf(kronaFile, "%zu", numOfQuery); + kronaReport(kronaFile, *taxonomy, cladeCounts, numOfQuery); + fprintf(kronaFile, ""); + fclose(kronaFile); + } } void Reporter::writeReport(FILE *FP, const std::unordered_map &cladeCounts, diff --git a/src/commons/Reporter.h b/src/commons/Reporter.h index d64e567c..4745bf26 100644 --- a/src/commons/Reporter.h +++ b/src/commons/Reporter.h @@ -17,13 +17,14 @@ class Reporter { NcbiTaxonomy * taxonomy; // Output + string reportFileName; + string readClassificationFileName; ofstream readClassificationFile; - public: Reporter(const LocalParameters &par, NcbiTaxonomy *taxonomy); // Write report - void writeReportFile(int numOfQuery, unordered_map &taxCnt); + void writeReportFile(int numOfQuery, unordered_map &taxCnt, bool krona = true); void 
writeReport(FILE *FP, const std::unordered_map &cladeCounts, unsigned long totalReads, TaxID taxID = 0, int depth = 0); @@ -32,10 +33,16 @@ class Reporter { void writeReadClassification(const vector & queryList, bool classifiedOnly = false); void closeReadClassificationFile(); - - unsigned int cladeCountVal(const std::unordered_map &map, TaxID key); + // Setter + void setReportFileName(const string &reportFileName) { + Reporter::reportFileName = reportFileName; + } + + void setReadClassificationFileName(const string &readClassificationFileName) { + Reporter::readClassificationFileName = readClassificationFileName; + } }; diff --git a/src/commons/common.cpp b/src/commons/common.cpp index 57b5bc79..92d7c735 100644 --- a/src/commons/common.cpp +++ b/src/commons/common.cpp @@ -6,6 +6,7 @@ #include // #include "MathUtil.h" #include "Debug.h" +#include "Reporter.h" #include "Util.h" #include "sys/mman.h" @@ -71,4 +72,26 @@ NcbiTaxonomy *loadTaxonomy(const std::string &dbDir, return new NcbiTaxonomy(dbDir + "/taxonomy/names.dmp", dbDir + "/taxonomy/nodes.dmp", dbDir + "/taxonomy/merged.dmp"); +} + +int loadDbParameters(LocalParameters &par) { + std::string dbDir = par.filenames[1 + (par.seqMode == 2)]; + if (fileExist(dbDir + "/db.parameters")) { + // open db.parameters + std::ifstream dbParametersFile; + dbParametersFile.open(dbDir + "/db.parameters"); + std::string eachLine; + if (dbParametersFile.is_open()) { + while (getline(dbParametersFile, eachLine)) { + std::vector tokens = Util::split(eachLine, "\t"); + if (tokens[0] == "Reduced_alphabet") { + par.reducedAA = stoi(tokens[1]); + } else if (tokens[0] == "Spaced_kmer_mask") { + par.spaceMask = tokens[1]; + } + } + return 1; + } + } + return 0; } \ No newline at end of file diff --git a/src/commons/common.h b/src/commons/common.h index 4e2fcbcd..7749c39a 100644 --- a/src/commons/common.h +++ b/src/commons/common.h @@ -1,6 +1,7 @@ #ifndef ADCLASSIFIER2_COMMON_H #define ADCLASSIFIER2_COMMON_H #include +#include "LocalParameters.h" #include "NcbiTaxonomy.h" #include @@ -83,4 +84,6 @@ void process_mem_usage(double& vm_usage, double& resident_set); NcbiTaxonomy * loadTaxonomy(const std::string & dbDir, const std::string & taxonomyDir = ""); +int loadDbParameters(LocalParameters & par); + #endif //ADCLASSIFIER2_COMMON_H diff --git a/src/metabuli.cpp b/src/metabuli.cpp index 54c447b5..7783d91f 100644 --- a/src/metabuli.cpp +++ b/src/metabuli.cpp @@ -67,7 +67,7 @@ std::vector commands = { {"DB dir", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}, {"out dir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}, {"job ID", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}, - {"filter", classify, &localPar.filter, COMMAND_MAIN, + {"filter", filter, &localPar.filter, COMMAND_MAIN, "Filtering reads based on the classification result", nullptr, "Jaebeom Kim ", diff --git a/src/workflow/add_to_library.cpp b/src/workflow/add_to_library.cpp index de51bd8f..e5f97eba 100644 --- a/src/workflow/add_to_library.cpp +++ b/src/workflow/add_to_library.cpp @@ -6,6 +6,7 @@ #include #include "IndexCreator.h" #include "FileUtil.h" +#include "common.h" #include using namespace std; @@ -33,10 +34,7 @@ int addToLibrary(int argc, const char **argv, const Command &command){ } // Load taxonomy - string names = par.taxonomyPath + "/names.dmp"; - string nodes = par.taxonomyPath + "/nodes.dmp"; - string merged = par.taxonomyPath + "/merged.dmp"; - NcbiTaxonomy ncbiTaxonomy(names, nodes, merged); + NcbiTaxonomy * 
taxonomy = loadTaxonomy(dbDir); // Load file names ifstream fileListFile; @@ -91,7 +89,7 @@ int addToLibrary(int argc, const char **argv, const Command &command){ } // Get species taxID - int speciesTaxID = ncbiTaxonomy.getTaxIdAtRank(acc2taxid[accession], "species"); + int speciesTaxID = taxonomy->getTaxIdAtRank(acc2taxid[accession], "species"); // Skip if species taxID is not found if (speciesTaxID == 0) { @@ -157,7 +155,7 @@ int addToLibrary(int argc, const char **argv, const Command &command){ } // Get species taxID - int speciesTaxID = ncbiTaxonomy.getTaxIdAtRank(assembly2taxid[assemblyID], "species"); + int speciesTaxID = taxonomy->getTaxIdAtRank(assembly2taxid[assemblyID], "species"); if (speciesTaxID == 0) { cout << "During processing " << fileNames[i] << ", accession " << assemblyID << " is not matched to any species. It is skipped." << endl; @@ -196,5 +194,6 @@ int addToLibrary(int argc, const char **argv, const Command &command){ } fclose(file); } + delete taxonomy; return EXIT_SUCCESS; } \ No newline at end of file diff --git a/src/workflow/build.cpp b/src/workflow/build.cpp index f78311cc..b5f9e571 100644 --- a/src/workflow/build.cpp +++ b/src/workflow/build.cpp @@ -10,7 +10,7 @@ void setDefaults_build(LocalParameters & par){ par.taxonomyPath = "" ; par.splitNum = 4096; par.maskProb = 0.9; - par.maskMode = 0; + par.maskMode = 1; par.bufferSize = 1'000'000'000; } diff --git a/src/workflow/classify.cpp b/src/workflow/classify.cpp index 36062932..514f6ae8 100644 --- a/src/workflow/classify.cpp +++ b/src/workflow/classify.cpp @@ -2,14 +2,15 @@ #include "Parameters.h" #include "LocalParameters.h" #include "FileUtil.h" +#include "common.h" void setClassifyDefaults(LocalParameters & par){ - par.seqMode = 2; par.reducedAA = 0; + par.spaceMask = "11111111"; + par.seqMode = 2; par.minScore = 0; par.minCoverage = 0; par.minSpScore = 0; - par.spaceMask = "11111111"; par.hammingMargin = 0; par.verbosity = 3; par.ramUsage = 128; diff --git a/src/workflow/filter.cpp b/src/workflow/filter.cpp index d3d3a08e..3d0b0448 100644 --- a/src/workflow/filter.cpp +++ b/src/workflow/filter.cpp @@ -1,13 +1,13 @@ #include "LocalParameters.h" #include "QueryFilter.h" -void setFilterDefaults(LocalParameters & par){ - par.seqMode = 2; +void setFilterDefaults(LocalParameters & par) { par.reducedAA = 0; - par.minScore = 0.7; + par.spaceMask = "11111111"; + par.seqMode = 2; + par.minScore = 0.5; par.minCoverage = 0; par.minSpScore = 0; - par.spaceMask = "11111111"; par.hammingMargin = 0; par.verbosity = 3; par.ramUsage = 128; @@ -25,8 +25,7 @@ void setFilterDefaults(LocalParameters & par){ par.contamList = ""; // TODO: set default } -int filter(int argc, const char **argv, const Command& command) -{ +int filter(int argc, const char **argv, const Command& command) { LocalParameters & par = LocalParameters::getLocalInstance(); setFilterDefaults(par); par.parseParameters(argc, argv, command, true, Parameters::PARSE_ALLOW_EMPTY, 0); @@ -35,6 +34,11 @@ int filter(int argc, const char **argv, const Command& command) omp_set_num_threads(par.threads); #endif + if (par.contamList == "") { + cerr << "Error: Contamination list is not specified." 
<< endl; + return 1; + } + QueryFilter * queryFilter = new QueryFilter(par); queryFilter->filterReads(par); From bde8c6b26da19a54a6398d53b122ed5814bd67de Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Thu, 24 Aug 2023 15:35:42 +0900 Subject: [PATCH 21/65] first running version --- src/commons/IndexCreator.cpp | 130 +++++++++++++++++++++++++++-------- src/commons/IndexCreator.h | 13 +++- src/commons/KmerMatcher.cpp | 9 --- src/commons/LocalUtil.cpp | 23 ++++++- src/commons/LocalUtil.h | 4 ++ src/commons/SeqIterator.cpp | 4 -- src/commons/Taxonomer.cpp | 1 - 7 files changed, 138 insertions(+), 46 deletions(-) diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp index 7f611b93..318d3040 100644 --- a/src/commons/IndexCreator.cpp +++ b/src/commons/IndexCreator.cpp @@ -1,4 +1,7 @@ #include "IndexCreator.h" +#include "FileUtil.h" +#include "LocalUtil.h" +#include #include #include @@ -27,9 +30,9 @@ IndexCreator::IndexCreator(const LocalParameters & par) { paramterFileName = dbDir + "/db.parameters"; // Load taxonomy - taxonomy = new NcbiTaxonomy(taxonomyDir + "/names.dmp", - taxonomyDir + "/nodes.dmp", - taxonomyDir + "/merged.dmp"); + // taxonomy = new NcbiTaxonomy(taxonomyDir + "/names.dmp", + // taxonomyDir + "/nodes.dmp", + // taxonomyDir + "/merged.dmp"); if (par.reducedAA == 1){ MARKER = 0Xffffffff; @@ -55,11 +58,6 @@ void IndexCreator::createIndex(const LocalParameters &par) { makeBlocksForParallelProcessing(); cout << "Made blocks for each thread" << endl; - // Print fnaSplits - for (auto & fnaSplit : fnaSplits) { - cout << fnaSplit.offset << " " << fnaSplit.cnt << " " << fnaSplit.speciesID << " " << fnaSplit.file_idx << " " << fnaSplit.training << endl; - } - // Write taxonomy id list FILE * taxidListFile = fopen(taxidListFileName.c_str(), "w"); for (auto & taxid : taxIdList) { @@ -162,7 +160,10 @@ void IndexCreator::updateIndex(const LocalParameters &par) { void IndexCreator::makeBlocksForParallelProcessing(){ unordered_map acc2taxid; - load_accession2taxid(acc2taxidFileName, acc2taxid); + TaxID maxTaxID = load_accession2taxid(acc2taxidFileName, acc2taxid); + newTaxID = maxTaxID + 1; + + vector>> newAcc2taxid; // accession.version -> (parent, newTaxID) // Make blocks of sequences that can be processed in parallel int fileNum = getNumberOfLines(fnaListFileName); @@ -176,18 +177,42 @@ void IndexCreator::makeBlocksForParallelProcessing(){ } string eachFile; string seqHeader; + string accession_version; + string accession; + vector tempTaxIDList; - unordered_map foundAcc2taxid; for (int i = 0; i < fileNum; ++i) { // Get start and end position of each sequence in the file getline(fnaListFile, eachFile); fastaList[i].path = eachFile; processedSeqCnt.push_back(taxIdList.size()); - seqHeader = getSeqSegmentsWithHead(fastaList[i].sequences, eachFile, acc2taxid, foundAcc2taxid); - seqHeader = seqHeader.substr(1, seqHeader.find('.') - 1); - TaxID speciesTaxid = taxonomy->getTaxIdAtRank(acc2taxid[seqHeader], "species"); - // Split current file into blocks for parallel processing + + seqHeader = getSeqSegmentsWithHead(fastaList[i].sequences, eachFile, acc2taxid, newAcc2taxid); + // accession_version = seqHeader.substr(1, seqHeader.find('.') - 1); + accession = seqHeader.substr(1, seqHeader.find('.') - 1); + accession_version = seqHeader.substr(1, LocalUtil::getFirstWhiteSpacePos(seqHeader) - 1); + // newAcc2taxid.emplace_back(accession_version, make_pair(acc2taxid[accession], newTaxID)); + tempTaxIDList.push_back(acc2taxid[accession]); + + // TaxID speciesTaxid = 
taxonomy->getTaxIdAtRank(acc2taxid[accession], "species"); + + // // Split current file into blocks for parallel processing + // splitFastaForProdigalTraining(i, speciesTaxid); + // fastaList[i].speciesID = speciesTaxid; + } + + // Edit taxonomy dump files + editTaxonomyDumpFiles(newAcc2taxid); + + // Load taxonomy + taxonomy = new NcbiTaxonomy(taxonomyDir + "/names.dmp.new", + taxonomyDir + "/nodes.dmp.new", + taxonomyDir + "/merged.dmp"); + + + for (int i = 0; i < fileNum; ++i) { + TaxID speciesTaxid = taxonomy->getTaxIdAtRank(tempTaxIDList[i], "species"); splitFastaForProdigalTraining(i, speciesTaxid); fastaList[i].speciesID = speciesTaxid; } @@ -196,8 +221,8 @@ void IndexCreator::makeBlocksForParallelProcessing(){ // Write accession to taxid map to file string acc2taxidFileName2 = dbDir + "/acc2taxid.map"; FILE * acc2taxidFile = fopen(acc2taxidFileName2.c_str(), "w"); - for (auto it = foundAcc2taxid.begin(); it != foundAcc2taxid.end(); ++it) { - fprintf(acc2taxidFile, "%s\t%d\n", it->first.c_str(), it->second); + for (auto it : newAcc2taxid) { + fprintf(acc2taxidFile, "%s\t%d\t%d\n", it.first.c_str(), it.second.first, it.second.second); } fclose(acc2taxidFile); @@ -256,7 +281,8 @@ void IndexCreator::splitFastaForProdigalTraining(int file_idx, TaxID speciesID) fastaList[file_idx].trainingSeqIdx = seqForTraining; } -void IndexCreator::load_accession2taxid(const string & mappingFileName, unordered_map & acc2taxid) { +TaxID IndexCreator::load_accession2taxid(const string & mappingFileName, unordered_map & acc2taxid) { + TaxID maxTaxID = 0; cerr << "Load mapping from accession ID to taxonomy ID ... " << flush; string eachLine; string eachItem; @@ -266,11 +292,15 @@ void IndexCreator::load_accession2taxid(const string & mappingFileName, unordere fscanf(mappingFile, "%*s\t%*s\t%*s\t%*s"); while (fscanf(mappingFile, "%s\t%*s\t%d\t%*d", buffer, &taxID) == 2 ){ acc2taxid[string(buffer)] = taxID; + if (taxID > maxTaxID) { + maxTaxID = taxID; + } } } else { cerr << "Cannot open file for mapping from accession to tax ID" << endl; } cerr << "Done" << endl; + return maxTaxID; } // This function sort the TargetKmerBuffer, do redundancy reducing task, write the differential index of them @@ -444,7 +474,8 @@ void IndexCreator::reduceRedundancy(TargetKmerBuffer & kmerBuffer, size_t * uniq break; } taxIds.push_back(taxIdList[kmerBuffer.buffer[i].info.sequenceID]); - hasSeenOtherStrains += (taxIdList[lookingKmer->info.sequenceID] != taxIdList[kmerBuffer.buffer[i].info.sequenceID]); + hasSeenOtherStrains += (taxonomy->taxonNode(taxIdList[lookingKmer->info.sequenceID])->parentTaxId + != taxonomy->taxonNode(taxIdList[kmerBuffer.buffer[i].info.sequenceID]) -> parentTaxId); i++; if(i == splits[split].end + 1){ endFlag = 1; @@ -569,7 +600,7 @@ void IndexCreator::splitSequenceFile(vector & seqSegments, Mmaped string IndexCreator::getSeqSegmentsWithHead(vector & seqSegments, const string & seqFileName, const unordered_map & acc2taxid, - unordered_map & foundAcc2taxid) { + vector>> & newAcc2taxid) { struct stat stat1{}; stat(seqFileName.c_str(), &stat1); size_t numOfChar = stat1.st_size; @@ -580,24 +611,33 @@ string IndexCreator::getSeqSegmentsWithHead(vector & seqSegments, size_t start = 0; size_t pos; vector seqSegmentsTmp; - vector headers; - size_t seqCnt = taxIdList.size(); + string accession; + string accession_version; + if (seqFile.is_open()) { getline(seqFile, firstLine, '\n'); + accession = firstLine.substr(1, firstLine.find('.') - 1); + accession_version = firstLine.substr(1, 
LocalUtil::getFirstWhiteSpacePos(firstLine) - 1); + newAcc2taxid.emplace_back(accession_version, make_pair(acc2taxid.at(accession), newTaxID)); + taxIdList.push_back(newTaxID++); // cout << firstLine << endl; - taxIdList.push_back(acc2taxid.at(firstLine.substr(1, firstLine.find('.') - 1))); - foundAcc2taxid[firstLine.substr(1, firstLine.find(' ') - 1)] = taxIdList.back(); + // taxIdList.push_back(acc2taxid.at(firstLine.substr(1, firstLine.find('.') - 1))); + // foundAcc2taxid[firstLine.substr(1, firstLine.find(' ') - 1)] = taxIdList.back(); while (getline(seqFile, eachLine, '\n')) { if (eachLine[0] == '>') { + accession = eachLine.substr(1, eachLine.find('.') - 1); + accession_version = eachLine.substr(1, LocalUtil::getFirstWhiteSpacePos(eachLine) - 1); + newAcc2taxid.emplace_back(accession_version, make_pair(acc2taxid.at(accession), newTaxID)); + taxIdList.push_back(newTaxID++); // cout << eachLine << endl; - taxIdList.push_back(acc2taxid.at(eachLine.substr(1, eachLine.find('.') - 1))); - foundAcc2taxid[eachLine.substr(1, eachLine.find(' ') - 1)] = taxIdList.back(); + // taxIdList.push_back(acc2taxid.at(eachLine.substr(1, eachLine.find('.') - 1))); + // foundAcc2taxid[eachLine.substr(1, eachLine.find(' ') - 1)] = taxIdList.back(); pos = (size_t) seqFile.tellg(); seqSegmentsTmp.emplace_back(start, pos - eachLine.length() - 3,pos - eachLine.length() - start - 2); start = pos - eachLine.length() - 1; } } - seqSegmentsTmp.emplace_back(start, numOfChar - 2, numOfChar - start - 1, seqCnt); + seqSegmentsTmp.emplace_back(start, numOfChar - 2, numOfChar - start - 1); } else { cerr << "Unable to open file: " << seqFileName << endl; } @@ -608,7 +648,7 @@ string IndexCreator::getSeqSegmentsWithHead(vector & seqSegments, void IndexCreator::getSeqSegmentsWithHead(vector & seqSegments, const char * seqFileName) { struct stat stat1{}; - int a = stat(seqFileName, &stat1); + stat(seqFileName, &stat1); size_t numOfChar = stat1.st_size; ifstream seqFile; @@ -858,3 +898,39 @@ void IndexCreator::writeDbParameters() { fprintf(handle, "Spaced_kmer_mask\t%s\n", spaceMask.c_str()); fclose(handle); } + +void IndexCreator::editTaxonomyDumpFiles(const vector>> & newAcc2taxid) { + // Edit names.dmp + string nameFileName = taxonomyDir + "/names.dmp"; + string newNameFileName = taxonomyDir + "/names.dmp.new"; + FileUtil::copyFile(nameFileName.c_str(), newNameFileName.c_str()); + FILE *nameFile = fopen(newNameFileName.c_str(), "a"); + if (nameFile == NULL) { + Debug(Debug::ERROR) << "Could not open " << newNameFileName << " for writing\n"; + EXIT(EXIT_FAILURE); + } + + for (size_t i = 0; i < newAcc2taxid.size() - 1; i++) { + fprintf(nameFile, "%d\t|\t%s\t|\t\t|\tscientific name\t|\n", newAcc2taxid[i].second.second, newAcc2taxid[i].first.c_str()); + } + fprintf(nameFile, "%d\t|\t%s\t|\t\t|\tscientific name\t|", newAcc2taxid.back().second.second, newAcc2taxid.back().first.c_str()); + fclose(nameFile); + + // Edit nodes.dmp + string nodeFileName = taxonomyDir + "/nodes.dmp"; + string newNodeFileName = taxonomyDir + "/nodes.dmp.new"; + FileUtil::copyFile(nodeFileName.c_str(), newNodeFileName.c_str()); + FILE *nodeFile = fopen(newNodeFileName.c_str(), "a"); + if (nodeFile == NULL) { + Debug(Debug::ERROR) << "Could not open " << newNodeFileName << " for writing\n"; + EXIT(EXIT_FAILURE); + } + + for (size_t i = 0; i < newAcc2taxid.size() - 1; i++) { + fprintf(nodeFile, "%d\t|\t%d\t|\t\t|\tscientific name\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\n", newAcc2taxid[i].second.second, newAcc2taxid[i].second.first); + } + 
fprintf(nodeFile, "%d\t|\t%d\t|\t\t|\tscientific name\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|", newAcc2taxid.back().second.second, newAcc2taxid.back().second.first); + fclose(nodeFile); + + // Edit node.dmp +} \ No newline at end of file diff --git a/src/commons/IndexCreator.h b/src/commons/IndexCreator.h index 8f343717..de31e394 100644 --- a/src/commons/IndexCreator.h +++ b/src/commons/IndexCreator.h @@ -18,6 +18,7 @@ #include "NucleotideMatrix.h" #include "SubstitutionMatrix.h" #include "tantan.h" +#include "LocalUtil.h" #ifdef OPENMP @@ -65,6 +66,7 @@ class IndexCreator{ vector sequences; }; + TaxID newTaxID; vector fastaList; vector taxIdList; vector processedSeqCnt; // Index of this vector is the same as the index of fnaList @@ -128,7 +130,10 @@ class IndexCreator{ } void load_assacc2taxid(const string & mappingFile, unordered_map & assacc2taxid); - static void load_accession2taxid(const string & mappingFile, unordered_map & assacc2taxid); + + static TaxID load_accession2taxid(const string & mappingFile, unordered_map & assacc2taxid); + + void editTaxonomyDumpFiles(const vector>> & newAcc2taxid); void reduceRedundancy(TargetKmerBuffer & kmerBuffer, size_t * uniqeKmerIdx, size_t & uniqKmerCnt, const LocalParameters & par); @@ -150,9 +155,11 @@ class IndexCreator{ public: static void splitSequenceFile(vector & seqSegments, MmapedData seqFile); - string getSeqSegmentsWithHead(vector & seqSegments, const string & seqFileName, + string getSeqSegmentsWithHead(vector & seqSegments, + const string & seqFileName, const unordered_map & acc2taxid, - unordered_map & foundAcc2taxid); + vector>> & newAcc2taxid); + static void getSeqSegmentsWithHead(vector & seqSegments, const char * seqFileName); IndexCreator(const LocalParameters & par); IndexCreator() {taxonomy = nullptr;} diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp index 5b352c5f..39b9a13d 100644 --- a/src/commons/KmerMatcher.cpp +++ b/src/commons/KmerMatcher.cpp @@ -160,15 +160,6 @@ int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer, MmapedData diffIdxSplits = mmapData(diffIdxSplitFileName.c_str(), 3); size_t numOfDiffIdx = FileUtil::getFileSize(targetDiffIdxFileName) / sizeof(uint16_t); - MmapedData tempInfos = mmapData(targetInfoFileName.c_str(), 3); - size_t numOfInfos = tempInfos.fileSize / sizeof(TargetKmerInfo); - - // Print kmer infos - for (size_t i = 0; i < numOfInfos; i++) { - cout << (int) tempInfos.data[i].sequenceID << " " << (int) tempInfos.data[i].redundancy << endl; - } - - size_t queryKmerNum = queryKmerBuffer->startIndexOfReserve; QueryKmer *queryKmerList = queryKmerBuffer->buffer; diff --git a/src/commons/LocalUtil.cpp b/src/commons/LocalUtil.cpp index 08e04508..d981f238 100644 --- a/src/commons/LocalUtil.cpp +++ b/src/commons/LocalUtil.cpp @@ -1,4 +1,5 @@ #include "LocalUtil.h" +#include std::string LocalUtil::getQueryBaseName(const std::string & queryPath) { @@ -19,7 +20,6 @@ std::string LocalUtil::getQueryBaseName(const std::string & queryPath) { } - void LocalUtil::splitQueryFile(std::vector & sequences, const std::string &queryPath) { KSeqWrapper* kseq = nullptr; kseq = KSeqFactory(queryPath.c_str()); @@ -41,4 +41,23 @@ int LocalUtil::getMaxCoveredLength(int queryLength) { } else { return queryLength - 3; // 3 } -} \ No newline at end of file +} + +int LocalUtil::getFirstWhiteSpacePos(const std::string &str) { + for (size_t i = 0; i < str.size(); ++i) { + if (isspace(int(str[i]))) { + return i; + } + } + return str.size(); +} + +// std::string LocalUtil::getAccessionFromHeader(const 
std::string &header) { +// int pos = getFirstWhiteSpacePos(header); +// std::string accession = header.substr(0, pos); +// std::vector splits = Util::split(accession, "."); +// if (splits.size() > 1) { +// accession = splits[0]; +// } +// return std::stoi(accession.substr(3)); +// } \ No newline at end of file diff --git a/src/commons/LocalUtil.h b/src/commons/LocalUtil.h index 0fcbdf82..7f511239 100644 --- a/src/commons/LocalUtil.h +++ b/src/commons/LocalUtil.h @@ -18,6 +18,10 @@ class LocalUtil : public Util { static void splitQueryFile(std::vector & seqSegments, const std::string & queryPath); static int getMaxCoveredLength(int queryLength) ; + + static int getFirstWhiteSpacePos(const std::string & str); + + // static std::string getAccessionFromHeader(const std::string & header); }; diff --git a/src/commons/SeqIterator.cpp b/src/commons/SeqIterator.cpp index 262d1258..525ac651 100644 --- a/src/commons/SeqIterator.cpp +++ b/src/commons/SeqIterator.cpp @@ -1,7 +1,3 @@ -// -// Created by KJB on 01/09/2020. -// - #include "SeqIterator.h" const string SeqIterator::atcg = "................................................................" diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index 85ef5339..58ee0722 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -250,7 +250,6 @@ TaxID Taxonomer::lowerRankClassification(vector &matches, pair uint8_t minHamming = matches[i].hamming; Match * minHammingMatch = & matches[i]; TaxID minHammingTaxId = minHammingMatch->targetId; - bool first = true; i --; while ( (i >= matchRange.first) && (currQuotient == matches[i].qInfo.pos / 3) ) { if (matches[i].hamming < minHamming) { From 585b2d8ce8b49cc07be1037829dd1d08faa66b65 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Tue, 29 Aug 2023 10:15:47 +0900 Subject: [PATCH 22/65] fix DB reproducibility problem --- src/commons/IndexCreator.cpp | 195 ++++++++++++++++++++++++++------ src/commons/IndexCreator.h | 10 ++ src/commons/LocalParameters.cpp | 12 +- src/commons/LocalParameters.h | 2 + src/commons/ProdigalWrapper.cpp | 65 +++++------ src/commons/Taxonomer.cpp | 1 - src/commons/common.cpp | 5 + 7 files changed, 221 insertions(+), 69 deletions(-) diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp index 318d3040..c8f8df7a 100644 --- a/src/commons/IndexCreator.cpp +++ b/src/commons/IndexCreator.cpp @@ -1,7 +1,9 @@ #include "IndexCreator.h" #include "FileUtil.h" #include "LocalUtil.h" +#include "ProdigalWrapper.h" #include +#include #include #include @@ -11,7 +13,8 @@ IndexCreator::IndexCreator(const LocalParameters & par) { bufferSize = par.bufferSize; reducedAA = par.reducedAA; spaceMask = par.spaceMask; - + accessionLevel = par.accessionLevel; + // Input files dbDir = par.filenames[0]; if (par.taxonomyPath.empty()) { @@ -29,10 +32,11 @@ IndexCreator::IndexCreator(const LocalParameters & par) { versionFileName = dbDir + "/db.version"; paramterFileName = dbDir + "/db.parameters"; - // Load taxonomy - // taxonomy = new NcbiTaxonomy(taxonomyDir + "/names.dmp", - // taxonomyDir + "/nodes.dmp", - // taxonomyDir + "/merged.dmp"); + if (!par.accessionLevel){ + taxonomy = new NcbiTaxonomy(taxonomyDir + "/names.dmp", + taxonomyDir + "/nodes.dmp", + taxonomyDir + "/merged.dmp"); + } if (par.reducedAA == 1){ MARKER = 0Xffffffff; @@ -55,7 +59,12 @@ IndexCreator::~IndexCreator() { void IndexCreator::createIndex(const LocalParameters &par) { // Read through FASTA files and make blocks of sequences to be processed by each thread - 
makeBlocksForParallelProcessing(); + if (par.accessionLevel) { + makeBlocksForParallelProcessing_accession_level(); + } else { + makeBlocksForParallelProcessing(); + } + cout << "Made blocks for each thread" << endl; // Write taxonomy id list @@ -81,16 +90,22 @@ void IndexCreator::createIndex(const LocalParameters &par) { // Extract Target k-mers fillTargetKmerBuffer(kmerBuffer, splitChecker, processedSplitCnt, par); - time_t start = time(nullptr); // Sort the k-mers + time_t start = time(nullptr); SORT_PARALLEL(kmerBuffer.buffer, kmerBuffer.buffer + kmerBuffer.startIndexOfReserve, - IndexCreator::compareForDiffIdx); + IndexCreator::compareForDiffIdx2); time_t sort = time(nullptr); cout << "Sort time: " << sort - start << endl; auto * uniqKmerIdx = new size_t[kmerBuffer.startIndexOfReserve + 1]; size_t uniqKmerCnt = 0; + // Print out the k-mers + string tmpFileName = dbDir + "/tmp"; + FILE * tmpFile = fopen(tmpFileName.c_str(), "wb"); + fwrite(kmerBuffer.buffer, sizeof(uint16_t), kmerBuffer.startIndexOfReserve, tmpFile); + fclose(tmpFile); + reduceRedundancy(kmerBuffer, uniqKmerIdx, uniqKmerCnt, par); time_t reduction = time(nullptr); cout<<"Time spent for reducing redundancy: "<<(double) (reduction - sort) << endl; @@ -157,7 +172,51 @@ void IndexCreator::updateIndex(const LocalParameters &par) { delete[] splitChecker; } -void IndexCreator::makeBlocksForParallelProcessing(){ + +void IndexCreator::makeBlocksForParallelProcessing() { + unordered_map acc2taxid; + load_accession2taxid(acc2taxidFileName, acc2taxid); + + // Make blocks of sequences that can be processed in parallel + int fileNum = getNumberOfLines(fnaListFileName); + fastaList.resize(fileNum); + + ifstream fnaListFile; + fnaListFile.open(fnaListFileName); + if (!fnaListFile.is_open()) { + Debug(Debug::ERROR) << "Cannot open file for file list" << "\n"; + EXIT(EXIT_FAILURE); + } + string eachFile; + string seqHeader; + + unordered_map foundAcc2taxid; + for (int i = 0; i < fileNum; ++i) { + // Get start and end position of each sequence in the file + getline(fnaListFile, eachFile); + fastaList[i].path = eachFile; + processedSeqCnt.push_back(taxIdList.size()); + seqHeader = getSeqSegmentsWithHead(fastaList[i].sequences, eachFile, acc2taxid, foundAcc2taxid); + seqHeader = seqHeader.substr(1, seqHeader.find('.') - 1); + TaxID speciesTaxid = taxonomy->getTaxIdAtRank(acc2taxid[seqHeader], "species"); + + // Split current file into blocks for parallel processing + splitFastaForProdigalTraining(i, speciesTaxid); + fastaList[i].speciesID = speciesTaxid; + } + fnaListFile.close(); + + // Write accession to taxid map to file + string acc2taxidFileName2 = dbDir + "/acc2taxid.map"; + FILE * acc2taxidFile = fopen(acc2taxidFileName2.c_str(), "w"); + for (auto it = foundAcc2taxid.begin(); it != foundAcc2taxid.end(); ++it) { + fprintf(acc2taxidFile, "%s\t%d\n", it->first.c_str(), it->second); + } + fclose(acc2taxidFile); + +} + +void IndexCreator::makeBlocksForParallelProcessing_accession_level() { unordered_map acc2taxid; TaxID maxTaxID = load_accession2taxid(acc2taxidFileName, acc2taxid); @@ -187,7 +246,6 @@ void IndexCreator::makeBlocksForParallelProcessing(){ fastaList[i].path = eachFile; processedSeqCnt.push_back(taxIdList.size()); - seqHeader = getSeqSegmentsWithHead(fastaList[i].sequences, eachFile, acc2taxid, newAcc2taxid); // accession_version = seqHeader.substr(1, seqHeader.find('.') - 1); accession = seqHeader.substr(1, seqHeader.find('.') - 1); @@ -453,7 +511,7 @@ void IndexCreator::reduceRedundancy(TargetKmerBuffer & kmerBuffer, 
size_t * uniq idxOfEachSplit[i] = new size_t[splits[i].end - splits[i].offset + 2]; cntOfEachSplit[i] = 0; } -#pragma omp parallel default(none), shared(kmerBuffer, idxOfEachSplit, cntOfEachSplit, splits) +#pragma omp parallel default(none), shared(kmerBuffer, idxOfEachSplit, cntOfEachSplit, splits, par) { TargetKmer * lookingKmer; size_t lookingIndex; @@ -474,8 +532,12 @@ void IndexCreator::reduceRedundancy(TargetKmerBuffer & kmerBuffer, size_t * uniq break; } taxIds.push_back(taxIdList[kmerBuffer.buffer[i].info.sequenceID]); - hasSeenOtherStrains += (taxonomy->taxonNode(taxIdList[lookingKmer->info.sequenceID])->parentTaxId + if (par.accessionLevel) { + hasSeenOtherStrains += (taxonomy->taxonNode(taxIdList[lookingKmer->info.sequenceID])->parentTaxId != taxonomy->taxonNode(taxIdList[kmerBuffer.buffer[i].info.sequenceID]) -> parentTaxId); + } else { + hasSeenOtherStrains += (taxIdList[lookingKmer->info.sequenceID] != taxIdList[kmerBuffer.buffer[i].info.sequenceID]); + } i++; if(i == splits[split].end + 1){ endFlag = 1; @@ -583,9 +645,30 @@ int IndexCreator::getNumOfFlush() } inline bool IndexCreator::compareForDiffIdx(const TargetKmer & a, const TargetKmer & b){ - return a.ADkmer < b.ADkmer || (a.ADkmer == b.ADkmer && a.taxIdAtRank < b.taxIdAtRank); + if (a.ADkmer != b.ADkmer) { + return a.ADkmer < b.ADkmer; + } + return a.taxIdAtRank < b.taxIdAtRank; } +inline bool IndexCreator::compareForDiffIdx2(const TargetKmer & a, const TargetKmer & b){ + if (a.ADkmer != b.ADkmer) { + return a.ADkmer < b.ADkmer; + } + + if (a.taxIdAtRank != b.taxIdAtRank) { + return a.taxIdAtRank < b.taxIdAtRank; + } + + if (a.info.sequenceID != b.info.sequenceID) { + return a.info.sequenceID < b.info.sequenceID; + } + + return a.info.redundancy < b.info.redundancy; +} + + + void IndexCreator::splitSequenceFile(vector & seqSegments, MmapedData seqFile) { size_t start = 0; size_t numOfChar = seqFile.fileSize / sizeof(char); @@ -598,7 +681,8 @@ void IndexCreator::splitSequenceFile(vector & seqSegments, Mmaped seqSegments.emplace_back(start, numOfChar - 2, numOfChar - start - 1); } -string IndexCreator::getSeqSegmentsWithHead(vector & seqSegments, const string & seqFileName, +string IndexCreator::getSeqSegmentsWithHead(vector & seqSegments, + const string & seqFileName, const unordered_map & acc2taxid, vector>> & newAcc2taxid) { struct stat stat1{}; @@ -646,6 +730,46 @@ string IndexCreator::getSeqSegmentsWithHead(vector & seqSegments, return firstLine; } +string IndexCreator::getSeqSegmentsWithHead(vector & seqSegments, + const string & seqFileName, + const unordered_map & acc2taxid, + unordered_map & foundAcc2taxid) { + struct stat stat1{}; + stat(seqFileName.c_str(), &stat1); + size_t numOfChar = stat1.st_size; + string firstLine; + ifstream seqFile; + seqFile.open(seqFileName); + string eachLine; + size_t start = 0; + size_t pos; + vector seqSegmentsTmp; + vector headers; + size_t seqCnt = taxIdList.size(); + if (seqFile.is_open()) { + getline(seqFile, firstLine, '\n'); +// cout << firstLine << endl; + taxIdList.push_back(acc2taxid.at(firstLine.substr(1, firstLine.find('.') - 1))); + foundAcc2taxid[firstLine.substr(1, firstLine.find(' ') - 1)] = taxIdList.back(); + while (getline(seqFile, eachLine, '\n')) { + if (eachLine[0] == '>') { +// cout << eachLine << endl; + taxIdList.push_back(acc2taxid.at(eachLine.substr(1, eachLine.find('.') - 1))); + foundAcc2taxid[eachLine.substr(1, eachLine.find(' ') - 1)] = taxIdList.back(); + pos = (size_t) seqFile.tellg(); + seqSegmentsTmp.emplace_back(start, pos - eachLine.length() - 
3,pos - eachLine.length() - start - 2); + start = pos - eachLine.length() - 1; + } + } + seqSegmentsTmp.emplace_back(start, numOfChar - 2, numOfChar - start - 1, seqCnt); + } else { + cerr << "Unable to open file: " << seqFileName << endl; + } + seqFile.close(); + seqSegments = std::move(seqSegmentsTmp); + return firstLine; +} + void IndexCreator::getSeqSegmentsWithHead(vector & seqSegments, const char * seqFileName) { struct stat stat1{}; stat(seqFileName, &stat1); @@ -701,7 +825,7 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer, #pragma omp parallel default(none), shared(kmerBuffer, checker, processedSplitCnt, hasOverflow, par, cout) { ProbabilityMatrix probMatrix(*subMat); - ProdigalWrapper prodigal; + // ProdigalWrapper prodigal; SeqIterator seqIterator(par); size_t posToWrite; size_t orfNum; @@ -727,10 +851,12 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer, for (size_t p = 0; p < fnaSplits[i].cnt; p++) { totalLength += fastaList[fnaSplits[i].file_idx].sequences[fnaSplits[i].offset + p].length; } - size_t estimatedKmerCnt = (totalLength + totalLength / 1000) / 3; + + size_t estimatedKmerCnt = (totalLength + totalLength / 10) / 3; // Process current split if buffer has enough space. posToWrite = kmerBuffer.reserveMemory(estimatedKmerCnt); + ProdigalWrapper * prodigal = new ProdigalWrapper(); if (posToWrite + estimatedKmerCnt < kmerBuffer.bufferSize) { // MMap FASTA file of current split struct MmapedData fastaFile = mmapData(fastaList[fnaSplits[i].file_idx].path.c_str()); @@ -742,16 +868,15 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer, lengthOfTrainingSeq = seq->seq.l; cout << "T: " << seq->name.s << " " << lengthOfTrainingSeq << " " << estimatedKmerCnt << endl; - // Train prodigal. - prodigal.is_meta = 0; + // Train prodigal + prodigal->is_meta = 0; if (lengthOfTrainingSeq < 100'000) { - prodigal.is_meta = 1; - prodigal.trainMeta(seq->seq.s); + prodigal->is_meta = 1; + prodigal->trainMeta(seq->seq.s); } else { - prodigal.trainASpecies(seq->seq.s); + prodigal->trainASpecies(seq->seq.s); } - // // Load training information // int read_check = read_training_file(const_cast((par.tinfoPath + to_string(fnaSplits[i].speciesID) + ".tinfo").c_str()), // prodigal.getTrainingInfo()); @@ -761,9 +886,9 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer, // } // Generate intergenic 23-mer list. It is used to determine extension direction of intergenic sequences. 
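                    // (These intergenic 23-mers are consumed by getExtendedORFs()
                    // below, which uses them to decide the direction in which
                    // predicted genes are extended into intergenic regions.)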
- prodigal.getPredictedGenes(seq->seq.s); - seqIterator.generateIntergenicKmerList(prodigal.genes, prodigal.nodes, - prodigal.getNumberOfPredictedGenes(), + prodigal->getPredictedGenes(seq->seq.s); + seqIterator.generateIntergenicKmerList(prodigal->genes, prodigal->nodes, + prodigal->getNumberOfPredictedGenes(), intergenicKmers,seq->seq.s); // Get min k-mer hash list for determining strandness @@ -786,10 +911,10 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer, if (seqIterator.compareMinHashList(standardList, currentList, lengthOfTrainingSeq, // Forward strlen(seq->seq.s))) { // Get extended ORFs - prodigal.getPredictedGenes(seq->seq.s); - prodigal.removeCompletelyOverlappingGenes(); - seqIterator.getExtendedORFs(prodigal.finalGenes, prodigal.nodes, extendedORFs, - prodigal.fng, strlen(seq->seq.s), + prodigal->getPredictedGenes(seq->seq.s); + prodigal->removeCompletelyOverlappingGenes(); + seqIterator.getExtendedORFs(prodigal->finalGenes, prodigal->nodes, extendedORFs, + prodigal->fng, strlen(seq->seq.s), orfNum, intergenicKmers, seq->seq.s); // Get masked sequence char *maskedSeq = nullptr; @@ -821,10 +946,10 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer, reverseCompliment = seqIterator.reverseCompliment(seq->seq.s, seq->seq.l); // Get extended ORFs - prodigal.getPredictedGenes(reverseCompliment); - prodigal.removeCompletelyOverlappingGenes(); - seqIterator.getExtendedORFs(prodigal.finalGenes, prodigal.nodes, extendedORFs, - prodigal.fng, strlen(reverseCompliment), + prodigal->getPredictedGenes(reverseCompliment); + prodigal->removeCompletelyOverlappingGenes(); + seqIterator.getExtendedORFs(prodigal->finalGenes, prodigal->nodes, extendedORFs, + prodigal->fng, strlen(reverseCompliment), orfNum, intergenicKmers, reverseCompliment); // Get masked sequence @@ -868,6 +993,9 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer, __sync_fetch_and_add(&hasOverflow, 1); __sync_fetch_and_sub(&kmerBuffer.startIndexOfReserve, estimatedKmerCnt); } + cout << totalLength << " " << prodigal->fng << endl; + delete prodigal; + } } } @@ -896,6 +1024,7 @@ void IndexCreator::writeDbParameters() { } fprintf(handle, "Reduced_alphabet\t%d\n", reducedAA); fprintf(handle, "Spaced_kmer_mask\t%s\n", spaceMask.c_str()); + fprintf(handle, "Accession_level\t%d\n", accessionLevel); fclose(handle); } diff --git a/src/commons/IndexCreator.h b/src/commons/IndexCreator.h index de31e394..bee95f8f 100644 --- a/src/commons/IndexCreator.h +++ b/src/commons/IndexCreator.h @@ -45,6 +45,7 @@ class IndexCreator{ size_t bufferSize; int reducedAA; string spaceMask; + int accessionLevel; // Inputs NcbiTaxonomy * taxonomy; @@ -112,6 +113,8 @@ class IndexCreator{ void writeDbParameters(); static bool compareForDiffIdx(const TargetKmer & a, const TargetKmer & b); + + static bool compareForDiffIdx2(const TargetKmer & a, const TargetKmer & b); // void maskLowComplexityRegions(char * seq, char * maskedSeq, ProbabilityMatrix & probMat, // const LocalParameters & par); @@ -123,6 +126,8 @@ class IndexCreator{ void makeBlocksForParallelProcessing(); + void makeBlocksForParallelProcessing_accession_level(); + void splitFastaForProdigalTraining(int file_idx, TaxID speciesID); void unzipAndList(const string & folder, const string & fastaList_fname){ @@ -160,6 +165,11 @@ class IndexCreator{ const unordered_map & acc2taxid, vector>> & newAcc2taxid); + string getSeqSegmentsWithHead(vector & seqSegments, + const string & seqFileName, + const unordered_map & acc2taxid, + 
unordered_map & foundAcc2taxid); + static void getSeqSegmentsWithHead(vector & seqSegments, const char * seqFileName); IndexCreator(const LocalParameters & par); IndexCreator() {taxonomy = nullptr;} diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index c5985ccf..f38cd6a1 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -174,6 +174,13 @@ LocalParameters::LocalParameters() : typeid(size_t), (void *) &bufferSize, "^[0-9]+$"), + ACCESSION_LEVEL(ACCESSION_LEVEL_ID, + "--accession-level", + "Build a database for accession level classification", + "Build a database for accession level classification", + typeid(int), + (void *) &accessionLevel, + "[0-1]"), TEST_RANK(TEST_RANK_ID, "--test-rank", ".", @@ -249,6 +256,7 @@ LocalParameters::LocalParameters() : build.push_back(&PARAM_MASK_PROBABILTY); build.push_back(&PARAM_MASK_RESIDUES); build.push_back(&BUFFER_SIZE); + build.push_back(&ACCESSION_LEVEL); //classify classify.push_back(&PARAM_THREADS); @@ -271,6 +279,7 @@ LocalParameters::LocalParameters() : classify.push_back(&PARAM_MASK_RESIDUES); classify.push_back(&PARAM_MASK_PROBABILTY); classify.push_back(&MATCH_PER_KMER); + classify.push_back(&ACCESSION_LEVEL); // filter filter.push_back(&PARAM_THREADS); @@ -295,7 +304,8 @@ LocalParameters::LocalParameters() : filter.push_back(&MATCH_PER_KMER); filter.push_back(&PRINT_MODE); filter.push_back(&CONTAM_LIST); - + filter.push_back(&ACCESSION_LEVEL); + //updateTargetDB exclusiontest_hiv.push_back(&TEST_RANK); diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index 2a92115a..2d75912a 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -62,6 +62,7 @@ class LocalParameters : public Parameters { PARAMETER(IS_ASSEMBLY) PARAMETER(SPLIT_NUM) PARAMETER(BUFFER_SIZE) + PARAMETER(ACCESSION_LEVEL) // Test parameters PARAMETER(TEST_RANK) @@ -105,6 +106,7 @@ class LocalParameters : public Parameters { std::string taxonomyPath; int splitNum; size_t bufferSize; + int accessionLevel; // Test parameters std::string testRank; diff --git a/src/commons/ProdigalWrapper.cpp b/src/commons/ProdigalWrapper.cpp index 901eb276..7b6e39d2 100644 --- a/src/commons/ProdigalWrapper.cpp +++ b/src/commons/ProdigalWrapper.cpp @@ -52,28 +52,14 @@ ProdigalWrapper::ProdigalWrapper() { void ProdigalWrapper:: trainASpecies(char * genome){ - memset(seq, 0, (slen / 4 + 1) * sizeof(unsigned char)); - memset(rseq, 0, (slen / 4 + 1) * sizeof(unsigned char)); - memset(useq, 0, (slen / 8 + 1) * sizeof(unsigned char)); - memset(nodes, 0, nn * sizeof(struct _node)); + // Initialize training information + memset(mlist, 0, MAX_MASKS*sizeof(mask)); memset(&tinf, 0, sizeof(struct _training)); - nn = 0; slen = 0; ipath = 0; nmask = 0; tinf.st_wt = 4.35; tinf.trans_table = 11; slen = getNextSeq(genome, 1); -// if(slen == 0) { -// fprintf(stderr, "\n\nSequence read failed (file must be Fasta, "); -// fprintf(stderr, "Genbank, or EMBL format).\n\n"); -// exit(9); -// } -// if(slen < IDEAL_SINGLE_GENOME) { -// fprintf(stderr, "\n\nWarning: ideally Prodigal should be given at"); -// fprintf(stderr, " least %d bases for ", IDEAL_SINGLE_GENOME); -// fprintf(stderr, "training.\nYou may get better results with the "); -// fprintf(stderr, "-p meta option.\n\n"); -// } rcom_seq(seq, rseq, useq, slen); /*********************************************************************** @@ -82,10 +68,11 @@ trainASpecies(char * genome){ ***********************************************************************/ 
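  /* Note: realloc() does not zero the newly grown region, and this node buffer
     is reused across genomes, so it is cleared with memset() right after
     growth; otherwise stale node data from a previously processed genome could
     make training results depend on the processing order. */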
if(slen > max_slen && slen > STT_NOD*8) { nodes = (struct _node *)realloc(nodes, (int)(slen/8)*sizeof(struct _node)); -// if(nodes == NULL) { -// fprintf(stderr, "Realloc failed on nodes\n\n"); -// exit(11); -// } + if(nodes == NULL) { + fprintf(stderr, "Realloc failed on nodes\n\n"); + exit(11); + } + memset(nodes, 0, (int)(slen/8)*sizeof(struct _node)); max_slen = slen; } nn = add_nodes(seq, rseq, slen, nodes, closed, mlist, nmask, &tinf); @@ -130,18 +117,21 @@ trainASpecies(char * genome){ determine_sd_usage(&tinf); if(force_nonsd == 1) tinf.uses_sd = 0; if(tinf.uses_sd == 0) train_starts_nonsd(seq, rseq, slen, nodes, nn, &tinf); -} -void ProdigalWrapper::trainMeta(char *genome) { + // Initialize memories to reuse them memset(seq, 0, (slen / 4 + 1) * sizeof(unsigned char)); memset(rseq, 0, (slen / 4 + 1) * sizeof(unsigned char)); memset(useq, 0, (slen / 8 + 1) * sizeof(unsigned char)); - memset(nodes, 0, nn*sizeof(struct _node)); + memset(nodes, 0, nn * sizeof(struct _node)); + nn = 0; slen = 0; ipath = 0; nmask = 0; +} + +void ProdigalWrapper::trainMeta(char *genome) { + // Initialize training information memset(&tinf, 0, sizeof(struct _training)); tinf.st_wt = 4.35; tinf.trans_table = 11; - nn = 0; slen = 0; ipath = 0; nmask = 0; - + initialize_metagenomic_bins(meta); slen = getNextSeq(genome, 1); @@ -154,6 +144,7 @@ void ProdigalWrapper::trainMeta(char *genome) { fprintf(stderr, "Realloc failed on nodes\n\n"); exit(11); } + memset(nodes, 0, (int)(slen/8)*sizeof(struct _node)); max_slen = slen; } @@ -182,13 +173,15 @@ void ProdigalWrapper::trainMeta(char *genome) { max_score = nodes[ipath].score; } } -} -void ProdigalWrapper::getPredictedGenes(char * genome){ + + // Initialize memories to reuse them memset(seq, 0, (slen / 4 + 1) * sizeof(unsigned char)); memset(rseq, 0, (slen / 4 + 1) * sizeof(unsigned char)); memset(useq, 0, (slen / 8 + 1) * sizeof(unsigned char)); - memset(nodes, 0, nn*sizeof(struct _node)); - nn = 0; slen = 0; nmask = 0; ipath=0; + memset(nodes, 0, nn * sizeof(struct _node)); + nn = 0; slen = 0; ipath = 0; nmask = 0; +} +void ProdigalWrapper::getPredictedGenes(char * genome){ /* Initialize structure */ slen = getNextSeq(genome, 0); @@ -211,7 +204,6 @@ void ProdigalWrapper::getPredictedGenes(char * genome){ } if(is_meta == 0) { - ipath = 0; /*********************************************************************** Find all the potential starts and stops, sort them, and create comprehensive list of nodes for dynamic programming. @@ -235,9 +227,8 @@ void ProdigalWrapper::getPredictedGenes(char * genome){ } else{ - /// metagenomic version - fprintf(stderr, "Request: Metagenomic, Phase: Gene Finding\n"); - + /// Metagenomic version + nn = add_nodes(seq, rseq, slen, nodes, closed, mlist, nmask, meta[max_phase].tinf); qsort(nodes, nn, sizeof(struct _node), &compare_nodes); @@ -250,7 +241,13 @@ void ProdigalWrapper::getPredictedGenes(char * genome){ tweak_final_starts(genes, ng, nodes, nn, meta[max_phase].tinf); record_gene_data(genes, ng, nodes, meta[max_phase].tinf, num_seq); } -// fprintf(stderr, "done! 
gene count: %d (%d bp)\n", ng, slen); + + // Initialize memories to reuse them + memset(seq, 0, (slen / 4 + 1) * sizeof(unsigned char)); + memset(rseq, 0, (slen / 4 + 1) * sizeof(unsigned char)); + memset(useq, 0, (slen / 8 + 1) * sizeof(unsigned char)); + memset(nodes, 0, nn*sizeof(struct _node)); + nn = 0; slen = 0; nmask = 0; ipath=0; } int ProdigalWrapper::getNextSeq(char * line, int training) { diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index 58ee0722..1118f618 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -220,7 +220,6 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, queryList[currentQuery].taxCnt[genusMatches[i].targetId]++; } - // Store classification results queryList[currentQuery].isClassified = true; queryList[currentQuery].classification = result; diff --git a/src/commons/common.cpp b/src/commons/common.cpp index 92d7c735..f5dde1cd 100644 --- a/src/commons/common.cpp +++ b/src/commons/common.cpp @@ -88,6 +88,11 @@ int loadDbParameters(LocalParameters &par) { par.reducedAA = stoi(tokens[1]); } else if (tokens[0] == "Spaced_kmer_mask") { par.spaceMask = tokens[1]; + } else if (tokens[0] == "Accession_level") { + if (tokens[1] == "0" && par.accessionLevel == 1){ + par.accessionLevel = 0; + cerr << "Warning: Current DB doesn't support accession-level classification." << endl; + } } } return 1; From 8460b558bc45bdfd72b0a7099816fa6190f40174 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Tue, 29 Aug 2023 17:14:28 +0900 Subject: [PATCH 23/65] activate --accession-level option for 'classify' --- src/commons/IndexCreator.cpp | 9 ++------- src/commons/Taxonomer.cpp | 38 +++++++++++++++++++++++++++++++++++- src/commons/Taxonomer.h | 1 + src/commons/common.cpp | 3 +++ src/workflow/build.cpp | 1 + src/workflow/classify.cpp | 1 + src/workflow/filter.cpp | 1 + 7 files changed, 46 insertions(+), 8 deletions(-) diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp index c8f8df7a..ab4baf9a 100644 --- a/src/commons/IndexCreator.cpp +++ b/src/commons/IndexCreator.cpp @@ -94,18 +94,13 @@ void IndexCreator::createIndex(const LocalParameters &par) { // Sort the k-mers time_t start = time(nullptr); SORT_PARALLEL(kmerBuffer.buffer, kmerBuffer.buffer + kmerBuffer.startIndexOfReserve, - IndexCreator::compareForDiffIdx2); + IndexCreator::compareForDiffIdx); time_t sort = time(nullptr); cout << "Sort time: " << sort - start << endl; auto * uniqKmerIdx = new size_t[kmerBuffer.startIndexOfReserve + 1]; size_t uniqKmerCnt = 0; - // Print out the k-mers - string tmpFileName = dbDir + "/tmp"; - FILE * tmpFile = fopen(tmpFileName.c_str(), "wb"); - fwrite(kmerBuffer.buffer, sizeof(uint16_t), kmerBuffer.startIndexOfReserve, tmpFile); - fclose(tmpFile); - + // Reduce redundancy reduceRedundancy(kmerBuffer, uniqKmerIdx, uniqKmerCnt, par); time_t reduction = time(nullptr); cout<<"Time spent for reducing redundancy: "<<(double) (reduction - sort) << endl; diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index 1118f618..ad5505ba 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -1,4 +1,6 @@ #include "Taxonomer.h" +#include "NcbiTaxonomy.h" +#include Taxonomer::Taxonomer(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxonomy(taxonomy) { @@ -15,6 +17,7 @@ Taxonomer::Taxonomer(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxon delete[] mask; maxGap = par.maxGap; minCoveredPos = par.minCoveredPos; + accessionLevel = par.accessionLevel; } Taxonomer::~Taxonomer() { @@ -262,13 +265,42 @@ 
TaxID Taxonomer::lowerRankClassification(vector &matches, pair } i--; } + // if (accessionLevel == 2) { + // if (taxonomy->taxonNode(minHammingTaxId).) { + // minHammingTaxId = taxonomy->taxonNode(minHammingTaxId)->parentTaxId; + // } + // } taxCnt[minHammingTaxId]++; } unordered_map cladeCnt; getSpeciesCladeCounts(taxCnt, cladeCnt, spTaxId); - return BFS(cladeCnt, spTaxId); + if (accessionLevel == 2) { + unordered_map trimmedCladeCnt; + // Remove leaf nodes + for (auto it = cladeCnt.begin(); it != cladeCnt.end(); it++) { + TaxonNode const * taxon = taxonomy->taxonNode(it->first); + if (strcmp(taxonomy->getString(taxon->rankIdx), "") == 0) { + // trimmedCladeCnt[it->first] = it->second; + cladeCnt[taxon->parentTaxId].children.clear(); + } + // if (strcmp(taxonomy->getString(taxonomy->taxonNode(it->first)->rankIdx), "") != 0) { + // trimmedCladeCnt[it->first] = it->second; + // } else { + // cout << it->first << endl; + // } + + // if (!it->second.children.empty() || it->first == spTaxId) { + // trimmedCladeCnt[it->first] = it->second; + // } else if (it->second.children.empty()) { + // cout << it->first << endl; + // } + } + return BFS(cladeCnt, spTaxId); + } else { + return BFS(cladeCnt, spTaxId); + } } void Taxonomer::getSpeciesCladeCounts(const unordered_map &taxCnt, @@ -296,6 +328,10 @@ TaxID Taxonomer::BFS(const unordered_map & cladeCnt, TaxID r if (cladeCnt.at(root).children.empty()) { // root is a leaf return root; } + if (cladeCnt.find(cladeCnt.at(root).children[0]) == cladeCnt.end()) { // its children are trimmed + // cout << cladeCnt.at(root).children[0] << endl; + return root; + } unsigned int maxCnt = 3; unsigned int currentCnt; vector bestChildren; diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h index d7de78ac..6c597dd2 100644 --- a/src/commons/Taxonomer.h +++ b/src/commons/Taxonomer.h @@ -30,6 +30,7 @@ class Taxonomer { // Parameters int maxGap; int minCoveredPos; + int accessionLevel; struct MatchBlock { MatchBlock(size_t start, size_t end, int id) : start(start), end(end), id(id) {} diff --git a/src/commons/common.cpp b/src/commons/common.cpp index f5dde1cd..9f54ac75 100644 --- a/src/commons/common.cpp +++ b/src/commons/common.cpp @@ -93,6 +93,9 @@ int loadDbParameters(LocalParameters &par) { par.accessionLevel = 0; cerr << "Warning: Current DB doesn't support accession-level classification." 
<< endl;
             }
+            if (tokens[1] == "1" && par.accessionLevel == 0){
+                par.accessionLevel = 2;
+            }
         }
     }
     return 1;
diff --git a/src/workflow/build.cpp b/src/workflow/build.cpp
index b5f9e571..eaf0c367 100644
--- a/src/workflow/build.cpp
+++ b/src/workflow/build.cpp
@@ -12,6 +12,7 @@ void setDefaults_build(LocalParameters & par){
     par.maskProb = 0.9;
     par.maskMode = 1;
     par.bufferSize = 1'000'000'000;
+    par.accessionLevel = 0;
 }

 int build(int argc, const char **argv, const Command &command){
diff --git a/src/workflow/classify.cpp b/src/workflow/classify.cpp
index 514f6ae8..91977368 100644
--- a/src/workflow/classify.cpp
+++ b/src/workflow/classify.cpp
@@ -24,6 +24,7 @@ void setClassifyDefaults(LocalParameters & par){
     par.maskMode = 0;
     par.maskProb = 0.9;
     par.matchPerKmer = 4;
+    par.accessionLevel = 0;
 }

 int classify(int argc, const char **argv, const Command& command)
diff --git a/src/workflow/filter.cpp b/src/workflow/filter.cpp
index 3d0b0448..79bac439 100644
--- a/src/workflow/filter.cpp
+++ b/src/workflow/filter.cpp
@@ -23,6 +23,7 @@ void setFilterDefaults(LocalParameters & par) {
     par.matchPerKmer = 4;
     par.printMode = 1;
     par.contamList = ""; // TODO: set default
+    par.accessionLevel = 0;
 }

 int filter(int argc, const char **argv, const Command& command) {

From 50768b506d909f3e8316e9f49615221de91aa0ef Mon Sep 17 00:00:00 2001
From: Jaebeom Kim
Date: Thu, 31 Aug 2023 13:51:50 +0900
Subject: [PATCH 24/65] accession-level DB + turning off accession-level in classify == not accession-level DB

---
 src/commons/Taxonomer.cpp | 30 +++++-------------------------
 1 file changed, 5 insertions(+), 25 deletions(-)
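A minimal sketch of the combined effect of this commit and the previous one, assuming the flag plumbing shown in the diffs (the helper name is hypothetical; the decision itself mirrors loadDbParameters() in common.cpp and the accessionLevel checks in Taxonomer.cpp):

```cpp
// dbLevel  : "Accession_level" entry read from db.parameters
// reqLevel : value of --accession-level passed by the user
int effectiveAccessionLevel(int dbLevel, int reqLevel) {
    if (dbLevel == 0 && reqLevel == 1) return 0; // DB lacks accession info; warn and disable
    if (dbLevel == 1 && reqLevel == 0) return 2; // accession-level DB with the feature off:
                                                 // accession leaves are trimmed before LCA
    return reqLevel;                             // otherwise use the requested mode as-is
}
```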
diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp
index ad5505ba..663ebba4 100644
--- a/src/commons/Taxonomer.cpp
+++ b/src/commons/Taxonomer.cpp
@@ -265,37 +265,22 @@ TaxID Taxonomer::lowerRankClassification(vector &matches, pair
         }
         i--;
     }
-        // if (accessionLevel == 2) {
-        //     if (taxonomy->taxonNode(minHammingTaxId).) {
-        //         minHammingTaxId = taxonomy->taxonNode(minHammingTaxId)->parentTaxId;
-        //     }
-        // }
         taxCnt[minHammingTaxId]++;
     }
     unordered_map cladeCnt;
     getSpeciesCladeCounts(taxCnt, cladeCnt, spTaxId);
-    if (accessionLevel == 2) {
-        unordered_map trimmedCladeCnt;
+    if (accessionLevel == 2) { // Don't do accession-level classification
         // Remove leaf nodes
         for (auto it = cladeCnt.begin(); it != cladeCnt.end(); it++) {
             TaxonNode const * taxon = taxonomy->taxonNode(it->first);
             if (strcmp(taxonomy->getString(taxon->rankIdx), "") == 0) {
-                // trimmedCladeCnt[it->first] = it->second;
-                cladeCnt[taxon->parentTaxId].children.clear();
+                // Remove current node from its parent's children list
+                cladeCnt[taxon->parentTaxId].children.erase(find(cladeCnt[taxon->parentTaxId].children.begin(),
+                                                                 cladeCnt[taxon->parentTaxId].children.end(),
+                                                                 it->first));
             }
-            // if (strcmp(taxonomy->getString(taxonomy->taxonNode(it->first)->rankIdx), "") != 0) {
-            //     trimmedCladeCnt[it->first] = it->second;
-            // } else {
-            //     cout << it->first << endl;
-            // }
-
-            // if (!it->second.children.empty() || it->first == spTaxId) {
-            //     trimmedCladeCnt[it->first] = it->second;
-            // } else if (it->second.children.empty()) {
-            //     cout << it->first << endl;
-            // }
         }
         return BFS(cladeCnt, spTaxId);
     } else {
         return BFS(cladeCnt, spTaxId);
@@ -328,10 +313,6 @@ TaxID Taxonomer::BFS(const unordered_map & cladeCnt, TaxID r
     if (cladeCnt.at(root).children.empty()) { // root is a leaf
         return root;
     }
-    if (cladeCnt.find(cladeCnt.at(root).children[0]) == cladeCnt.end()) { // its children are trimmed
-        // cout << cladeCnt.at(root).children[0] << endl;
-        return root;
-    }
     unsigned int maxCnt = 3;
     unsigned int currentCnt;
     vector bestChildren;
@@ -1025,7 +1006,6 @@ TaxonScore Taxonomer::chooseSpecies(const vector &matches,
     // Score each species
     std::unordered_map speciesScores;
-    size_t i = 0;
     TaxID currentSpeices;
     size_t numOfMatch = matches.size();

From 6dda6d20ed5c84a65de1eec36fede000bd7556d4 Mon Sep 17 00:00:00 2001
From: Jaebeom Kim
Date: Fri, 8 Sep 2023 16:37:45 +0900
Subject: [PATCH 25/65] some minor code fixes

---
 src/commons/IndexCreator.cpp    |  8 +++++---
 src/workflow/add_to_library.cpp |  8 +++++---
 util/prepare_gtdb_taxonomy.sh   | 20 +++++++++++++-------
 3 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp
index ab4baf9a..aded0b45 100644
--- a/src/commons/IndexCreator.cpp
+++ b/src/commons/IndexCreator.cpp
@@ -340,11 +340,13 @@ TaxID IndexCreator::load_accession2taxid(const string & mappingFileName, unorder
     string eachLine;
     string eachItem;
     if (FILE * mappingFile = fopen(mappingFileName.c_str(), "r")) {
-        char buffer[512];
+        char accession[2048];
+        char accession_version[2048];
         int taxID;
         fscanf(mappingFile, "%*s\t%*s\t%*s\t%*s");
-        while (fscanf(mappingFile, "%s\t%*s\t%d\t%*d", buffer, &taxID) == 2 ){
-            acc2taxid[string(buffer)] = taxID;
+        while (fscanf(mappingFile, "%s\t%s\t%d\t%*d", accession, accession_version, &taxID) == 2 ){
+            acc2taxid[string(accession_version)] = taxID;
+            acc2taxid[string(accession)] = taxID;
             if (taxID > maxTaxID) {
                 maxTaxID = taxID;
             }
diff --git a/src/workflow/add_to_library.cpp b/src/workflow/add_to_library.cpp
index e5f97eba..8abb3ff2 100644
--- a/src/workflow/add_to_library.cpp
+++ b/src/workflow/add_to_library.cpp
@@ -56,11 +56,13 @@ int addToLibrary(int argc, const char **argv, const Command &command){
     unordered_map acc2taxid;
     string eachItem;
     if (FILE *mappingFile = fopen(mappingFileName.c_str(), "r")) {
-        char buffer[512];
+        char accession[2048];
+        char accession_version[2048];
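        // NCBI-style accession2taxid files are tab-separated with a header row:
        //   accession <tab> accession.version <tab> taxid <tab> gi
        // The header is skipped by the fscanf("%*s\t%*s\t%*s\t%*s") call below.
        // Note that fscanf() returns the number of items it assigns, so the
        // three-conversion format string used in these loops yields 3 on
        // success; the copy of this loop in IndexCreator.cpp is corrected from
        // "== 2" to "== 3" in a later commit of this series.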
int taxID; fscanf(mappingFile, "%*s\t%*s\t%*s\t%*s"); - while (fscanf(mappingFile, "%s\t%*s\t%d\t%*d", buffer, &taxID) == 2) { - acc2taxid[string(buffer)] = taxID; + while (fscanf(mappingFile, "%s\t%s\t%d\t%*d", accession, accession_version, &taxID) == 2) { + acc2taxid[string(accession_version)] = taxID; + acc2taxid[string(accession)] = taxID; } } else { cerr << "Cannot open file for mapping from accession to tax ID" << endl; diff --git a/util/prepare_gtdb_taxonomy.sh b/util/prepare_gtdb_taxonomy.sh index 855d8d74..4621fa49 100755 --- a/util/prepare_gtdb_taxonomy.sh +++ b/util/prepare_gtdb_taxonomy.sh @@ -2,10 +2,16 @@ # set output directory OUT=$1 +TAX_DIR="${OUT}/taxonomy" PWD=$(pwd) ar_gz="${PWD}/ar.tar.gz" bac_gz="${PWD}/bac.tar.gz" +# mkdir TAX_DIR if it doesn't exist +if [ ! -d "${TAX_DIR}" ]; then + mkdir -p "${TAX_DIR}" +fi + wget -O "${ar_gz}" https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/ar53_metadata.tar.gz wget -O "${bac_gz}" https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/bac120_metadata.tar.gz @@ -42,14 +48,14 @@ awk -F '\t' '$1 ~ /^(R|G)/{print $1,$55,$111}' "${ar_bac_meta_wTaxIDs}" | while fi done > "${map}" -mv "${ar_bac_meta_wTaxIDs}" "${OUT}" +mv "${ar_bac_meta_wTaxIDs}" "${TAX_DIR}" echo -e "\t|\t\t|" > "merged.dmp" -mv "merged.dmp" "${OUT}" -mv "names.dmp" "${OUT}" -mv "nodes.dmp" "${OUT}" -mv "delnodes.dmp" "${OUT}" -mv "${map}" "${OUT}" -mv "taxID_info.tsv" "${OUT}" +mv "merged.dmp" "${TAX_DIR}" +mv "names.dmp" "${TAX_DIR}" +mv "nodes.dmp" "${TAX_DIR}" +mv "delnodes.dmp" "${TAX_DIR}" +mv "${map}" "${TAX_DIR}" +mv "taxID_info.tsv" "${TAX_DIR}" #mv "${ar_bac_meta_wTaxIDs}" "../gtdb_taxdmp" From d3773453b642d99931432ee65743af3400ce4782 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Fri, 8 Sep 2023 20:10:18 +0900 Subject: [PATCH 26/65] fix minor error --- src/commons/IndexCreator.cpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp index aded0b45..e21e00a4 100644 --- a/src/commons/IndexCreator.cpp +++ b/src/commons/IndexCreator.cpp @@ -344,7 +344,7 @@ TaxID IndexCreator::load_accession2taxid(const string & mappingFileName, unorder char accession_version[2048]; int taxID; fscanf(mappingFile, "%*s\t%*s\t%*s\t%*s"); - while (fscanf(mappingFile, "%s\t%s\t%d\t%*d", accession, accession_version, &taxID) == 2 ){ + while (fscanf(mappingFile, "%s\t%s\t%d\t%*d", accession, accession_version, &taxID) == 3 ){ acc2taxid[string(accession_version)] = taxID; acc2taxid[string(accession)] = taxID; if (taxID > maxTaxID) { @@ -701,18 +701,13 @@ string IndexCreator::getSeqSegmentsWithHead(vector & seqSegments, accession_version = firstLine.substr(1, LocalUtil::getFirstWhiteSpacePos(firstLine) - 1); newAcc2taxid.emplace_back(accession_version, make_pair(acc2taxid.at(accession), newTaxID)); taxIdList.push_back(newTaxID++); -// cout << firstLine << endl; - // taxIdList.push_back(acc2taxid.at(firstLine.substr(1, firstLine.find('.') - 1))); - // foundAcc2taxid[firstLine.substr(1, firstLine.find(' ') - 1)] = taxIdList.back(); + while (getline(seqFile, eachLine, '\n')) { if (eachLine[0] == '>') { accession = eachLine.substr(1, eachLine.find('.') - 1); accession_version = eachLine.substr(1, LocalUtil::getFirstWhiteSpacePos(eachLine) - 1); newAcc2taxid.emplace_back(accession_version, make_pair(acc2taxid.at(accession), newTaxID)); taxIdList.push_back(newTaxID++); -// cout << eachLine << endl; - // taxIdList.push_back(acc2taxid.at(eachLine.substr(1, eachLine.find('.') - 1))); - // 
foundAcc2taxid[eachLine.substr(1, eachLine.find(' ') - 1)] = taxIdList.back(); pos = (size_t) seqFile.tellg(); seqSegmentsTmp.emplace_back(start, pos - eachLine.length() - 3,pos - eachLine.length() - start - 2); start = pos - eachLine.length() - 1; From 31e5a1b15c5b433fd1932699c241c2c23336c545 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Fri, 8 Sep 2023 20:57:36 +0900 Subject: [PATCH 27/65] properly use merged.dmp during building accession-level DB --- src/commons/IndexCreator.cpp | 39 ++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp index e21e00a4..ba967e6b 100644 --- a/src/commons/IndexCreator.cpp +++ b/src/commons/IndexCreator.cpp @@ -5,7 +5,9 @@ #include #include #include +#include #include +#include "NcbiTaxonomy.cpp" IndexCreator::IndexCreator(const LocalParameters & par) { // Parameters @@ -1021,6 +1023,26 @@ void IndexCreator::writeDbParameters() { } void IndexCreator::editTaxonomyDumpFiles(const vector>> & newAcc2taxid) { + // Load merged.dmp + string mergedFileName = taxonomyDir + "/merged.dmp"; + std::ifstream ss(mergedFileName); + if (ss.fail()) { + Debug(Debug::ERROR) << "File " << mergedFileName << " not found!\n"; + EXIT(EXIT_FAILURE); + } + + std::string line; + size_t count = 0; + unordered_map mergedMap; + while (std::getline(ss, line)) { + std::vector result = splitByDelimiter(line, "\t|\t", 2); + if (result.size() != 2) { + Debug(Debug::ERROR) << "Invalid name entry!\n"; + EXIT(EXIT_FAILURE); + } + mergedMap[atoi(result[0].c_str())] = atoi(result[1].c_str()); + } + // Edit names.dmp string nameFileName = taxonomyDir + "/names.dmp"; string newNameFileName = taxonomyDir + "/names.dmp.new"; @@ -1048,10 +1070,19 @@ void IndexCreator::editTaxonomyDumpFiles(const vectortaxonNode(newAcc2taxid[i].second.first)->taxId); + } + // Check if the parent taxon is merged + if (mergedMap.find(newAcc2taxid.back().second.first) != mergedMap.end()) { // merged + fprintf(nodeFile, "%d\t|\t%d\t|\t\t|\tscientific name\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|", newAcc2taxid.back().second.second, mergedMap[newAcc2taxid.back().second.first]); + } else { + fprintf(nodeFile, "%d\t|\t%d\t|\t\t|\tscientific name\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|", newAcc2taxid.back().second.second, newAcc2taxid.back().second.first); } - fprintf(nodeFile, "%d\t|\t%d\t|\t\t|\tscientific name\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|", newAcc2taxid.back().second.second, newAcc2taxid.back().second.first); fclose(nodeFile); - - // Edit node.dmp } \ No newline at end of file From e807ee34db5d4a9b51e17b20f3fb90c5f9fe15c5 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Sat, 9 Sep 2023 18:21:40 +0900 Subject: [PATCH 28/65] get maxTaxID properly --- src/commons/IndexCreator.cpp | 46 +++++++++++++++++++++++-- src/commons/IndexCreator.h | 2 ++ src/commons/KmerMatcher.cpp | 67 ++++++++++-------------------------- 3 files changed, 65 insertions(+), 50 deletions(-) diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp index ba967e6b..aaf50291 100644 --- a/src/commons/IndexCreator.cpp +++ b/src/commons/IndexCreator.cpp @@ -217,8 +217,8 @@ void IndexCreator::makeBlocksForParallelProcessing_accession_level() { unordered_map acc2taxid; TaxID maxTaxID = load_accession2taxid(acc2taxidFileName, acc2taxid); - newTaxID = maxTaxID + 1; - + newTaxID = std::max(getMaxTaxID() + 1, maxTaxID + 1); + vector>> newAcc2taxid; // accession.version -> (parent, newTaxID) // Make blocks of sequences that can be processed in 
parallel @@ -1085,4 +1085,46 @@ void IndexCreator::editTaxonomyDumpFiles(const vector result = splitByDelimiter(line, "\t|\t", 2); + if (result.size() != 2) { + Debug(Debug::ERROR) << "Invalid name entry!\n"; + EXIT(EXIT_FAILURE); + } + maxTaxID = std::max(maxTaxID, (TaxID) atoi(result[0].c_str())); + } + ss.close(); + + // Check names.dmp + string nameFileName = taxonomyDir + "/names.dmp"; + ss = std::ifstream(nameFileName); + if (ss.fail()) { + Debug(Debug::ERROR) << "File " << nameFileName << " not found!\n"; + EXIT(EXIT_FAILURE); + } + + while (std::getline(ss, line)) { + std::vector result = splitByDelimiter(line, "\t|\t", 2); + if (result.size() != 2) { + Debug(Debug::ERROR) << "Invalid name entry!\n"; + EXIT(EXIT_FAILURE); + } + maxTaxID = std::max(maxTaxID, (TaxID) atoi(result[0].c_str())); + } + ss.close(); + + return maxTaxID; } \ No newline at end of file diff --git a/src/commons/IndexCreator.h b/src/commons/IndexCreator.h index bee95f8f..1f5b7eb3 100644 --- a/src/commons/IndexCreator.h +++ b/src/commons/IndexCreator.h @@ -138,6 +138,8 @@ class IndexCreator{ static TaxID load_accession2taxid(const string & mappingFile, unordered_map & assacc2taxid); + TaxID getMaxTaxID(); + void editTaxonomyDumpFiles(const vector>> & newAcc2taxid); void reduceRedundancy(TargetKmerBuffer & kmerBuffer, size_t * uniqeKmerIdx, size_t & uniqKmerCnt, diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp index 39b9a13d..3c0f08f8 100644 --- a/src/commons/KmerMatcher.cpp +++ b/src/commons/KmerMatcher.cpp @@ -17,42 +17,6 @@ KmerMatcher::KmerMatcher(const LocalParameters & par, this->taxonomy = taxonomy; loadTaxIdList(par); - // // Load the taxonomy ID list - // FILE * taxIdFile; - // if((taxIdFile = fopen((dbDir + "/taxID_list").c_str(),"r")) == NULL){ - // std::cout<<"Cannot open the taxID list file."<taxonNode(taxId); - // if (taxId == taxon->taxId) { - // TaxID speciesTaxID = taxonomy->getTaxIdAtRank(taxId, "species"); - // TaxID genusTaxID = taxonomy->getTaxIdAtRank(taxId, "genus"); - // while (taxon->taxId != speciesTaxID) { - // taxId2speciesId[taxon->taxId] = speciesTaxID; - // taxId2genusId[taxon->taxId] = genusTaxID; - // taxon = taxonomy->taxonNode(taxon->parentTaxId); - // } - // taxId2speciesId[speciesTaxID] = speciesTaxID; - // taxId2genusId[speciesTaxID] = genusTaxID; - // } else { - // TaxID speciesTaxID = taxonomy->getTaxIdAtRank(taxId, "species"); - // TaxID genusTaxID = taxonomy->getTaxIdAtRank(taxId, "genus"); - // while (taxon->taxId != speciesTaxID) { - // taxId2speciesId[taxon->taxId] = speciesTaxID; - // taxId2genusId[taxon->taxId] = genusTaxID; - // taxon = taxonomy->taxonNode(taxon->parentTaxId); - // } - // taxId2speciesId[speciesTaxID] = speciesTaxID; - // taxId2genusId[speciesTaxID] = genusTaxID; - // taxId2speciesId[taxId] = speciesTaxID; - // taxId2genusId[taxId] = genusTaxID; - // } - // } - // fclose(taxIdFile); } @@ -160,6 +124,13 @@ int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer, MmapedData diffIdxSplits = mmapData(diffIdxSplitFileName.c_str(), 3); size_t numOfDiffIdx = FileUtil::getFileSize(targetDiffIdxFileName) / sizeof(uint16_t); + // // Print target k-mer information + // MmapedData targetKmerInfo2 = mmapData(targetInfoFileName.c_str(), 3); + // size_t numOfTargetKmer = targetKmerInfo2.fileSize / sizeof(TargetKmerInfo); + // for (size_t i = 0; i < numOfTargetKmer; i++) { + // cout << targetKmerInfo2.data[i].sequenceID << "\t" << (int) targetKmerInfo2.data[i].redundancy << endl; + // } + size_t queryKmerNum = 
queryKmerBuffer->startIndexOfReserve; QueryKmer *queryKmerList = queryKmerBuffer->buffer; @@ -329,10 +300,10 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI for (int k = 0; k < currMatchNum; k++) { idx = selectedMatches[k]; // Check if candidateKmerInfos[idx].sequenceID is valid - if (taxId2genusId.find(candidateKmerInfos[idx].sequenceID) == taxId2genusId.end() || - taxId2speciesId.find(candidateKmerInfos[idx].sequenceID) == taxId2speciesId.end()) { - cout << "Error: " << candidateKmerInfos[idx].sequenceID << " is not found in the taxonomy database." << endl; - } + // if (taxId2genusId.find(candidateKmerInfos[idx].sequenceID) == taxId2genusId.end() || + // taxId2speciesId.find(candidateKmerInfos[idx].sequenceID) == taxId2speciesId.end()) { + // cout << "Error: " << candidateKmerInfos[idx].sequenceID << " is not found in the taxonomy database." << endl; + // } matches[matchCnt] = {queryKmerList[j].info, candidateKmerInfos[idx].sequenceID, taxId2genusId[candidateKmerInfos[idx].sequenceID], @@ -371,10 +342,10 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI for (int k = 0; k < currMatchNum; k++) { idx = selectedMatches[k]; // Check if candidateKmerInfos[idx].sequenceID is valid - if (taxId2genusId.find(candidateKmerInfos[idx].sequenceID) == taxId2genusId.end() || - taxId2speciesId.find(candidateKmerInfos[idx].sequenceID) == taxId2speciesId.end()) { - cout << "Error: " << candidateKmerInfos[idx].sequenceID << " is not found in the taxonomy database." << endl; - } + // if (taxId2genusId.find(candidateKmerInfos[idx].sequenceID) == taxId2genusId.end() || + // taxId2speciesId.find(candidateKmerInfos[idx].sequenceID) == taxId2speciesId.end()) { + // cout << "Error: " << candidateKmerInfos[idx].sequenceID << " is not found in the taxonomy database." << endl; + // } matches[matchCnt] = {queryKmerList[j].info, candidateKmerInfos[idx].sequenceID, taxId2genusId[candidateKmerInfos[idx].sequenceID], @@ -468,10 +439,10 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI for (int k = 0; k < currMatchNum; k++) { idx = selectedMatches[k]; // Check if candidateKmerInfos[idx].sequenceID is valid - if (taxId2genusId.find(candidateKmerInfos[idx].sequenceID) == taxId2genusId.end() || - taxId2speciesId.find(candidateKmerInfos[idx].sequenceID) == taxId2speciesId.end()) { - cout << "Error: " << candidateKmerInfos[idx].sequenceID << " is not found in the taxonomy database." << endl; - } + // if (taxId2genusId.find(candidateKmerInfos[idx].sequenceID) == taxId2genusId.end() || + // taxId2speciesId.find(candidateKmerInfos[idx].sequenceID) == taxId2speciesId.end()) { + // cout << "Error: " << candidateKmerInfos[idx].sequenceID << " is not found in the taxonomy database." << endl; + // } matches[matchCnt] = {queryKmerList[j].info, candidateKmerInfos[idx].sequenceID, taxId2genusId[candidateKmerInfos[idx].sequenceID], From eb970ca0633c72fba498bb4820b9ab0fa8aa2cdb Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Mon, 11 Sep 2023 16:27:42 +0900 Subject: [PATCH 29/65] database-report for accession-level DB --- README.md | 33 ++++++++++++++++++++++------ src/util/database-report.cpp | 42 +++++++++++++++++++++++++----------- 2 files changed, 56 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 6765aa17..3e6e9170 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,14 @@ In addition, it can classify reads against a database of any size as long as it

+## Update notes
+### v1.0.2
+- `--accession-level` option for `build` and `classify` workflows: it reports not only the taxon but also the accession of the best match.
+- Fix minor bugs in `build` workflow.
+- Generate `taxonomyDB` during `build` and load it during `classify` workflow for faster loading of taxonomy information.
+- Support gzipped FASTA/FASTQ files in `add-to-library` and `classify` workflows.
+- Low-complexity regions are now masked by default in the `build` workflow (`--mask-prob 0.9`).
+
 ## Installation
 ### Precompiled binaries
 ```
@@ -79,8 +87,10 @@ metabuli classify --seq-mode 1 read.fna dbdir outdir jobid
   --min-score : The minimum score to be classified (0.15 for precision mode)
   --min-sp-score : The minimum score to be classified at or below species rank. (0.5 for precision mode)
   --taxonomy-path: Directory where the taxonomy dump files are stored. (DBDIR/taxonomy by default)
-  --reduced-aa : 0. Use 20 alphabets or 1. Use 15 alphabets to encode amino acids. Give the same value used for DB creation.
-  --spacing-mask : Binary patterend mask for spaced k-mer. The same mask must be used for DB creation and classification. A mask should contain at least eight '1's, and '0' means skip.
+  --reduced-aa : 0. Use 20 alphabets or 1. Use 15 alphabets to encode amino acids.
+                 Give the same value used for DB creation.
+  --accession-level : Set 1 to use accession level classification (0 by default).
+                      It is available when the DB is also built with accession level taxonomy.

   * Values of --min-score and --min-sp-score for precision mode are optimized only for short reads.
   * We don't recommend using them for long reads.
@@ -162,9 +172,12 @@ Accessions that are not included in the `` will be skipped and
 #### 3. Build
 ```
+# Get the list of absolute paths of files in your library
+find /library -name '*.fna' > library-files.txt
+
 metabuli build [options]
- DBDIR: The same DBDIR from the previous step.
-- FASTA list: A file containing absolute paths of the FASTA files in DBDIR/library
+- LIB_FILES: A file containing absolute paths of the FASTA files in DBDIR/library (library-files.txt)
- accession2taxid : A path to NCBI-style accession2taxid.

 * Options
   --threads : The number of CPU-cores used (all by default)
   --taxonomy-path: Directory where the taxonomy dump files are stored. (DBDIR/taxonomy by default)
   --reduced-aa : 0. Use 20 alphabets or 1. Use 15 alphabets to encode amino acids.
   --spacing-mask : Binary mask for spaced metamer. The same mask must be used for DB creation and classification. A mask should contain at least eight '1's, and '0' means skip.
+  --accession-level : Set 1 to use accession level taxonomy (0 by default).
 ```
 This will generate **diffIdx**, **info**, **split**, and **taxID_list** and some other files.
 You can delete '\*\_diffIdx' and '\*\_info' if generated.
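For example, an accession-level database could be built from this library as follows (directory names are illustrative, and `nucl_gb.accession2taxid` stands for whichever NCBI-style mapping file you use):

```
find /library -name '*.fna' > library-files.txt
metabuli build dbdir library-files.txt nucl_gb.accession2taxid --threads 32 --accession-level 1
```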
@@ -201,16 +215,21 @@ This will add your FASTA files to DBDIR/library according to their species taxon
 #### 2. Build
 ```
-metabuli build [options]
+# Get the list of absolute paths of files in your library
+find /library -name '*.fna' > library-files.txt
+
+metabuli build [options]
- DBDIR: The same DBDIR from the previous step.
-- FASTA list: A file containing absolute paths of the FASTA files in DBDIR/library
+- : A file containing absolute paths of the FASTA files in DBDIR/library (library-files.txt)
- accession2taxid : A path to NCBI-style accession2taxid.

 * Options
   --threads : The number of CPU-cores used (all by default)
   --taxonomy-path: Directory where the taxonomy dump files are stored. (DBDIR/taxonomy by default)
   --reduced-aa : 0. Use 20 alphabets or 1. Use 15 alphabets to encode amino acids.
-  --spacing-mask : Binary mask for spaced metamer. The same mask must be used for DB creation and classification. A mask should contain at least eight '1's, and '0' means skip.
+  --spacing-mask : Binary mask for spaced metamer. The same mask must be used for DB creation and classification.
+                   A mask should contain at least eight '1's, and '0' means skip.
+  --accession-level : Set 1 to use accession level taxonomy (0 by default).
 ```
 This will generate **diffIdx**, **info**, **split**, and **taxID_list** and some other files.
 You can delete '\*\_diffIdx' and '\*\_info' if generated.
diff --git a/src/util/database-report.cpp b/src/util/database-report.cpp
index 40ea3afb..4aee73e4 100644
--- a/src/util/database-report.cpp
+++ b/src/util/database-report.cpp
@@ -4,6 +4,7 @@
 #include 
 #include 
 #include "IndexCreator.h"
+#include "common.h"
 #include "report.h"
 #include "FileUtil.h"

@@ -20,13 +21,15 @@ int databaseReport(int argc, const char **argv, const Command &command) {
     par.parseParameters(argc, argv, command, true, Parameters::PARSE_ALLOW_EMPTY, 0);
     string dbDir = par.filenames[0];

-    // Check if taxonomy path exists
+    // Load taxonomy
     if (par.taxonomyPath == "DBDIR/taxonomy/") par.taxonomyPath = dbDir + "/taxonomy/";
-    if (!FileUtil::directoryExists(par.taxonomyPath.c_str())) {
-        cerr << "Error: taxonomy path " << par.taxonomyPath << " does not exist." << endl;
-        cerr << "Please specify the path to the taxonomy directory using the --taxonomy-path option." << endl;
-        return 1;
-    }
+    NcbiTaxonomy * taxonomy = loadTaxonomy(dbDir, par.taxonomyPath);
+
+    // if (!FileUtil::directoryExists(par.taxonomyPath.c_str())) {
+    //     cerr << "Error: taxonomy path " << par.taxonomyPath << " does not exist." << endl;
+    //     cerr << "Please specify the path to the taxonomy directory using the --taxonomy-path option." << endl;
+    //     return 1;
+    // }

     // Check if acc2taxid.map exists
     string acc2taxid = dbDir + "/acc2taxid.map";
@@ -35,11 +38,11 @@ int databaseReport(int argc, const char **argv, const Command &command) {
         return 1;
     }

-    // Load taxonomy
-    const string names = par.taxonomyPath + "/names.dmp";
-    const string nodes = par.taxonomyPath + "/nodes.dmp";
-    const string merged = par.taxonomyPath + "/merged.dmp";
-    auto * taxonomy = new NcbiTaxonomy(names, nodes, merged);
+    // // Load taxonomy
+    // const string names = par.taxonomyPath + "/names.dmp";
+    // const string nodes = par.taxonomyPath + "/nodes.dmp";
+    // const string merged = par.taxonomyPath + "/merged.dmp";
+    // auto * taxonomy = new NcbiTaxonomy(names, nodes, merged);

     // Load only the second column of acc2taxid.map as integers
     vector taxids;
@@ -49,8 +52,23 @@ int databaseReport(int argc, const char **argv, const Command &command) {
         return 1;
     }
     string line;
+    // Check if there is a third column
+    getline(acc2taxidFile, line);
+    vector tokens = Util::split(line, "\t");
+    int using_token = 0;
+    if (tokens.size() == 2) { // accession and taxid
+        using_token = 1;
+        taxids.push_back(stoi(tokens[using_token]));
+    } else if (tokens.size() == 3) { // accession and taxid and accession_id
+        using_token = 2;
+        taxids.push_back(stoi(tokens[using_token]));
+    } else {
+        cerr << "Error: acc2taxid.map file " << acc2taxid << " has wrong format." 
<< endl; + return 1; + } while (getline(acc2taxidFile, line)) { - int taxid = stoi(line.substr(line.find('\t') + 1)); + tokens = Util::split(line, "\t"); + int taxid = stoi(tokens[using_token]); if (find(taxids.begin(), taxids.end(), taxid) == taxids.end()) { taxids.push_back(taxid); } From b63a0bab5e740ca545fd8083a193d70633217609 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Mon, 11 Sep 2023 20:55:51 +0900 Subject: [PATCH 30/65] Improve descriptions for options. Remove unused options. Improve log prints --- src/commons/Classifier.cpp | 3 + src/commons/IndexCreator.cpp | 10 +- src/commons/IndexCreator.h | 4 + src/commons/KmerMatcher.cpp | 3 +- src/commons/LocalParameters.cpp | 572 ++++++++++++++++++++++++++++++-- src/commons/LocalParameters.h | 11 + src/commons/common.cpp | 13 + src/metabuli.cpp | 2 +- src/workflow/build.cpp | 9 + src/workflow/classify.cpp | 1 - 10 files changed, 591 insertions(+), 37 deletions(-) diff --git a/src/commons/Classifier.cpp b/src/commons/Classifier.cpp index 9258a194..3384ed88 100644 --- a/src/commons/Classifier.cpp +++ b/src/commons/Classifier.cpp @@ -7,6 +7,9 @@ Classifier::Classifier(LocalParameters & par) { dbDir = par.filenames[1 + (par.seqMode == 2)]; matchPerKmer = par.matchPerKmer; loadDbParameters(par); + + cout << "DB name: " << par.dbName << endl; + cout << "DB creation date: " << par.dbDate << endl; // Taxonomy taxonomy = loadTaxonomy(dbDir, par.taxonomyPath); diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp index aaf50291..b7945148 100644 --- a/src/commons/IndexCreator.cpp +++ b/src/commons/IndexCreator.cpp @@ -16,6 +16,11 @@ IndexCreator::IndexCreator(const LocalParameters & par) { reducedAA = par.reducedAA; spaceMask = par.spaceMask; accessionLevel = par.accessionLevel; + lowComplexityMasking = par.maskMode; + lowComplexityMaskingThreshold = par.maskProb; + dbName = par.dbName; + dbDate = par.dbDate; + // Input files dbDir = par.filenames[0]; @@ -167,7 +172,6 @@ void IndexCreator::updateIndex(const LocalParameters &par) { delete[] uniqKmerIdx; } delete[] splitChecker; - } void IndexCreator::makeBlocksForParallelProcessing() { @@ -1016,9 +1020,13 @@ void IndexCreator::writeDbParameters() { Debug(Debug::ERROR) << "Could not open " << paramterFileName << " for writing\n"; EXIT(EXIT_FAILURE); } + fprintf(handle, "DB_name\t%s\n", dbName.c_str()); + fprintf(handle, "Creation_date\t%s\n", dbDate.c_str()); fprintf(handle, "Reduced_alphabet\t%d\n", reducedAA); fprintf(handle, "Spaced_kmer_mask\t%s\n", spaceMask.c_str()); fprintf(handle, "Accession_level\t%d\n", accessionLevel); + fprintf(handle, "Mask_mode\t%d\n", lowComplexityMasking); + fprintf(handle, "Mask_prob\t%f\n", lowComplexityMaskingThreshold); fclose(handle); } diff --git a/src/commons/IndexCreator.h b/src/commons/IndexCreator.h index 1f5b7eb3..33bc1c66 100644 --- a/src/commons/IndexCreator.h +++ b/src/commons/IndexCreator.h @@ -46,6 +46,10 @@ class IndexCreator{ int reducedAA; string spaceMask; int accessionLevel; + int lowComplexityMasking; + float lowComplexityMaskingThreshold; + string dbName; + string dbDate; // Inputs NcbiTaxonomy * taxonomy; diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp index 3c0f08f8..9455070e 100644 --- a/src/commons/KmerMatcher.cpp +++ b/src/commons/KmerMatcher.cpp @@ -24,6 +24,7 @@ KmerMatcher::~KmerMatcher() { } void KmerMatcher::loadTaxIdList(const LocalParameters & par) { + cout << "Loading the list for taxonomy IDs ... 
"; if (par.contamList != "") { vector contams = Util::split(par.contamList, ","); for (auto &contam : contams) { @@ -101,7 +102,7 @@ void KmerMatcher::loadTaxIdList(const LocalParameters & par) { } fclose(taxIdFile); } - cout << "Taxonomy ID list is loaded." << endl; + cout << "Done" << endl; } diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index f38cd6a1..986ab86d 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -1,12 +1,18 @@ #include "LocalParameters.h" #include "Parameters.h" +#include "Debug.h" +#include "CommandCaller.h" +#include "Parameters.cpp" +#include "ByteParser.h" +#include +#include "DistanceCalculator.h" LocalParameters::LocalParameters() : Parameters(), VIRUS_TAX_ID(VIRUS_TAX_ID_ID, "--virus-taxid", "Taxonomy ID of virus taxon", - "NCBI: 10239 [Default]\nCUSTOM: Check names.dmp file ", + "NCBI: 10239 \nCUSTOM: Check names.dmp file ", typeid(int), (void *) &virusTaxId, "^[0-9]+$"), @@ -34,28 +40,28 @@ LocalParameters::LocalParameters() : SEQ_MODE(SEQ_MODE_ID, "--seq-mode", "Sequencing type", - "Single-end: 1 \nPaired-end: 2\nLong read: 3", + "Single-end: 1, Paired-end: 2, Long read: 3", typeid(int), (void *) &seqMode, "[1-3]"), REDUCED_AA(REDUCED_AA_ID, "--reduced-aa", - "Using reduced 15 alphabets to encode amino acids. It increases sensitivity", - "Using 20 alphabets: 0 \nUsing 15 alphabets: 1", + "Using 15 alphabets to encode AAs for sensitivity", + "Set as 0 to use 15 alphabets to encode AAs for sensitivity", typeid(int), (void *) &reducedAA, "[0-1]"), MIN_SCORE(MIN_SCORE_ID, "--min-score", - "The minimum score for classification", - "You can set a value from 0.0 to 1.0", + "Min. sequence similarity score", + "Min. sequence similarity score (0.0-1.0)", typeid(float), (void *) &minScore, "^0(\\.[0-9]+)?|1(\\.0+)?$"), MIN_COVERAGE(MIN_COVERAGE_ID, "--min-cov", - "The minimum coverage for classification", - "You can set a value from 0.0 to 1.0", + "Min. query coverage", + "Min. query coverage (0.0-1.0)", typeid(float), (void *) &minCoverage, "^0(\\.[0-9]+)?|1(\\.0+)?$"), @@ -76,17 +82,15 @@ LocalParameters::LocalParameters() : "^[0-9]+$"), HAMMING_MARGIN(HAMMING_MARGIN_ID, "--hamming-margin", - "If a query k-mer has multiple matches, the matches with hamming distance lower than sum of \n" - "the minimum distance and this margin are selected for later steps.", - "If a query k-mer has multiple matches, the matches with hamming distance lower than sum of \n" - "the minimum distance and this margin are selected for later steps.", + "Allowed extra Hamming distance", + "It allows extra Hamming distance than the minimum distance.", typeid(int), (void *) &hammingMargin, ""), MIN_SP_SCORE(MIN_SP_SCORE_ID, "--min-sp-score", - "Minimum score to be classified at species or lower rank.", - "Minimum score to be classified at the species level.", + "Min. score for species- or lower-level classification.", + "Min. score for species- or lower-level classification.", typeid(float), (void *) &minSpScore, "^0(\\.[0-9]+)?|1(\\.0+)?$"), @@ -120,22 +124,22 @@ LocalParameters::LocalParameters() : "^[0-9]+$"), MIN_CONS_CNT(MIN_CONS_CNT_ID, "--min-cons-cnt", - "Minimum number of consecutive metamer matches to be used for prokaryote/virus classification", - "Minimum number of consecutive metamer matches to be used for prokaryote/virus classification", + "Min. num. of cons. matches for non-euk. classification", + "Min. 
number of consecutive matches for prokaryote/virus classification", typeid(int), (void *) &minConsCnt, "^[0-9]+$"), MIN_CONS_CNT_EUK(MIN_CONS_CNT_EUK_ID, "--min-cons-cnt-euk", - "Minimum number of consecutive metamer matches to be used for eukaryote classification", - "Minimum number of consecutive metamer matches to be used for eukaryote classification", + "Min. num. of cons. matches for euk. classification", + "Min. number of consecutive matches for eukaryote classification", typeid(int), (void *) &minConsCntEuk, "^[0-9]+$"), MATCH_PER_KMER(MATCH_PER_KMER_ID, "--match-per-kmer", - "Number of matches per query k-mer", - "Number of matches per query k-mer. Larger values assign more memory for storing k-mer matches.", + "Number of matches per query k-mer. ", + "Num. of matches per query k-mer. Larger values assign more memory for storing k-mer matches. ", typeid(int), (void *) &matchPerKmer, "^[0-9]+$"), @@ -176,11 +180,25 @@ LocalParameters::LocalParameters() : "^[0-9]+$"), ACCESSION_LEVEL(ACCESSION_LEVEL_ID, "--accession-level", - "Build a database for accession level classification", - "Build a database for accession level classification", + "Accession-level DB build/search", + "Build or search a database for accession-level classification", typeid(int), (void *) &accessionLevel, "[0-1]"), + DB_NAME(DB_NAME_ID, + "--db-name", + "Name of the database (a random number as default)", + "Name of the database", + typeid(std::string), + (void *) &dbName, + "^.*$"), + DB_DATE(DB_DATE_ID, + "--db-date", + "Date of the database creation (current date as default)", + "Date of the database creation", + typeid(std::string), + (void *) &dbDate, + "^.*$"), TEST_RANK(TEST_RANK_ID, "--test-rank", ".", @@ -257,27 +275,29 @@ LocalParameters::LocalParameters() : build.push_back(&PARAM_MASK_RESIDUES); build.push_back(&BUFFER_SIZE); build.push_back(&ACCESSION_LEVEL); + build.push_back(&DB_NAME); + build.push_back(&DB_DATE); //classify classify.push_back(&PARAM_THREADS); classify.push_back(&SEQ_MODE); - classify.push_back(&VIRUS_TAX_ID); - classify.push_back(&REDUCED_AA); +// classify.push_back(&VIRUS_TAX_ID); +// classify.push_back(&REDUCED_AA); classify.push_back(&MIN_SCORE); classify.push_back(&MIN_COVERAGE); - classify.push_back(&SPACED); - classify.push_back(&HAMMING_MARGIN); - classify.push_back(&MIN_SP_SCORE); - classify.push_back(&PARAM_V); - classify.push_back(&RAM_USAGE); - classify.push_back(&MIN_COVERED_POS); - classify.push_back(&PRINT_LOG); - classify.push_back(&MAX_GAP); - classify.push_back(&TAXONOMY_PATH); classify.push_back(&MIN_CONS_CNT); classify.push_back(&MIN_CONS_CNT_EUK); + classify.push_back(&MIN_SP_SCORE); +// classify.push_back(&SPACED); + classify.push_back(&HAMMING_MARGIN); +// classify.push_back(&PARAM_V); +// classify.push_back(&MIN_COVERED_POS); +// classify.push_back(&PRINT_LOG); +// classify.push_back(&MAX_GAP); +// classify.push_back(&TAXONOMY_PATH); classify.push_back(&PARAM_MASK_RESIDUES); classify.push_back(&PARAM_MASK_PROBABILTY); + classify.push_back(&RAM_USAGE); classify.push_back(&MATCH_PER_KMER); classify.push_back(&ACCESSION_LEVEL); @@ -338,3 +358,489 @@ LocalParameters::LocalParameters() : databaseReport.push_back(&TAXONOMY_PATH); } +void LocalParameters::printParameters(const std::string &module, int argc, const char* pargv[], + const std::vector &par){ + if (Debug::debugLevel < Debug::INFO) { + return; + } + + Debug(Debug::INFO) << module << " "; + for (int i = 0; i < argc; i++) { + // don't expose users to the interal b64 masking of whitespace characters + if 
(strncmp("b64:", pargv[i], 4) == 0) { + Debug(Debug::INFO) << "'" << base64_decode(pargv[i] + 4, strlen(pargv[i]) - 4) << "' "; + } else { + Debug(Debug::INFO) << pargv[i] << " "; + } + } + Debug(Debug::INFO) << "\n\n"; + + if (CommandCaller::getCallDepth() > 0) { + return; + } + + size_t maxWidth = 0; + for(size_t i = 0; i < par.size(); i++) { + maxWidth = std::max(strlen(par[i]->display), maxWidth); + } + + std::stringstream ss; + ss << std::boolalpha; + + ss << std::setw(maxWidth) << std::left << "Metabuli Version:" << "\t" << "1.0.2" << "\n"; + + for (size_t i = 0; i < par.size(); i++) { + if (par[i]->category & MMseqsParameter::COMMAND_HIDDEN) { + continue; + } + ss << std::setw(maxWidth) << std::left << par[i]->display << "\t"; + if (typeid(int) == par[i]->type ) { + ss << *((int *)par[i]->value); + } else if(typeid(size_t) == par[i]->type ){ + ss << *((size_t *)par[i]->value); + } else if(typeid(ByteParser) == par[i]->type) { + ss << ByteParser::format(*((size_t *)par[i]->value), 'a', 'h'); + } else if(PARAM_SUB_MAT.uniqid == par[i]->uniqid || PARAM_SEED_SUB_MAT.uniqid == par[i]->uniqid) { + MultiParam>* param = ((MultiParam>*) par[i]->value); + MultiParam> tmpPar(NuclAA( + BaseMatrix::unserializeName(param->values.aminoacid().c_str()), + BaseMatrix::unserializeName(param->values.nucleotide().c_str()) + )); + ss << MultiParam>::format(tmpPar); + } else if(typeid(MultiParam>) == par[i]->type) { + ss << MultiParam>::format(*((MultiParam> *)par[i]->value)); + } else if(typeid(MultiParam>) == par[i]->type) { + ss << MultiParam>::format(*((MultiParam> *)par[i]->value)); + } else if(typeid(MultiParam>) == par[i]->type) { + ss << MultiParam>::format(*((MultiParam> *)par[i]->value)); + } else if(typeid(MultiParam>) == par[i]->type) { + ss << MultiParam>::format(*((MultiParam> *)par[i]->value)); + } else if(typeid(MultiParam) == par[i]->type) { + ss << MultiParam::format(*((MultiParam *)par[i]->value)); + } else if(typeid(float) == par[i]->type) { + ss << *((float *)par[i]->value); + } else if(typeid(double) == par[i]->type) { + ss << *((double *)par[i]->value); + } else if(typeid(std::string) == par[i]->type) { + ss << *((std::string *) par[i]->value); + } else if (typeid(bool) == par[i]->type) { + ss << *((bool *)par[i]->value); + } + ss << "\n"; + } + + Debug(Debug::INFO) << ss.str() << "\n"; +} + +void LocalParameters::parseParameters(int argc, const char *pargv[], const Command &command, bool printPar, int parseFlags, + int outputFlags) { + filenames.clear(); + std::vector & par = *command.params; + + bool canHandleHelp = false; + for (size_t parIdx = 0; parIdx < par.size(); parIdx++) { + if (par[parIdx]->uniqid == PARAM_HELP_ID || par[parIdx]->uniqid == PARAM_HELP_LONG_ID) { + canHandleHelp = true; + } + } + + size_t parametersFound = 0; + for (int argIdx = 0; argIdx < argc; argIdx++) { + // it is a parameter if it starts with - or -- + const bool longParameter = (pargv[argIdx][0] == '-' && pargv[argIdx][1] == '-'); + if (longParameter || (pargv[argIdx][0] == '-')) { + if ((parseFlags & PARSE_REST) && longParameter && pargv[argIdx][2] == '\0') { + restArgv = pargv + argIdx + 1; + restArgc = argc - (argIdx + 1); + break; + } + std::string parameter(pargv[argIdx]); + if (canHandleHelp == false && (parameter.compare("-h") == 0 || parameter.compare("--help") == 0)) { + printUsageMessage(command, 0xFFFFFFFF); + EXIT(EXIT_SUCCESS); + } + + bool hasUnrecognizedParameter = true; + for (size_t parIdx = 0; parIdx < par.size(); parIdx++) { + if(parameter.compare(par[parIdx]->name) == 0) { + if 
(typeid(bool) != par[parIdx]->type && argIdx + 1 == argc) { + printUsageMessage(command, outputFlags); + Debug(Debug::ERROR) << "Missing argument " << par[parIdx]->name << "\n"; + EXIT(EXIT_FAILURE); + } + + if (par[parIdx]->wasSet) { + printUsageMessage(command, outputFlags); + Debug(Debug::ERROR) << "Duplicate parameter " << par[parIdx]->name << "\n"; + EXIT(EXIT_FAILURE); + } + + if (typeid(int) == par[parIdx]->type) { + regex_t regex; + compileRegex(®ex, par[parIdx]->regex); + int nomatch = regexec(®ex, pargv[argIdx+1], 0, NULL, 0); + regfree(®ex); + // if no match found or two matches found (we want exactly one match) + if (nomatch){ + printUsageMessage(command, 0xFFFFFFFF); + Debug(Debug::ERROR) << "Error in argument " << par[parIdx]->name << "\n"; + EXIT(EXIT_FAILURE); + }else{ + *((int *) par[parIdx]->value) = atoi(pargv[argIdx+1]); + par[parIdx]->wasSet = true; + } + argIdx++; + } else if (typeid(size_t) == par[parIdx]->type) { + regex_t regex; + compileRegex(®ex, par[parIdx]->regex); + int nomatch = regexec(®ex, pargv[argIdx+1], 0, NULL, 0); + regfree(®ex); + // if no match found or two matches found (we want exactly one match) + if (nomatch){ + printUsageMessage(command, 0xFFFFFFFF); + Debug(Debug::ERROR) << "Error in argument " << par[parIdx]->name << "\n"; + EXIT(EXIT_FAILURE); + }else{ + *((size_t *) par[parIdx]->value) = atoi(pargv[argIdx+1]); + par[parIdx]->wasSet = true; + } + argIdx++; + } else if (typeid(ByteParser) == par[parIdx]->type) { + regex_t regex; + compileRegex(®ex, par[parIdx]->regex); + int nomatch = regexec(®ex, pargv[argIdx+1], 0, NULL, 0); + regfree(®ex); + + // if no match found or two matches found (we want exactly one match) + if (nomatch){ + printUsageMessage(command, 0xFFFFFFFF); + Debug(Debug::ERROR) << "Error in argument regex " << par[parIdx]->name << "\n"; + EXIT(EXIT_FAILURE); + } else { + size_t value = ByteParser::parse(pargv[argIdx+1]); + if (value == ByteParser::INVALID_SIZE) { + printUsageMessage(command, 0xFFFFFFFF); + Debug(Debug::ERROR) << "Error in value parsing " << par[parIdx]->name << "\n"; + EXIT(EXIT_FAILURE); + } else { + *((size_t *) par[parIdx]->value) = value; + par[parIdx]->wasSet = true; + } + } + argIdx++; + } else if (typeid(MultiParam>) == par[parIdx]->type) { + std::string val(pargv[argIdx+1]); + if (Util::startWith("b64:", val)) { + val = base64_decode(val.c_str() + 4, val.size() - 4); + } + NuclAA value = MultiParam>(val.c_str()).values; + if (value.first == "INVALID" || value.second == "INVALID") { + printUsageMessage(command, 0xFFFFFFFF); + Debug(Debug::ERROR) << "Error in value parsing " << par[parIdx]->name << "\n"; + EXIT(EXIT_FAILURE); + } else { + *((MultiParam> *) par[parIdx]->value) = value; + par[parIdx]->wasSet = true; + } + argIdx++; + }else if (typeid(MultiParam>) == par[parIdx]->type) { + NuclAA value = MultiParam>(pargv[argIdx+1]).values; + if (value.first == INT_MAX || value.second == INT_MAX) { + printUsageMessage(command, 0xFFFFFFFF); + Debug(Debug::ERROR) << "Error in value parsing " << par[parIdx]->name << "\n"; + EXIT(EXIT_FAILURE); + } else { + *((MultiParam> *) par[parIdx]->value) = value; + par[parIdx]->wasSet = true; + } + argIdx++; + }else if (typeid(MultiParam>) == par[parIdx]->type) { + NuclAA value = MultiParam>(pargv[argIdx + 1]).values; + if (value.first == FLT_MAX || value.second == FLT_MAX) { + printUsageMessage(command, 0xFFFFFFFF); + Debug(Debug::ERROR) << "Error in value parsing " << par[parIdx]->name << "\n"; + EXIT(EXIT_FAILURE); + } else { + *((MultiParam> *) par[parIdx]->value) = 
value; + par[parIdx]->wasSet = true; + } + argIdx++; + }else if (typeid(MultiParam>) == par[parIdx]->type) { + SeqProf value = MultiParam>(pargv[argIdx+1]).values; + *((MultiParam> *) par[parIdx]->value) = value; + par[parIdx]->wasSet = true; + argIdx++; + }else if (typeid(MultiParam) == par[parIdx]->type) { + PseudoCounts value = MultiParam(pargv[argIdx + 1]).values; + if (value.first == FLT_MAX || value.second == FLT_MAX) { + printUsageMessage(command, 0xFFFFFFFF); + Debug(Debug::ERROR) << "Error in value parsing " << par[parIdx]->name << "\n"; + EXIT(EXIT_FAILURE); + } else { + *((MultiParam *) par[parIdx]->value) = value; + par[parIdx]->wasSet = true; + } + argIdx++; + }else if (typeid(float) == par[parIdx]->type) { + regex_t regex; + compileRegex(®ex, par[parIdx]->regex); + int nomatch = regexec(®ex, pargv[argIdx+1], 0, NULL, 0); + regfree(®ex); + if (nomatch){ + printUsageMessage(command, 0xFFFFFFFF); + Debug(Debug::ERROR) << "Error in argument " << par[parIdx]->name << "\n"; + EXIT(EXIT_FAILURE); + }else{ + double input = strtod(pargv[argIdx+1], NULL); + *((float *) par[parIdx]->value) = static_cast(input); + par[parIdx]->wasSet = true; + } + argIdx++; + } else if (typeid(double) == par[parIdx]->type) { + regex_t regex; + compileRegex(®ex, par[parIdx]->regex); + int nomatch = regexec(®ex, pargv[argIdx+1], 0, NULL, 0); + regfree(®ex); + if (nomatch){ + printUsageMessage(command, 0xFFFFFFFF); + Debug(Debug::ERROR) << "Error in argument " << par[parIdx]->name << "\n"; + EXIT(EXIT_FAILURE); + }else{ + *((double *) par[parIdx]->value) = strtod(pargv[argIdx+1], NULL); + par[parIdx]->wasSet = true; + } + argIdx++; + } else if (typeid(std::string) == par[parIdx]->type) { + std::string val(pargv[argIdx+1]); + if (Util::startWith("b64:", val)) { + val = base64_decode(val.c_str() + 4, val.size() - 4); + } + std::string* currVal = (std::string*)par[parIdx]->value; + currVal->assign(val); + par[parIdx]->wasSet = true; + argIdx++; + } else if (typeid(bool) == par[parIdx]->type) { + bool *value = (bool *) par[parIdx]->value; + if (argIdx + 1 == argc || pargv[argIdx+1][0] == '-') { + *value = !*value; + } else { + *value = parseBool(pargv[argIdx+1]); + argIdx++; + } + par[parIdx]->wasSet = true; + } else { + Debug(Debug::ERROR) << "Wrong parameter type in parseParameters. Please inform the developers\n"; + EXIT(EXIT_FAILURE); + } + + hasUnrecognizedParameter = false; + continue; + } + } + + if (hasUnrecognizedParameter) { + printUsageMessage(command, 0xFFFFFFFF); + + // Suggest some parameter that the user might have meant + std::vector::const_iterator index = par.end(); + int maxDistance = 0; + for (std::vector::const_iterator it = par.begin(); it != par.end(); ++it) { + int distance = DistanceCalculator::localLevenshteinDistance(parameter, (*it)->name); + if (distance > maxDistance) { + maxDistance = distance; + index = it; + } + } + + Debug(Debug::ERROR) << "Unrecognized parameter \"" << parameter << "\""; + if (index != par.end()) { + Debug(Debug::ERROR) << ". 
Did you mean \"" << (*index)->name << "\" (" << (*index)->display << ")?\n"; + } else { + Debug(Debug::ERROR) << "\n"; + } + + EXIT(EXIT_FAILURE); + } + + parametersFound++; + } else { + // parameter is actually a filename +#ifdef __CYGWIN__ + // normalize windows paths to cygwin unix paths + const char *path = pargv[argIdx]; + ssize_t size = cygwin_conv_path(CCP_WIN_A_TO_POSIX | CCP_RELATIVE, path, NULL, 0); + if (size < 0) { + Debug(Debug::ERROR) << "Could not convert cygwin path!\n"; + EXIT(EXIT_FAILURE); + } else { + char *posix = new char[size]; + if (cygwin_conv_path(CCP_WIN_A_TO_POSIX | CCP_RELATIVE, path, posix, size)) { + Debug(Debug::ERROR) << "Could not convert cygwin path!\n"; + EXIT(EXIT_FAILURE); + } + filenames.emplace_back(posix); + delete posix; + } +#else + filenames.emplace_back(pargv[argIdx]); +#endif + } + } + + if (MMseqsMPI::isMaster()) { + Debug::setDebugLevel(verbosity); + } + +#ifdef OPENMP + omp_set_num_threads(threads); +#endif +#ifndef OPENMP + threads = 1; +#endif + + + bool ignorePathCountChecks = command.databases.empty() == false && command.databases[0].specialType & DbType::ZERO_OR_ALL && filenames.size() == 0; + const size_t MAX_DB_PARAMETER = 6; + if (ignorePathCountChecks == false && command.databases.size() > MAX_DB_PARAMETER) { + Debug(Debug::ERROR) << "Use argv if you need more than " << MAX_DB_PARAMETER << " db parameters" << "\n"; + EXIT(EXIT_FAILURE); + } + + if (ignorePathCountChecks == false && filenames.size() < command.databases.size()){ + printUsageMessage(command, outputFlags); + Debug(Debug::ERROR) << "Not enough input paths provided. "; + if (command.databases.size() == 1) { + Debug(Debug::ERROR) << "1 path is required.\n"; + } else { + Debug(Debug::ERROR) << command.databases.size() << " paths are required.\n"; + } + EXIT(EXIT_FAILURE); + } + + bool isVar = false; + bool isStartVar = false; + bool isMiddleVar = false; + bool isEndVar = false; + if(command.databases.empty() == false && command.databases[0].validator != NULL) { + if (command.databases.size() >= 2) { + for(size_t i = 0; i < command.databases.size();i++){ + if(i == 0){ + isStartVar |= (command.databases[i].specialType & DbType::VARIADIC); + } else if(i == command.databases.size() - 1){ + isEndVar |= (command.databases[i].specialType & DbType::VARIADIC); + } else { + isMiddleVar |= (command.databases[i].specialType & DbType::VARIADIC); + } + + } + isVar = isStartVar | isMiddleVar | isEndVar; + } + if (ignorePathCountChecks == false && isVar == false && filenames.size() > command.databases.size()) { + printUsageMessage(command, outputFlags); + Debug(Debug::ERROR) << "Too many input paths provided. 
Only " << SSTR(command.databases.size()) << " are allowed\n"; + EXIT(EXIT_FAILURE); + } + } + switch (std::min(filenames.size(), MAX_DB_PARAMETER)) { + case 6: + db6 = filenames[5]; + db6Index = db6; + db6Index.append(".index"); + db6dbtype = db6; + db6dbtype.append(".dbtype"); + hdr6 = db6; + hdr6.append("_h"); + hdr6Index = hdr6; + hdr6Index.append(".index"); + hdr6dbtype = hdr6; + hdr6dbtype.append(".dbtype"); + // FALLTHROUGH + case 5: + db5 = filenames[4]; + db5Index = db5; + db5Index.append(".index"); + db5dbtype = db5; + db5dbtype.append(".dbtype"); + hdr5 = db5; + hdr5.append("_h"); + hdr5Index = hdr5; + hdr5Index.append(".index"); + hdr5dbtype = hdr5; + hdr5dbtype.append(".dbtype"); + // FALLTHROUGH + case 4: + db4 = filenames[3]; + db4Index = db4; + db4Index.append(".index"); + db4dbtype = db4; + db4dbtype.append(".dbtype"); + hdr4 = db4; + hdr4.append("_h"); + hdr4Index = hdr4; + hdr4Index.append(".index"); + hdr4dbtype = hdr4; + hdr4dbtype.append(".dbtype"); + // FALLTHROUGH + case 3: + db3 = filenames[2]; + db3Index = db3; + db3Index.append(".index"); + db3dbtype = db3; + db3dbtype.append(".dbtype"); + hdr3 = db3; + hdr3.append("_h"); + hdr3Index = hdr3; + hdr3Index.append(".index"); + hdr3dbtype = hdr3; + hdr3dbtype.append(".dbtype"); + // FALLTHROUGH + case 2: + db2 = filenames[1]; + db2Index = db2; + db2Index.append(".index"); + db2dbtype = db2; + db2dbtype.append(".dbtype"); + hdr2 = db2; + hdr2.append("_h"); + hdr2Index = hdr2; + hdr2Index.append(".index"); + hdr2dbtype = hdr2; + hdr2dbtype.append(".dbtype"); + // FALLTHROUGH + case 1: + db1 = filenames[0]; + db1Index = db1; + db1Index.append(".index"); + db1dbtype = db1; + db1dbtype.append(".dbtype"); + hdr1 = db1; + hdr1.append("_h"); + hdr1Index = hdr1; + hdr1Index.append(".index"); + hdr1dbtype = hdr1; + hdr1dbtype.append(".dbtype"); + break; + default: + // Do not abort execution if we expect a variable amount of parameters + if (parseFlags & PARSE_VARIADIC) + break; + // FALLTHROUGH + case 0: + if (parseFlags & PARSE_ALLOW_EMPTY) + break; + printUsageMessage(command, outputFlags); + printParameters(command.cmd, argc, pargv, par); + Debug(Debug::ERROR) << "Unrecognized parameters!" 
<< "\n"; + EXIT(EXIT_FAILURE); + } + + initMatrices(); + + if (ignorePathCountChecks == false) { + checkIfDatabaseIsValid(command, argc, pargv, isStartVar, isMiddleVar, isEndVar); + } + + if (printPar == true) { + printParameters(command.cmd, argc, pargv, par); + } +} \ No newline at end of file diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index 2d75912a..ccbd03c6 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -63,6 +63,8 @@ class LocalParameters : public Parameters { PARAMETER(SPLIT_NUM) PARAMETER(BUFFER_SIZE) PARAMETER(ACCESSION_LEVEL) + PARAMETER(DB_NAME) + PARAMETER(DB_DATE) // Test parameters PARAMETER(TEST_RANK) @@ -104,6 +106,8 @@ class LocalParameters : public Parameters { std::string tinfoPath; std::string libraryPath; std::string taxonomyPath; + std::string dbName; + std::string dbDate; int splitNum; size_t bufferSize; int accessionLevel; @@ -124,6 +128,13 @@ class LocalParameters : public Parameters { int printMode; std::string contamList; + void printParameters(const std::string &module, int argc, + const char* pargv[], + const std::vector &par); + + void parseParameters(int argc, const char *pargv[], const Command &command, bool printPar, int parseFlags, + int outputFlags); + private: LocalParameters(); diff --git a/src/commons/common.cpp b/src/commons/common.cpp index 9f54ac75..c1ebff2a 100644 --- a/src/commons/common.cpp +++ b/src/commons/common.cpp @@ -85,6 +85,15 @@ int loadDbParameters(LocalParameters &par) { while (getline(dbParametersFile, eachLine)) { std::vector tokens = Util::split(eachLine, "\t"); if (tokens[0] == "Reduced_alphabet") { + // if (stoi(tokens[1]) != par.reducedAA){ + // if (par.reducedAA == 0){ // DB with reduced AA + // cerr << "Warning: Current DB is built with reduced 15 amino acid alphabets." << endl; + // cerr << " --reduce-aa option will be ignored " << endl; + // } else { + // cerr << "Warning: Current DB is built with 20 amino acid alphabets." 
diff --git a/src/commons/common.cpp b/src/commons/common.cpp
index 9f54ac75..c1ebff2a 100644
--- a/src/commons/common.cpp
+++ b/src/commons/common.cpp
@@ -85,6 +85,15 @@ int loadDbParameters(LocalParameters &par) {
         while (getline(dbParametersFile, eachLine)) {
             std::vector<std::string> tokens = Util::split(eachLine, "\t");
             if (tokens[0] == "Reduced_alphabet") {
+                // if (stoi(tokens[1]) != par.reducedAA){
+                //     if (par.reducedAA == 0){ // DB with reduced AA
+                //         cerr << "Warning: Current DB is built with reduced 15 amino acid alphabets." << endl;
+                //         cerr << "         --reduced-aa option will be ignored" << endl;
+                //     } else {
+                //         cerr << "Warning: Current DB is built with 20 amino acid alphabets." << endl;
+                //         cerr << "         --reduced-aa option will be ignored" << endl;
+                //     }
+                // }
                 par.reducedAA = stoi(tokens[1]);
             } else if (tokens[0] == "Spaced_kmer_mask") {
                 par.spaceMask = tokens[1];
@@ -96,6 +105,10 @@ int loadDbParameters(LocalParameters &par) {
                 if (tokens[1] == "1" && par.accessionLevel == 0){
                     par.accessionLevel = 2;
                 }
+            } else if (tokens[0] == "DB_name") {
+                par.dbName = tokens[1];
+            } else if (tokens[0] == "Creation_date") {
+                par.dbDate = tokens[1];
             }
         }
         return 1;
diff --git a/src/metabuli.cpp b/src/metabuli.cpp
index 7783d91f..43a8e9de 100644
--- a/src/metabuli.cpp
+++ b/src/metabuli.cpp
@@ -10,7 +10,7 @@ const char* binary_name = "metabuli";
 const char* tool_name = "metabuli";
 const char* tool_introduction = "Metabuli is a taxonomical classifier that jointly analyzes amino acid and DNA sequences.";
-const char* main_author = "Jaebeom Kim ";
+const char* main_author = "Jaebeom Kim ";
 const char* show_extended_help = "1";
 const char* show_bash_info = nullptr;
 bool hide_base_commands = true;
diff --git a/src/workflow/build.cpp b/src/workflow/build.cpp
index eaf0c367..b9341bc3 100644
--- a/src/workflow/build.cpp
+++ b/src/workflow/build.cpp
@@ -13,6 +13,15 @@ void setDefaults_build(LocalParameters & par){
     par.maskMode = 1;
     par.bufferSize = 1'000'000'000;
     par.accessionLevel = 0;
+    // Get current date
+    time_t now = time(0);
+    tm *ltm = localtime(&now);
+    par.dbDate = to_string(1900 + ltm->tm_year) + "-" + to_string(1 + ltm->tm_mon) + "-" + to_string(ltm->tm_mday);
+
+    // Get a random numeric string for dbName from the current time
+    srand(time(NULL));
+    string randStr = to_string(rand());
+    par.dbName = randStr.substr(0, 32);
 }
 
 int build(int argc, const char **argv, const Command &command){
diff --git a/src/workflow/classify.cpp b/src/workflow/classify.cpp
index 91977368..495e97db 100644
--- a/src/workflow/classify.cpp
+++ b/src/workflow/classify.cpp
@@ -47,7 +47,6 @@ int classify(int argc, const char **argv, const Command& command)
     omp_set_num_threads(par.threads);
 #endif
 
-    cout << "Number of threads: " << par.threads << endl;
     Classifier * classifier = new Classifier(par);
     classifier->startClassify(par);
     delete classifier;

From 5793500be6f16afec0acf85bb7fc885ca5d96b7a Mon Sep 17 00:00:00 2001
From: Jaebeom Kim 
Date: Tue, 12 Sep 2023 10:49:17 +0900
Subject: [PATCH 31/65] Improve user interface
---
 README.md                       |  4 ++--
 src/commons/LocalParameters.cpp |  2 +-
 src/commons/ProdigalWrapper.cpp |  4 ++--
 src/metabuli.cpp                |  2 +-
 src/workflow/add_to_library.cpp |  4 ++--
 src/workflow/classify.cpp       | 15 +++++++++++++++
 6 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 3e6e9170..c240cc0b 100644
--- a/README.md
+++ b/README.md
@@ -173,7 +173,7 @@ Accessions that are not included in the `` will be skipped and
 ```
 # Get the list of absolute paths of files in your library
-find /library -name '*.fna' > library-files.txt
+find /library -type f -name '*.fna' > library-files.txt
 
 metabuli build [options]
 - DBDIR: The same DBDIR from the previous step.
@@ -216,7 +216,7 @@ This will add your FASTA files to DBDIR/library according to their species taxon
 #### 2. Build
 ```
 # Get the list of absolute paths of files in your library
-find /library -name '*.fna' > library-files.txt
+find /library -type f -name '*.fna' > library-files.txt
 
 metabuli build [options]
 - DBDIR: The same DBDIR from the previous step.
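A note on the default `dbDate` built in `setDefaults_build()` above: concatenating `tm_mon`/`tm_mday` with `to_string` yields unpadded dates such as `2023-9-12`. If zero-padded dates were wanted, `strftime()` would produce them directly; this is only an alternative sketch, not what the patch does:

```
// Alternative sketch: zero-padded creation date via strftime(), e.g. "2023-09-12".
#include <ctime>
#include <string>

std::string currentDate() {
    time_t now = time(nullptr);
    char buf[11];                                  // "YYYY-MM-DD" + '\0'
    strftime(buf, sizeof(buf), "%Y-%m-%d", localtime(&now));
    return std::string(buf);
}
```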
diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp
index 986ab86d..bf4a20f5 100644
--- a/src/commons/LocalParameters.cpp
+++ b/src/commons/LocalParameters.cpp
@@ -268,7 +268,7 @@ LocalParameters::LocalParameters() :
     // build
     build.push_back(&PARAM_THREADS);
     build.push_back(&REDUCED_AA);
-    build.push_back(&SPACED);
+    // build.push_back(&SPACED);
     build.push_back(&TAXONOMY_PATH);
     build.push_back(&SPLIT_NUM);
     build.push_back(&PARAM_MASK_PROBABILTY);
diff --git a/src/commons/ProdigalWrapper.cpp b/src/commons/ProdigalWrapper.cpp
index 7b6e39d2..6c27f516 100644
--- a/src/commons/ProdigalWrapper.cpp
+++ b/src/commons/ProdigalWrapper.cpp
@@ -290,8 +290,8 @@ int ProdigalWrapper::getNextSeq(char * line, int training) {
             bctr+=2; len++;
         }
         if(len >= MAX_SEQ) {
-            fprintf(stderr, "\n\nWarning: Sequence is long (max %d).\n", MAX_SEQ);
-            fprintf(stderr, "Use the first %d bases.\n\n", MAX_SEQ);
+            // fprintf(stderr, "\n\nWarning: Sequence is long (max %d).\n", MAX_SEQ);
+            // fprintf(stderr, "Use the first %d bases.\n\n", MAX_SEQ);
             break;
         }
     }
diff --git a/src/metabuli.cpp b/src/metabuli.cpp
index 43a8e9de..3d9188c2 100644
--- a/src/metabuli.cpp
+++ b/src/metabuli.cpp
@@ -61,7 +61,7 @@ std::vector<Command> commands = {
          "Assigning taxonomy label to query reads",
          nullptr,
          "Jaebeom Kim ",
-         " ",
+         " ",
          CITATION_SPACEPHARER,
          {{"FASTA", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfile},
           {"DB dir", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory},
diff --git a/src/workflow/add_to_library.cpp b/src/workflow/add_to_library.cpp
index 8abb3ff2..ee19855c 100644
--- a/src/workflow/add_to_library.cpp
+++ b/src/workflow/add_to_library.cpp
@@ -34,7 +34,7 @@ int addToLibrary(int argc, const char **argv, const Command &command){
     }
 
     // Load taxonomy
-    NcbiTaxonomy * taxonomy = loadTaxonomy(dbDir);
+    NcbiTaxonomy * taxonomy = loadTaxonomy(dbDir, par.taxonomyPath);
 
     // Load file names
     ifstream fileListFile;
@@ -60,7 +60,7 @@ int addToLibrary(int argc, const char **argv, const Command &command){
     char accession_version[2048];
     int taxID;
     fscanf(mappingFile, "%*s\t%*s\t%*s\t%*s");
-    while (fscanf(mappingFile, "%s\t%s\t%d\t%*d", accession, accession_version, &taxID) == 2) {
+    while (fscanf(mappingFile, "%s\t%s\t%d\t%*d", accession, accession_version, &taxID) == 3) {
         acc2taxid[string(accession_version)] = taxID;
         acc2taxid[string(accession)] = taxID;
     }
diff --git a/src/workflow/classify.cpp b/src/workflow/classify.cpp
index 495e97db..460bab34 100644
--- a/src/workflow/classify.cpp
+++ b/src/workflow/classify.cpp
@@ -34,10 +34,25 @@ int classify(int argc, const char **argv, const Command& command)
     par.parseParameters(argc, argv, command, true, Parameters::PARSE_ALLOW_EMPTY, 0);
 
     if (par.seqMode == 2) {
+        // Check if the second argument is a directory
+        if (FileUtil::directoryExists(par.filenames[1].c_str())) {
+            cerr << "Error: " << par.filenames[1] << " is a directory. Please specify a query file name." << endl;
+            cerr << "       For '--seq-mode 2', please provide two query files." << endl;
+            exit(1);
+        }
+
         if (!FileUtil::directoryExists(par.filenames[3].c_str())) {
             FileUtil::makeDir(par.filenames[3].c_str());
         }
     } else {
+        // Check if the second argument is a file
+        if (FileUtil::fileExists(par.filenames[1].c_str())
+            && !FileUtil::directoryExists(par.filenames[1].c_str())) {
+            cerr << "Error: " << par.filenames[1] << " is a file. Please specify a database directory." << endl;
+            cerr << "       For '--seq-mode 1' and '--seq-mode 3', please provide one query file." << endl;
+            exit(1);
+        }
+
         if (!FileUtil::directoryExists(par.filenames[2].c_str())) {
             FileUtil::makeDir(par.filenames[2].c_str());
         }

From 43bfa6f607fe4d79e02d1c211a64c36dcb38ebf7 Mon Sep 17 00:00:00 2001
From: Jaebeom Kim 
Date: Tue, 12 Sep 2023 14:00:51 +0900
Subject: [PATCH 32/65] 1) accession-level FileMerger 2) Support accessions
 with a prefix (NZ_...)
---
 data/metabulidatabases.sh       |  6 +--
 src/commons/FileMerger.cpp      | 44 +++++++++++-----------
 src/commons/FileMerger.h        |  3 ++
 src/commons/IndexCreator.cpp    | 66 +++++++++++++++++++++------------
 src/commons/common.cpp          | 40 +++++++++++++++++++-
 src/commons/common.h            |  2 +
 src/metabuli.cpp                |  6 +--
 src/workflow/add_to_library.cpp | 29 +++++++++------
 8 files changed, 131 insertions(+), 65 deletions(-)

diff --git a/data/metabulidatabases.sh b/data/metabulidatabases.sh
index 706c3135..653da5ae 100644
--- a/data/metabulidatabases.sh
+++ b/data/metabulidatabases.sh
@@ -115,10 +115,10 @@ case "${SELECTION}" in
         # INPUT_TYPE="METABULI_DB"
     ;;
     "RefSeq_virus")
-        if notExists "${TMP_PATH}/refseq_virus.tar.gz"; then
-            downloadFile "https://metabuli.steineggerlab.workers.dev/refseq_virus.tar.gz" "${TMP_PATH}/refseq_virus.tar.gz"
+        if notExists "${TMP_PATH}/refseq_virus+human.tar.gz"; then
+            downloadFile "https://metabuli.steineggerlab.workers.dev/refseq_virus+human.tar.gz" "${TMP_PATH}/refseq_virus+human.tar.gz"
         fi
-        tar zxvf "${TMP_PATH}/refseq_virus.tar.gz" -C "${OUTDB}"
+        tar zxvf "${TMP_PATH}/refseq_virus+human.tar.gz" -C "${OUTDB}"
         # push_back "${TMP_PATH}/refseq_virus"
         # INPUT_TYPE="METABULI_DB"
     ;;
diff --git a/src/commons/FileMerger.cpp b/src/commons/FileMerger.cpp
index f0301680..91d55bf6 100644
--- a/src/commons/FileMerger.cpp
+++ b/src/commons/FileMerger.cpp
@@ -1,6 +1,9 @@
 #include "FileMerger.h"
+#include "common.h"
 
 FileMerger::FileMerger(const LocalParameters & par) {
+    // Load parameters
+    dbDir = par.filenames[0];
     splitNum = par.splitNum;
     bufferSize = par.bufferSize;
     if (par.reducedAA == 1){
@@ -10,10 +13,11 @@ FileMerger::FileMerger(const LocalParameters & par) {
         MARKER = 16777215;
         MARKER = ~ MARKER;
     }
+    taxonomy = loadTaxonomy(dbDir, "");
 }
 
 FileMerger::~FileMerger() {
-
+    delete taxonomy;
 }
 
 //void FileMerger::mergeTargetFiles(std::vector diffIdxFileNames, std::vector infoFileNames, vector & taxIdListAtRank, vector & taxIdList) {
@@ -177,38 +181,32 @@ FileMerger::~FileMerger() {
 // Merge differential index and k-mer information files, reducing redundancy
 void FileMerger::mergeTargetFiles(const LocalParameters & par, int numOfSplits) {
     size_t writtenKmerCnt = 0;
-    const string dbDirectory = par.filenames[0];
-
-    // Taxonomy
-    NcbiTaxonomy taxonomy(par.taxonomyPath + "/names.dmp",
-                          par.taxonomyPath + "/nodes.dmp",
-                          par.taxonomyPath + "/merged.dmp");
-
+    
+    // Load taxonomy ids
     FILE * taxIdFile;
-    if((taxIdFile = fopen((string(dbDirectory) + "/taxID_list").c_str(),"r")) == NULL){
+    if((taxIdFile = fopen((string(dbDir) + "/taxID_list").c_str(),"r")) == NULL){
         cout<<"Cannot open the taxID list file."<<endl;
         return;
     }
     char taxID[100];
     unordered_map<TaxID, TaxID> taxId2speciesId;
-    while(feof(taxIdFile) == 0)
-    {
+    while(feof(taxIdFile) == 0) {
         fscanf(taxIdFile,"%s",taxID);
         TaxID taxId = atol(taxID);
-        TaxonNode const * taxon = taxonomy.taxonNode(taxId);
+        TaxonNode const * taxon = taxonomy->taxonNode(taxId);
         if (taxId == taxon->taxId){
-            TaxID speciesTaxID = taxonomy.getTaxIdAtRank(taxId, "species");
+            TaxID speciesTaxID = taxonomy->getTaxIdAtRank(taxId, "species");
             while (taxon->taxId != speciesTaxID) {
                 taxId2speciesId[taxon->taxId] = speciesTaxID;
-                taxon = taxonomy.taxonNode(taxon->parentTaxId);
+                taxon = taxonomy->taxonNode(taxon->parentTaxId);
             }
             taxId2speciesId[speciesTaxID] = speciesTaxID;
         } else { // merged
-            TaxID speciesTaxID = taxonomy.getTaxIdAtRank(taxId, "species");
+            TaxID speciesTaxID = taxonomy->getTaxIdAtRank(taxId, "species");
             while (taxon->taxId != speciesTaxID) {
                 taxId2speciesId[taxon->taxId] = speciesTaxID;
-                taxon = taxonomy.taxonNode(taxon->parentTaxId);
+                taxon = taxonomy->taxonNode(taxon->parentTaxId);
             }
             taxId2speciesId[speciesTaxID] = speciesTaxID;
             taxId2speciesId[taxId] = speciesTaxID;
@@ -217,9 +215,9 @@ void FileMerger::mergeTargetFiles(const LocalParameters & par, int numOfSplits)
     fclose(taxIdFile);
 
     // File names for the final DB
-    string mergedDiffFileName = dbDirectory + "/diffIdx";
-    string mergedInfoFileName = dbDirectory + "/info";
-    string diffIdxSplitFileName = dbDirectory + "/split";
+    string mergedDiffFileName = dbDir + "/diffIdx";
+    string mergedInfoFileName = dbDir + "/info";
+    string diffIdxSplitFileName = dbDir + "/split";
 
     // Files to write
     FILE * mergedDiffFile = fopen(mergedDiffFileName.c_str(), "wb");
@@ -246,8 +244,8 @@ void FileMerger::mergeTargetFiles(const LocalParameters & par, int numOfSplits)
     struct MmapedData<uint16_t> *diffFileList = new struct MmapedData<uint16_t>[numOfSplits];
     struct MmapedData<TargetKmerInfo> *infoFileList = new struct MmapedData<TargetKmerInfo>[numOfSplits];
     for (int file = 0; file < numOfSplits; file++) {
-        diffFileList[file] = mmapData<uint16_t>((dbDirectory + "/" + to_string(file) + "_diffIdx").c_str());
-        infoFileList[file] = mmapData<TargetKmerInfo>((dbDirectory + "/" + to_string(file) + "_info").c_str());
+        diffFileList[file] = mmapData<uint16_t>((dbDir + "/" + to_string(file) + "_diffIdx").c_str());
+        infoFileList[file] = mmapData<TargetKmerInfo>((dbDir + "/" + to_string(file) + "_info").c_str());
         maxIdxOfEachFiles[file] = diffFileList[file].fileSize / sizeof(uint16_t);
         numOfKmerBeforeMerge += infoFileList[file].fileSize / sizeof(TargetKmerInfo);
     }
@@ -329,7 +327,7 @@ void FileMerger::mergeTargetFiles(const LocalParameters & par, int numOfSplits)
         }
 
         if (taxIds.size() > 1) {
-            entryInfo.sequenceID = taxonomy.LCA(taxIds)->taxId;
+            entryInfo.sequenceID = taxonomy->LCA(taxIds)->taxId;
         } else {
             entryInfo.sequenceID = taxIds[0];
         }
@@ -388,7 +386,7 @@ void FileMerger::mergeTargetFiles(const LocalParameters & par, int numOfSplits)
     cout<<"Reference DB files you need are as below"<<endl;
diff --git a/src/commons/FileMerger.h b/src/commons/FileMerger.h
@@ -4,6 +4,7 @@
 #include 
 #include "IndexCreator.h"
+#include "NcbiTaxonomy.h"
 #include "printBinary.h"
 #include "common.h"
 
@@ -14,6 +15,8 @@ using namespace std;
 class FileMerger {
 private:
+    NcbiTaxonomy * taxonomy;
+    string dbDir;
     uint64_t MARKER;
     int splitNum;
     size_t bufferSize;
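The `taxId2speciesId` map built in `mergeTargetFiles()` above drives the redundancy reduction during merging: in the sorted k-mer stream, identical k-mers whose taxa fall under one species are collapsed into a single entry (the real code labels it with the LCA of the collected taxa via `NcbiTaxonomy::LCA()`). A toy, self-contained version of that collapse, with invented IDs and the species ID standing in for the LCA:

```
// Toy sketch of the same-species k-mer collapse in mergeTargetFiles().
// IDs are invented; the real code uses NcbiTaxonomy::LCA() for the label.
#include <cstdint>
#include <cstdio>
#include <unordered_map>
#include <vector>

struct Entry { uint64_t kmer; int taxId; };

int main() {
    std::unordered_map<int, int> taxId2speciesId = {{11, 1}, {12, 1}, {21, 2}};
    std::vector<Entry> sorted = {{5, 11}, {5, 12}, {7, 21}};  // sorted by k-mer
    std::vector<Entry> merged;
    size_t i = 0;
    while (i < sorted.size()) {
        size_t j = i + 1;
        while (j < sorted.size() && sorted[j].kmer == sorted[i].kmer &&
               taxId2speciesId[sorted[j].taxId] == taxId2speciesId[sorted[i].taxId]) {
            ++j;                          // redundant k-mer within one species
        }
        int label = (j - i > 1) ? taxId2speciesId[sorted[i].taxId] : sorted[i].taxId;
        merged.push_back({sorted[i].kmer, label});
        i = j;
    }
    for (const Entry &e : merged)
        printf("%llu -> %d\n", (unsigned long long) e.kmer, e.taxId);
    return 0;
}
```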
diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp
index b7945148..f6cbb6f7 100644
--- a/src/commons/IndexCreator.cpp
+++ b/src/commons/IndexCreator.cpp
@@ -190,6 +191,7 @@ void IndexCreator::makeBlocksForParallelProcessing() {
     }
     string eachFile;
     string seqHeader;
+    string accession_version;
     unordered_map<string, int> foundAcc2taxid;
 
     for (int i = 0; i < fileNum; ++i) {
@@ -198,8 +200,8 @@ void IndexCreator::makeBlocksForParallelProcessing() {
         fastaList[i].path = eachFile;
         processedSeqCnt.push_back(taxIdList.size());
         seqHeader = getSeqSegmentsWithHead(fastaList[i].sequences, eachFile, acc2taxid, foundAcc2taxid);
-        seqHeader = seqHeader.substr(1, seqHeader.find('.') - 1);
-        TaxID speciesTaxid = taxonomy->getTaxIdAtRank(acc2taxid[seqHeader], "species");
+        accession_version = seqHeader.substr(1, LocalUtil::getFirstWhiteSpacePos(seqHeader) - 1);
+        TaxID speciesTaxid = taxonomy->getTaxIdAtRank(searchAccession2TaxID(accession_version, acc2taxid), "species");
 
         // Split current file into blocks for parallel processing
         splitFastaForProdigalTraining(i, speciesTaxid);
@@ -246,19 +248,9 @@ void IndexCreator::makeBlocksForParallelProcessing_accession_level() {
         getline(fnaListFile, eachFile);
         fastaList[i].path = eachFile;
         processedSeqCnt.push_back(taxIdList.size());
         seqHeader = getSeqSegmentsWithHead(fastaList[i].sequences, eachFile, acc2taxid, newAcc2taxid);
-        // accession_version = seqHeader.substr(1, seqHeader.find('.') - 1);
-        accession = seqHeader.substr(1, seqHeader.find('.') - 1);
         accession_version = seqHeader.substr(1, LocalUtil::getFirstWhiteSpacePos(seqHeader) - 1);
-        // newAcc2taxid.emplace_back(accession_version, make_pair(acc2taxid[accession], newTaxID));
-        tempTaxIDList.push_back(acc2taxid[accession]);
-
-        // TaxID speciesTaxid = taxonomy->getTaxIdAtRank(acc2taxid[accession], "species");
-
-        // // Split current file into blocks for parallel processing
-        // splitFastaForProdigalTraining(i, speciesTaxid);
-        // fastaList[i].speciesID = speciesTaxid;
+        tempTaxIDList.push_back(searchAccession2TaxID(accession_version, acc2taxid));
     }
 
     // Edit taxonomy dump files
@@ -284,7 +276,6 @@ void IndexCreator::makeBlocksForParallelProcessing_accession_level() {
         fprintf(acc2taxidFile, "%s\t%d\t%d\n", it.first.c_str(), it.second.first, it.second.second);
     }
     fclose(acc2taxidFile);
-
 }
 
 void IndexCreator::splitFastaForProdigalTraining(int file_idx, TaxID speciesID) {
@@ -700,19 +691,30 @@ string IndexCreator::getSeqSegmentsWithHead(vector & seqSegments, vector
     vector seqSegmentsTmp;
     string accession;
     string accession_version;
+    int taxid;
     if (seqFile.is_open()) {
         getline(seqFile, firstLine, '\n');
-        accession = firstLine.substr(1, firstLine.find('.') - 1);
         accession_version = firstLine.substr(1, LocalUtil::getFirstWhiteSpacePos(firstLine) - 1);
-        newAcc2taxid.emplace_back(accession_version, make_pair(acc2taxid.at(accession), newTaxID));
+        taxid = searchAccession2TaxID(accession_version, acc2taxid);
+        if (taxid == 0) {
+            cerr << "Cannot find accession: " << accession_version << endl;
+            cerr << "Please run 'add-to-library' first." << endl;
+            exit(1);
+        }
+        newAcc2taxid.emplace_back(accession_version, make_pair(taxid, newTaxID));
         taxIdList.push_back(newTaxID++);
         while (getline(seqFile, eachLine, '\n')) {
             if (eachLine[0] == '>') {
-                accession = eachLine.substr(1, eachLine.find('.') - 1);
                 accession_version = eachLine.substr(1, LocalUtil::getFirstWhiteSpacePos(eachLine) - 1);
-                newAcc2taxid.emplace_back(accession_version, make_pair(acc2taxid.at(accession), newTaxID));
+                taxid = searchAccession2TaxID(accession_version, acc2taxid);
+                if (taxid == 0) {
+                    cerr << "Cannot find accession: " << accession_version << endl;
+                    cerr << "Please run 'add-to-library' first." << endl;
+                    exit(1);
+                }
+                newAcc2taxid.emplace_back(accession_version, make_pair(taxid, newTaxID));
                 taxIdList.push_back(newTaxID++);
                 pos = (size_t) seqFile.tellg();
                 seqSegmentsTmp.emplace_back(start, pos - eachLine.length() - 3, pos - eachLine.length() - start - 2);
@@ -744,16 +746,32 @@ string IndexCreator::getSeqSegmentsWithHead(vector & seqSegments, vector
     vector seqSegmentsTmp;
     vector headers;
     size_t seqCnt = taxIdList.size();
+    string accession_version;
+    int taxid;
+
     if (seqFile.is_open()) {
         getline(seqFile, firstLine, '\n');
-//        cout << firstLine << endl;
-        taxIdList.push_back(acc2taxid.at(firstLine.substr(1, firstLine.find('.') - 1)));
-        foundAcc2taxid[firstLine.substr(1, firstLine.find(' ') - 1)] = taxIdList.back();
+        accession_version = firstLine.substr(1, LocalUtil::getFirstWhiteSpacePos(firstLine) - 1);
+        taxid = searchAccession2TaxID(accession_version, acc2taxid);
+        if (taxid == 0) {
+            cerr << "Cannot find accession: " << accession_version << endl;
+            cerr << "Please run 'add-to-library' first." << endl;
+            exit(1);
+        }
+        taxIdList.push_back(taxid);
+
+        foundAcc2taxid[accession_version] = taxIdList.back();
        while (getline(seqFile, eachLine, '\n')) {
             if (eachLine[0] == '>') {
-//                cout << eachLine << endl;
-                taxIdList.push_back(acc2taxid.at(eachLine.substr(1, eachLine.find('.') - 1)));
-                foundAcc2taxid[eachLine.substr(1, eachLine.find(' ') - 1)] = taxIdList.back();
+                accession_version = eachLine.substr(1, LocalUtil::getFirstWhiteSpacePos(eachLine) - 1);
+                taxid = searchAccession2TaxID(accession_version, acc2taxid);
+                if (taxid == 0) {
+                    cerr << "Cannot find accession: " << accession_version << endl;
+                    cerr << "Please run 'add-to-library' first." << endl;
+                    exit(1);
+                }
+                taxIdList.push_back(taxid);
+                foundAcc2taxid[accession_version] = taxIdList.back();
                 pos = (size_t) seqFile.tellg();
                 seqSegmentsTmp.emplace_back(start, pos - eachLine.length() - 3, pos - eachLine.length() - start - 2);
                 start = pos - eachLine.length() - 1;
diff --git a/src/commons/common.cpp b/src/commons/common.cpp
index c1ebff2a..01334b75 100644
--- a/src/commons/common.cpp
+++ b/src/commons/common.cpp
@@ -115,4 +115,42 @@ int loadDbParameters(LocalParameters &par) {
         }
     }
     return 0;
-}
\ No newline at end of file
+}
+
+int searchAccession2TaxID(const std::string &name,
+                          const std::unordered_map<std::string, int> &acc2taxid) {
+    if (acc2taxid.find(name) != acc2taxid.end()) {
+        return acc2taxid.at(name);
+    }
+
+    // Cannot find it with the version --> remove the version number
+    size_t pos = name.find('.');
+    if (pos != std::string::npos) {
+        std::string nameWithoutVersion = name.substr(0, pos);
+        if (acc2taxid.find(nameWithoutVersion) != acc2taxid.end()) {
+            return acc2taxid.at(nameWithoutVersion);
+        }
+    }
+
+    // With a prefix? Ex) NZ_CP083375.1
+    pos = name.find('_');
+    std::string nameWithoutPrefix;
+    if (pos != std::string::npos) {
+        // Try without the prefix
+        nameWithoutPrefix = name.substr(pos + 1); // CP083375.1
+        if (acc2taxid.find(nameWithoutPrefix) != acc2taxid.end()) {
+            return acc2taxid.at(nameWithoutPrefix);
+        }
+
+        // Remove the version
+        pos = nameWithoutPrefix.find('.');
+        if (pos != std::string::npos) {
+            nameWithoutPrefix = nameWithoutPrefix.substr(0, pos); // CP083375
+            if (acc2taxid.find(nameWithoutPrefix) != acc2taxid.end()) {
+                return acc2taxid.at(nameWithoutPrefix);
+            }
+        }
+    }
+
+    return 0;
+}
diff --git a/src/commons/common.h b/src/commons/common.h
index 7749c39a..493f64cd 100644
--- a/src/commons/common.h
+++ b/src/commons/common.h
@@ -86,4 +86,6 @@ NcbiTaxonomy * loadTaxonomy(const std::string & dbDir, const std::string & taxon
 int loadDbParameters(LocalParameters & par);
 
+int searchAccession2TaxID(const std::string & name, const std::unordered_map<std::string, int> & acc2taxid);
+
 #endif //ADCLASSIFIER2_COMMON_H
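`searchAccession2TaxID()` above tries four lookups in order: the exact accession, the version-stripped form, the prefix-stripped form, and the prefix-and-version-stripped form (`NZ_CP083375.1` → `NZ_CP083375` → `CP083375.1` → `CP083375`), returning 0 when all fail. A usage sketch, assuming it is linked against `common.cpp`; the map contents are invented:

```
// Usage sketch for searchAccession2TaxID() (declared in common.h).
#include <cstdio>
#include <string>
#include <unordered_map>

int searchAccession2TaxID(const std::string &name,
                          const std::unordered_map<std::string, int> &acc2taxid);

int main() {
    std::unordered_map<std::string, int> acc2taxid = {{"CP083375", 562}};
    // Falls through exact, "NZ_CP083375", and "CP083375.1" before matching "CP083375".
    printf("%d\n", searchAccession2TaxID("NZ_CP083375.1", acc2taxid));  // 562
    printf("%d\n", searchAccession2TaxID("XX_000001.9", acc2taxid));    // 0: not found
    return 0;
}
```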
(2016)", "https://www.ncbi.nlm.nih.gov/refseq/", true, LocalParameters::DBTYPE_INDEX_DB, metabulidatabases_sh, metabulidatabases_sh_len, diff --git a/src/workflow/add_to_library.cpp b/src/workflow/add_to_library.cpp index ee19855c..69bedded 100644 --- a/src/workflow/add_to_library.cpp +++ b/src/workflow/add_to_library.cpp @@ -77,26 +77,33 @@ int addToLibrary(int argc, const char **argv, const Command &command){ while (kseq->ReadEntry()) { const KSeqWrapper::KSeqEntry & e = kseq->entry; - // Extract accession and Remove the version number - string accession = string(e.name.s); - size_t pos = accession.find('.'); - if (pos != string::npos) { accession = accession.substr(0, pos); } - - // Skip if accession is not in the mapping file - if (acc2taxid.find(accession) == acc2taxid.end()) { - cout << "During processing " << fileNames[i] << ", accession " << accession << + int taxID = searchAccession2TaxID(e.name.s, acc2taxid); + if (taxID == 0) { + cout << "During processing " << fileNames[i] << ", accession " << e.name.s << " is not found in the mapping file. It is skipped." << endl; - unmapped.push_back(accession); + unmapped.push_back(e.name.s); continue; } + // string accession = string(e.name.s); + // size_t pos = accession.find('.'); + // if (pos != string::npos) { accession = accession.substr(0, pos); } + + // // Skip if accession is not in the mapping file + // if (acc2taxid.find(accession) == acc2taxid.end()) { + // cout << "During processing " << fileNames[i] << ", accession " << accession << + // " is not found in the mapping file. It is skipped." << endl; + // unmapped.push_back(accession); + // continue; + // } // Get species taxID - int speciesTaxID = taxonomy->getTaxIdAtRank(acc2taxid[accession], "species"); + int speciesTaxID = taxonomy->getTaxIdAtRank(taxID, "species"); // Skip if species taxID is not found if (speciesTaxID == 0) { - cout << "During processing " << fileNames[i] << ", accession " << accession << + cout << "During processing " << fileNames[i] << ", accession " << e.name.s << " is not matched to any species. It is skipped." << endl; + unmapped.push_back(e.name.s); continue; } From c2d805633d198069d99693eafcdcdf42cd0732fc Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Thu, 14 Sep 2023 00:21:30 +0900 Subject: [PATCH 33/65] 1) edit readme 2) remove some printed logs 3) fix the problme of empty 'my.accession2taxid' --- README.md | 19 +++++++++++++++---- lib/prodigal/gene.cpp | 2 +- src/commons/IndexCreator.cpp | 2 +- src/commons/LocalParameters.cpp | 2 +- src/workflow/add_to_library.cpp | 4 ++++ 5 files changed, 22 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index c240cc0b..d7cb466c 100644 --- a/README.md +++ b/README.md @@ -164,7 +164,9 @@ The steps for building a database with NCBI or GTDB taxonomy are described below metabuli add-to-library - FASTA list: A file containing absolute paths of each FASTA file. - accession2taxid: A path to NCBI-style accession2taxid. -- DBDIR: Sequences will be stored in 'DBDIR/library'. +- DBDIR: Sequences will be stored in 'DBDIR/library'. + +** When resume is needed, remove the files in DBDIR/library and run the command again. ``` It groups your sequences into separate files according to their species. Accessions that are not included in the `` will be skipped and listed in `unmapped.txt`. @@ -191,17 +193,24 @@ This will generate **diffIdx**, **info**, **split**, and **taxID_list** and some ### To build a database with GTDB taxonomy #### 1. 
Prepare GTDB taxonomy and accession2taxid -*Requirements*: You need assembly FASTA files whose file name (or path) includes the assembly accession. +*Requirements*: +You need assembly FASTA files whose file name (or path) includes the assembly accession. If you downloaded assemblies using `ncbi-genome-download`, you probably don't have to care about it. The regular expression of assembly accessions is (GC[AF]_[0-9].[0-9]) ``` # 1. -In the 'util' directory +# 1-1. Move to the 'util' directory +cd METABULI_DIR/util + +# 1-2. Run prepare_gtdb_taxonomy.sh ./prepare_gtdb_taxonomy.sh - DBDIR : Result files are stored in 'DBDIR/taxonomy'. + +** Please clone Metabuli's repository to use this script. +** It is not provided in the precompiled binaries or bioconda package. ``` -This will generate taxonomy dump files and `assacc_to_taxid.tsv` with other files. +In `DBDIR/taxonomy`, it will generate taxonomy `dmp` files and `assacc_to_taxid.tsv` with other files. ``` # 2. @@ -210,6 +219,8 @@ metabuli add-to-library --assembly true Each path must include a corresponding assembly accession. - accession2taxid : 'assacc_to_taxid.tsv' from the previous step - DBDIR : The same DBDIR from the previous step. + +** When resume is needed, remove the files in DBDIR/library and run the command again. ``` This will add your FASTA files to DBDIR/library according to their species taxonomy ID and generate 'my.accession2taxid' diff --git a/lib/prodigal/gene.cpp b/lib/prodigal/gene.cpp index 3cbd7f01..40debaf1 100644 --- a/lib/prodigal/gene.cpp +++ b/lib/prodigal/gene.cpp @@ -51,7 +51,7 @@ int add_genes(struct _gene *glist, struct _node *nod, int dbeg) { } path = nod[path].tracef; if(ctr == MAX_GENES) { - fprintf(stderr, "warning, max # of genes exceeded, truncating...\n"); + // fprintf(stderr, "warning, max # of genes exceeded, truncating...\n"); return ctr; } } diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp index f6cbb6f7..d3cb7db4 100644 --- a/src/commons/IndexCreator.cpp +++ b/src/commons/IndexCreator.cpp @@ -1004,7 +1004,7 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer, munmap(fastaFile.data, fastaFile.fileSize + 1); } else { // Withdraw the reservation if the buffer is full. - cout << "Buffer is full. Withdraw the reservation." << endl; + // cout << "Buffer is full. Withdraw the reservation." 
<< endl; checker[i] = false; __sync_fetch_and_add(&hasOverflow, 1); __sync_fetch_and_sub(&kmerBuffer.startIndexOfReserve, estimatedKmerCnt); diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index bf4a20f5..9c79fdea 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -294,7 +294,7 @@ LocalParameters::LocalParameters() : // classify.push_back(&MIN_COVERED_POS); // classify.push_back(&PRINT_LOG); // classify.push_back(&MAX_GAP); -// classify.push_back(&TAXONOMY_PATH); + classify.push_back(&TAXONOMY_PATH); classify.push_back(&PARAM_MASK_RESIDUES); classify.push_back(&PARAM_MASK_PROBABILTY); classify.push_back(&RAM_USAGE); diff --git a/src/workflow/add_to_library.cpp b/src/workflow/add_to_library.cpp index 69bedded..e6b5c9e0 100644 --- a/src/workflow/add_to_library.cpp +++ b/src/workflow/add_to_library.cpp @@ -174,6 +174,10 @@ int addToLibrary(int argc, const char **argv, const Command &command){ KSeqWrapper* kseq = KSeqFactory(fileNames[i].c_str()); while (kseq->ReadEntry()){ const KSeqWrapper::KSeqEntry & e = kseq->entry; + // Extract accession + string accession = string(e.name.s); + acc2taxid[accession] = assembly2taxid[assemblyID]; + // Write to file FILE *file = fopen((dbDir + "/library/" + to_string(speciesTaxID) + ".fna").c_str(), "a"); fprintf(file, ">%s %s\n", e.name.s, e.comment.s); From ecbc9aa587cdf11f063156e557d23110aabb922a Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Mon, 25 Sep 2023 15:06:27 +0900 Subject: [PATCH 34/65] fix errors in 1) query indexing 2) grade --- src/commons/Classifier.cpp | 18 --------------- src/commons/FileMerger.cpp | 2 ++ src/commons/IndexCreator.cpp | 7 +++--- src/commons/KmerExtractor.cpp | 2 +- src/commons/KmerMatcher.cpp | 7 +++++- src/commons/QueryIndexer.cpp | 26 +++++++++++++--------- src/commons/SeqIterator.cpp | 1 + src/commons/Taxonomer.cpp | 20 ++++++----------- src/commons/common.cpp | 6 ----- src/util/grade.cpp | 42 +++++++++++------------------------ 10 files changed, 50 insertions(+), 81 deletions(-) diff --git a/src/commons/Classifier.cpp b/src/commons/Classifier.cpp index 3384ed88..58130092 100644 --- a/src/commons/Classifier.cpp +++ b/src/commons/Classifier.cpp @@ -98,27 +98,10 @@ void Classifier::startClassify(const LocalParameters &par) { kseq2); numOfTatalQueryKmerCnt += kmerBuffer.startIndexOfReserve; -//#ifdef OPENMP -// if (par.printLog == 1) { -// omp_set_num_threads(1); -// } else { -// omp_set_num_threads(par.threads); -// } -//#endif - // Search matches between query and target k-mers kmerMatcher->matchKmers(&kmerBuffer, &matchBuffer); kmerMatcher->sortMatches(&matchBuffer); - -//#ifdef OPENMP -// if (par.printLog == 1) { -// omp_set_num_threads(1); -// } else { -// omp_set_num_threads(par.threads); -// } -//#endif - // Classify queries based on the matches taxonomer->assignTaxonomy(matchBuffer.buffer, matchBuffer.startIndexOfReserve, queryList, par); processedSeqCnt += queryReadSplit[splitIdx].end - queryReadSplit[splitIdx].start; @@ -146,5 +129,4 @@ void Classifier::startClassify(const LocalParameters &par) { free(matchBuffer.buffer); delete kseq1; delete kseq2; - } diff --git a/src/commons/FileMerger.cpp b/src/commons/FileMerger.cpp index 91d55bf6..291c1316 100644 --- a/src/commons/FileMerger.cpp +++ b/src/commons/FileMerger.cpp @@ -305,6 +305,8 @@ void FileMerger::mergeTargetFiles(const LocalParameters & par, int numOfSplits) int hasSeenOtherStrains = 0; taxIds.clear(); taxIds.push_back(entryInfo.sequenceID); // Wrong + + // Scan redundant k-mers 
diff --git a/src/commons/FileMerger.cpp b/src/commons/FileMerger.cpp
index 91d55bf6..291c1316 100644
--- a/src/commons/FileMerger.cpp
+++ b/src/commons/FileMerger.cpp
@@ -305,6 +305,8 @@ void FileMerger::mergeTargetFiles(const LocalParameters & par, int numOfSplits)
         int hasSeenOtherStrains = 0;
         taxIds.clear();
         taxIds.push_back(entryInfo.sequenceID); // Wrong
+
+        // Scan redundant k-mers
         while(taxId2speciesId[entryInfo.sequenceID] == taxId2speciesId[lookingInfos[idxOfMin].sequenceID]){
             if(entryKmer != lookingKmers[idxOfMin]) break;
diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp
index d3cb7db4..a08283c4 100644
--- a/src/commons/IndexCreator.cpp
+++ b/src/commons/IndexCreator.cpp
@@ -131,7 +131,6 @@ void IndexCreator::updateIndex(const LocalParameters &par) {
 
     // Train Prodigal for each species
     time_t prodigalStart = time(nullptr);
-    // trainProdigal();
     time_t prodigalEnd = time(nullptr);
     cout << "Prodigal training time: " << prodigalEnd - prodigalStart << " seconds" << endl;
 
@@ -521,6 +520,8 @@ void IndexCreator::reduceRedundancy(TargetKmerBuffer & kmerBuffer, size_t * uniq
             hasSeenOtherStrains = 0;
             taxIds.clear();
             taxIds.push_back(taxIdList[lookingKmer->info.sequenceID]);
+
+            // Scan redundancy
             while(lookingKmer->taxIdAtRank == kmerBuffer.buffer[i].taxIdAtRank){
                 if (lookingKmer->ADkmer != kmerBuffer.buffer[i].ADkmer) {
                     break;
@@ -882,7 +883,7 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer,
                     seq = kseq_init(&buffer);
                     kseq_read(seq);
                     lengthOfTrainingSeq = seq->seq.l;
-                    cout << "T: " << seq->name.s << " " << lengthOfTrainingSeq << " " << estimatedKmerCnt << endl;
+                    // cout << "T: " << seq->name.s << " " << lengthOfTrainingSeq << " " << estimatedKmerCnt << endl;
 
                     // Train prodigal
                     prodigal->is_meta = 0;
@@ -1009,7 +1010,7 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer,
                 __sync_fetch_and_add(&hasOverflow, 1);
                 __sync_fetch_and_sub(&kmerBuffer.startIndexOfReserve, estimatedKmerCnt);
             }
-            cout << totalLength << " " << prodigal->fng << endl;
+            // cout << totalLength << " " << prodigal->fng << endl;
             delete prodigal;
         }
diff --git a/src/commons/KmerExtractor.cpp b/src/commons/KmerExtractor.cpp
index 91f5ee8f..4d656cdf 100644
--- a/src/commons/KmerExtractor.cpp
+++ b/src/commons/KmerExtractor.cpp
@@ -86,7 +86,7 @@ void KmerExtractor::fillQueryKmerBufferParallel(KSeqWrapper *kseq1,
             size_t queryIdx = processedQueryNum - currentQueryNum + i;
             // Get k-mer count
             auto kmerCnt = LocalUtil::getQueryKmerNumber(reads1[i].length(), spaceNum);
-
+            // Ignore short reads
             if (kmerCnt < 1) { continue; }
diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp
index 9455070e..646bd1cf 100644
--- a/src/commons/KmerMatcher.cpp
+++ b/src/commons/KmerMatcher.cpp
@@ -178,7 +178,8 @@ int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer,
         // Divide query k-mers into blocks
         size_t splitWidth = queryKmerNum / (threads - 1);
         querySplits.emplace_back(0, splitWidth - 1, splitWidth, diffIdxSplits.data[0]);
-        for (size_t i = 1; i < threads; i++) {
+        size_t i = 1;
+        for (; (i < threads) && (splitWidth * i < queryKmerNum); i++) {
             queryAA = AminoAcidPart(queryKmerList[splitWidth * i].ADkmer);
             bool needLastTargetBlock = true;
             for (size_t j = 0; j < numOfDiffIdxSplits_use; j++) {
@@ -205,6 +206,10 @@ int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer,
                 }
             }
         }
+
+        if (i != threads) {
+            threads = querySplits.size();
+        }
     }
 
     bool *splitCheckList = (bool *) malloc(sizeof(bool) * threads);
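The QueryIndexer hunk that follows closes a split at the read *before* the one that would overflow the RAM budget, using the rule `bytesPerKmer * kmerCnt + 200 * seqCnt > availableRam`. A toy run of that rule with invented constants (Metabuli's actual `bytesPerKmer` and budget differ):

```
// Toy run of the QueryIndexer split rule; constants are illustrative only.
#include <cstdio>
#include <vector>

int main() {
    const size_t bytesPerKmer = 16, perReadOverhead = 200, availableRam = 4000;
    std::vector<size_t> readKmerCnts = {50, 60, 80, 70, 90};  // k-mers per read
    size_t start = 0, kmerCnt = 0, seqCnt = 0;
    for (size_t i = 0; i < readKmerCnts.size(); ++i) {
        kmerCnt += readKmerCnts[i];
        ++seqCnt;
        if (bytesPerKmer * kmerCnt + perReadOverhead * seqCnt > availableRam) {
            printf("split: reads [%zu, %zu)\n", start, i);  // close before read i
            start = i;
            kmerCnt = readKmerCnts[i];                      // read i starts the next split
            seqCnt = 1;
        }
    }
    printf("split: reads [%zu, %zu)\n", start, readKmerCnts.size());
    return 0;   // prints "split: reads [0, 3)" and "split: reads [3, 5)"
}
```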
kmerCnt = 0; size_t seqCnt = 0; size_t start = 0; while (kseq->ReadEntry()) { readNum_1++; - const KSeqWrapper::KSeqEntry &e = kseq->entry; - totalReadLength += e.sequence.l; - size_t currentKmerCnt = LocalUtil::getQueryKmerNumber(e.sequence.l, spaceNum); - kmerCnt += currentKmerCnt; seqCnt++; + totalReadLength += kseq->entry.sequence.l; + size_t currentKmerCnt = LocalUtil::getQueryKmerNumber(kseq->entry.sequence.l, spaceNum); + kmerCnt += currentKmerCnt; + // std::cout << "currentKmerCnt: " << kmerCnt << "\n"; + if (bytesPerKmer * kmerCnt + ((size_t) 200 * seqCnt) > availableRam) { - querySplits.emplace_back(start, readNum_1, kmerCnt - currentKmerCnt); + querySplits.emplace_back(start, readNum_1 - 1, kmerCnt - currentKmerCnt); kmerCnt = currentKmerCnt; - start = readNum_1; + start = readNum_1 - 1; seqCnt = 1; } } querySplits.emplace_back(start, readNum_1, kmerCnt); + // Print elements + for (auto & querySplit : querySplits) { + std::cout << "start: " << querySplit.start << "\t"; + std::cout << "end: " << querySplit.end << "\t"; + std::cout << "kmerCnt: " << querySplit.kmerCnt << "\n"; + } delete kseq; } else { KSeqWrapper* kseq_1 = KSeqFactory(queryPath_1.c_str()); @@ -87,9 +93,9 @@ void QueryIndexer::indexQueryFile() { } if (bytesPerKmer * kmerCnt + ((size_t) 200 * seqCnt_1) > availableRam) { - querySplits.emplace_back(start, readNum_1, kmerCnt - currentKmerCnt); + querySplits.emplace_back(start, readNum_1 - 1, kmerCnt - currentKmerCnt); kmerCnt = currentKmerCnt; - start = readNum_1; + start = readNum_1 - 1; seqCnt_1 = 1; } diff --git a/src/commons/SeqIterator.cpp b/src/commons/SeqIterator.cpp index 525ac651..55255342 100644 --- a/src/commons/SeqIterator.cpp +++ b/src/commons/SeqIterator.cpp @@ -355,6 +355,7 @@ void SeqIterator::fillQueryKmerBuffer(const char *seq, int seqLen, QueryKmerBuff posToWrite++; } } + // cout << "posToWrite: " << posToWrite << endl; } void diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index 663ebba4..adf2dbdd 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -75,13 +75,11 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, vector & queryList, const LocalParameters &par) { TaxID selectedTaxon; -// if (par.printLog) { + +// if (true) { // cout << "# " << currentQuery << " " << queryList[currentQuery].name << endl; // for (size_t i = offset; i < end + 1; i++) { -// cout << taxId2genusId[matchList[i].targetId] << " " << taxId2speciesId[matchList[i].targetId] << -// " " << matchList[i].targetId << " " << matchList[i].qInfo.frame << " "; -// print_binary16(16, matchList[i].rightEndHamming); -// cout << " " << matchList[i].qInfo.pos << " " << int(matchList[i].hamming) << " " << int(matchList[i].redundancy) << endl; +// cout << matchList[i].targetId << " " << matchList[i].qInfo.frame << " " << matchList[i].qInfo.pos << " " << int(matchList[i].hamming) << " " << int(matchList[i].redundancy) << endl; // } // } @@ -111,14 +109,11 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, } } -// if (par.printLog) { +// if (true) { // cout << "# " << currentQuery << " " << queryList[currentQuery].name << " filtered\n"; // for (size_t i = 0; i < genusMatches.size(); i++) { -// cout << taxId2genusId[genusMatches[i].targetId] << " " << taxId2speciesId[genusMatches[i].targetId] << -// " " << genusMatches[i].targetId << " " << genusMatches[i].qInfo.frame << " "; -// print_binary16(16, genusMatches[i].rightEndHamming); -// cout << " " << genusMatches[i].qInfo.pos << " " << int(genusMatches[i].hamming) << " " << 
int(genusMatches[i].redundancy) << endl; -// } +// cout << genusMatches[i].targetId << " " << genusMatches[i].qInfo.frame << " " << genusMatches[i].qInfo.pos << " " << int(genusMatches[i].hamming) << " " << int(genusMatches[i].redundancy) << endl; +// } // cout << "Genus score: " << genusScore.score << "\n"; // } @@ -481,7 +476,6 @@ void Taxonomer::remainConsecutiveMatches(vector & curFrameMatches for (const auto& entry : linkedMatches) { if (!used.count(entry.first)) { used.insert(entry.first); - vector curMatches; DFS(entry.first, linkedMatches, filteredMatchIdx, 0, MIN_DEPTH, used, idx2depth); } } @@ -506,7 +500,7 @@ size_t Taxonomer::DFS(size_t curMatchIdx, const map> & li depth++; size_t maxDepth = 0; size_t returnDepth = 0; - if (linkedMatches.find(curMatchIdx) == linkedMatches.end()) { //|| linkedMatches.at(curMatchIdx).empty()) { + if (linkedMatches.find(curMatchIdx) == linkedMatches.end()) { // reached a leaf node idx2depth[curMatchIdx] = depth; if (depth > MIN_DEPTH) { diff --git a/src/commons/common.cpp b/src/commons/common.cpp index 01334b75..edba4fe0 100644 --- a/src/commons/common.cpp +++ b/src/commons/common.cpp @@ -4,16 +4,11 @@ #include #include #include -// #include "MathUtil.h" #include "Debug.h" #include "Reporter.h" #include "Util.h" #include "sys/mman.h" -// #include -// #include -// #include - void process_mem_usage(double &vm_usage, double &resident_set) { vm_usage = 0.0; resident_set = 0.0; @@ -68,7 +63,6 @@ NcbiTaxonomy *loadTaxonomy(const std::string &dbDir, taxonomyDir + "/nodes.dmp", taxonomyDir + "/merged.dmp"); } - return new NcbiTaxonomy(dbDir + "/taxonomy/names.dmp", dbDir + "/taxonomy/nodes.dmp", dbDir + "/taxonomy/merged.dmp"); diff --git a/src/util/grade.cpp b/src/util/grade.cpp index e009a361..79dbaaa1 100644 --- a/src/util/grade.cpp +++ b/src/util/grade.cpp @@ -121,7 +121,7 @@ int grade(int argc, const char **argv, const Command &command) { } cout << "Classification results loaded" << endl; - size_t numberOfFiles = mappingFileNames.size(); + size_t numberOfFiles = readClassificationFileNames.size(); vector results; results.resize(numberOfFiles); @@ -172,37 +172,21 @@ ncbiTaxonomy, par, cout, printColumnsIdx, cerr) mappingFile = mappingFileNames[i]; readClassificationFileName = readClassificationFileNames[i]; - if (par.testType == "cami-long"){ - // Load mapping file - ifstream mappingFileFile; - mappingFileFile.open(mappingFile); - string line; - if (mappingFileFile.is_open()) { - getline(mappingFileFile, line); - while (getline(mappingFileFile, line)) { - vector splitLine = Util::split(line, "\t"); - assacc2taxid[splitLine[0]] = stoi(splitLine[2]); - } - } else { - cerr << "Cannot open file for answer" << endl; + // Load the mapping file (answer sheet) (accession to taxID) + string key, value; + ifstream map; + map.open(mappingFile); + size_t numberOfAnswers = 0; + if (map.is_open()) { + while (getline(map, key, '\t')) { + getline(map, value, '\n'); + assacc2taxid[key] = stoi(value); + numberOfAnswers++; } } else { - // Load the mapping file (answer sheet) (accession to taxID) - string key, value; - ifstream map; - map.open(mappingFile); - size_t numberOfAnswers = 0; - if (map.is_open()) { - while (getline(map, key, '\t')) { - getline(map, value, '\n'); - assacc2taxid[key] = stoi(value); - numberOfAnswers++; - } - } else { - cout << "Cannot open file for answer" << endl; - } - map.close(); + cout << "Cannot open file for answer" << endl; } + map.close(); // Load classification results string resultLine; From 04d20aad60616405ab8c04d70881c26524058d1b 
Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Wed, 4 Oct 2023 17:07:07 +0900 Subject: [PATCH 35/65] fix unintended initialization of prodigal results --- src/commons/ProdigalWrapper.cpp | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/commons/ProdigalWrapper.cpp b/src/commons/ProdigalWrapper.cpp index 6c27f516..9e9c6aec 100644 --- a/src/commons/ProdigalWrapper.cpp +++ b/src/commons/ProdigalWrapper.cpp @@ -51,6 +51,12 @@ ProdigalWrapper::ProdigalWrapper() { void ProdigalWrapper:: trainASpecies(char * genome){ + // Initialize memories to reuse them + memset(seq, 0, (slen / 4 + 1) * sizeof(unsigned char)); + memset(rseq, 0, (slen / 4 + 1) * sizeof(unsigned char)); + memset(useq, 0, (slen / 8 + 1) * sizeof(unsigned char)); + memset(nodes, 0, nn * sizeof(struct _node)); + nn = 0; slen = 0; ipath = 0; nmask = 0; // Initialize training information memset(mlist, 0, MAX_MASKS*sizeof(mask)); @@ -116,17 +122,17 @@ trainASpecies(char * genome){ train_starts_sd(seq, rseq, slen, nodes, nn, &tinf); determine_sd_usage(&tinf); if(force_nonsd == 1) tinf.uses_sd = 0; - if(tinf.uses_sd == 0) train_starts_nonsd(seq, rseq, slen, nodes, nn, &tinf); + if(tinf.uses_sd == 0) train_starts_nonsd(seq, rseq, slen, nodes, nn, &tinf); +} +void ProdigalWrapper::trainMeta(char *genome) { // Initialize memories to reuse them memset(seq, 0, (slen / 4 + 1) * sizeof(unsigned char)); memset(rseq, 0, (slen / 4 + 1) * sizeof(unsigned char)); memset(useq, 0, (slen / 8 + 1) * sizeof(unsigned char)); memset(nodes, 0, nn * sizeof(struct _node)); nn = 0; slen = 0; ipath = 0; nmask = 0; -} -void ProdigalWrapper::trainMeta(char *genome) { // Initialize training information memset(&tinf, 0, sizeof(struct _training)); tinf.st_wt = 4.35; @@ -173,15 +179,16 @@ void ProdigalWrapper::trainMeta(char *genome) { max_score = nodes[ipath].score; } } +} +void ProdigalWrapper::getPredictedGenes(char * genome) { // Initialize memories to reuse them + // Initialization should be done here not at the end of the function memset(seq, 0, (slen / 4 + 1) * sizeof(unsigned char)); memset(rseq, 0, (slen / 4 + 1) * sizeof(unsigned char)); memset(useq, 0, (slen / 8 + 1) * sizeof(unsigned char)); - memset(nodes, 0, nn * sizeof(struct _node)); - nn = 0; slen = 0; ipath = 0; nmask = 0; -} -void ProdigalWrapper::getPredictedGenes(char * genome){ + memset(nodes, 0, nn*sizeof(struct _node)); + nn = 0; slen = 0; nmask = 0; ipath=0; /* Initialize structure */ slen = getNextSeq(genome, 0); @@ -241,13 +248,6 @@ void ProdigalWrapper::getPredictedGenes(char * genome){ tweak_final_starts(genes, ng, nodes, nn, meta[max_phase].tinf); record_gene_data(genes, ng, nodes, meta[max_phase].tinf, num_seq); } - - // Initialize memories to reuse them - memset(seq, 0, (slen / 4 + 1) * sizeof(unsigned char)); - memset(rseq, 0, (slen / 4 + 1) * sizeof(unsigned char)); - memset(useq, 0, (slen / 8 + 1) * sizeof(unsigned char)); - memset(nodes, 0, nn*sizeof(struct _node)); - nn = 0; slen = 0; nmask = 0; ipath=0; } int ProdigalWrapper::getNextSeq(char * line, int training) { From 907796dbec2eeb9226b71b88d7d4f958fda96fe2 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Wed, 4 Oct 2023 17:07:33 +0900 Subject: [PATCH 36/65] remove codes used for debugging --- src/commons/SeqIterator.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/commons/SeqIterator.cpp b/src/commons/SeqIterator.cpp index 55255342..d9677a93 100644 --- a/src/commons/SeqIterator.cpp +++ b/src/commons/SeqIterator.cpp @@ -446,10 +446,10 @@ 
SeqIterator::fillBufferWithKmerFromBlock(const PredictedBlock &block, const char kmerBuffer.buffer[posToWrite] = {UINT64_MAX, -1, 0, false}; } else { addDNAInfo_TargetKmer(tempKmer, seq, block, kmerCnt); - if(posToWrite >= kmerBuffer.bufferSize - 2) { - cout << "HERE " << posToWrite << endl; - return -1; - } + // if(posToWrite >= kmerBuffer.bufferSize - 2) { + // cout << "HERE " << posToWrite << endl; + // return -1; + // } kmerBuffer.buffer[posToWrite] = {tempKmer, taxIdAtRank, seqID, false}; } posToWrite++; From 8468627b7880057c3e0bcf8eae8505ae90942475 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Tue, 10 Oct 2023 14:23:36 +0900 Subject: [PATCH 37/65] mapping2taxon for metamaps without EM --- src/LocalCommandDeclarations.h | 1 + src/commons/LocalParameters.h | 2 +- src/metabuli.cpp | 10 +++++++++- src/util/CMakeLists.txt | 1 + 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/LocalCommandDeclarations.h b/src/LocalCommandDeclarations.h index 000aa648..56b470f5 100644 --- a/src/LocalCommandDeclarations.h +++ b/src/LocalCommandDeclarations.h @@ -13,5 +13,6 @@ extern int applyThreshold(int argc, const char **argv, const Command& command); extern int binning2report(int argc, const char **argv, const Command& command); extern int filterByGenus(int argc, const char **argv, const Command& command); extern int databaseReport(int argc, const char **argv, const Command& command); +extern int mapping2taxon(int argc, const char **argv, const Command& command); #endif //ADCLASSIFIER2_LOCALCOMMANDDECLARATIONS_H diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index ccbd03c6..0a8be0c7 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -31,7 +31,7 @@ class LocalParameters : public Parameters { std::vector binning2report; std::vector filterByGenus; std::vector databaseReport; - + std::vector mapping2taxon; // Superkingdom taxonomy id PARAMETER(VIRUS_TAX_ID) diff --git a/src/metabuli.cpp b/src/metabuli.cpp index 40a8f2c3..82f42a76 100644 --- a/src/metabuli.cpp +++ b/src/metabuli.cpp @@ -129,7 +129,15 @@ std::vector commands = { CITATION_SPACEPHARER, {{"Binning Result", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}, {"Genus list", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}, - {"TAXONOMY DIR", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}}} + {"TAXONOMY DIR", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}}}, + {"mapping2taxon", mapping2taxon, &localPar.mapping2taxon, COMMAND_EXPERT, + "It takes a mapping file (multiple targets for each read) and generates a read2taxon file (one target for each read)", + nullptr, + "Jaebeom Kim ", + " ", + CITATION_SPACEPHARER, + {{"mapping file", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}, + {"taxonomy directory", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}}} }; std::vector externalThreshold = {}; diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index 6a68f515..89e7f0f9 100644 --- a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -6,4 +6,5 @@ set(util_source_files util/report.cpp util/grade.cpp util/database-report.cpp + util/mapping2taxon.cpp PARENT_SCOPE) \ No newline at end of file From 9c62d6c505084527b654eaae58609b2594314d99 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Thu, 12 Oct 2023 00:16:20 +0900 Subject: [PATCH 38/65] new parameter: minimum number of ssp. 
specific matches for lower-rank classification --- src/commons/LocalParameters.cpp | 8 ++++++++ src/commons/LocalParameters.h | 2 ++ src/commons/Taxonomer.cpp | 3 ++- src/commons/Taxonomer.h | 1 + src/workflow/classify.cpp | 1 + 5 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index 9c79fdea..dc9ffe04 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -143,6 +143,13 @@ LocalParameters::LocalParameters() : typeid(int), (void *) &matchPerKmer, "^[0-9]+$"), + MIN_SS_MATCH(MIN_SS_MATCH_ID, + "--min-ss-match", + "Min. num. of ssp.-specific matches for ssp. classification", + "Min. number of ssp.-specific matches for ssp. classification", + typeid(int), + (void *) &minSSMatch, + "^[0-9]+$"), LIBRARY_PATH(LIBRARY_PATH_ID, "--library-path", "Path to library where the FASTA files are stored", @@ -300,6 +307,7 @@ LocalParameters::LocalParameters() : classify.push_back(&RAM_USAGE); classify.push_back(&MATCH_PER_KMER); classify.push_back(&ACCESSION_LEVEL); + classify.push_back(&MIN_SS_MATCH); // filter filter.push_back(&PARAM_THREADS); diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index 0a8be0c7..858d73a6 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -55,6 +55,7 @@ class LocalParameters : public Parameters { PARAMETER(MIN_CONS_CNT) PARAMETER(MIN_CONS_CNT_EUK) PARAMETER(MATCH_PER_KMER) + PARAMETER(MIN_SS_MATCH) // DB build parameters PARAMETER(LIBRARY_PATH) @@ -101,6 +102,7 @@ class LocalParameters : public Parameters { int maxGap; int minConsCntEuk; int matchPerKmer; + int minSSMatch; // Database creation std::string tinfoPath; diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index adf2dbdd..0f97a3dc 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -18,6 +18,7 @@ Taxonomer::Taxonomer(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxon maxGap = par.maxGap; minCoveredPos = par.minCoveredPos; accessionLevel = par.accessionLevel; + minSSMatch = par.minSSMatch; } Taxonomer::~Taxonomer() { @@ -308,7 +309,7 @@ TaxID Taxonomer::BFS(const unordered_map & cladeCnt, TaxID r if (cladeCnt.at(root).children.empty()) { // root is a leaf return root; } - unsigned int maxCnt = 3; + unsigned int maxCnt = minSSMatch; unsigned int currentCnt; vector bestChildren; for (auto it = cladeCnt.at(root).children.begin(); it != cladeCnt.at(root).children.end(); it++) { diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h index 6c597dd2..45687923 100644 --- a/src/commons/Taxonomer.h +++ b/src/commons/Taxonomer.h @@ -31,6 +31,7 @@ class Taxonomer { int maxGap; int minCoveredPos; int accessionLevel; + int minSSMatch; struct MatchBlock { MatchBlock(size_t start, size_t end, int id) : start(start), end(end), id(id) {} diff --git a/src/workflow/classify.cpp b/src/workflow/classify.cpp index 460bab34..0ce445ff 100644 --- a/src/workflow/classify.cpp +++ b/src/workflow/classify.cpp @@ -25,6 +25,7 @@ void setClassifyDefaults(LocalParameters & par){ par.maskProb = 0.9; par.matchPerKmer = 4; par.accessionLevel = 0; + par.minSSMatch = 3; } int classify(int argc, const char **argv, const Command& command) From ba21d6fa33003c3ac4cb5f534749f687ea89a1f6 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Thu, 12 Oct 2023 00:16:43 +0900 Subject: [PATCH 39/65] fix missed new line --- src/util/grade.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/util/grade.cpp b/src/util/grade.cpp index 
79dbaaa1..a5712d63 100644
--- a/src/util/grade.cpp
+++ b/src/util/grade.cpp
@@ -314,6 +314,7 @@ ncbiTaxonomy, par, cout, printColumnsIdx, cerr)
         for (const auto & value : idx2values[idx]) {
             fnFile << value << "\t";
         }
+        fnFile << endl;
     }
     fnFile.close();
 }
@@ -482,4 +483,4 @@ char compareTaxon_hivExclusion(TaxID shot, TaxID target, CountAtRank & count){
         count.FP++;
         return 'X';
     }
-}
\ No newline at end of file
+}

From e505ee2e699e2956b465d1e528adbe8556c7025a Mon Sep 17 00:00:00 2001
From: Jaebeom Kim
Date: Tue, 17 Oct 2023 15:44:05 +0900
Subject: [PATCH 40/65] There was a minor error during extending the last ORF.
 Now it is fixed, and performance improved a little

---
 src/commons/IndexCreator.cpp |   2 +-
 src/commons/SeqIterator.cpp  | 109 +++++++++++++++++++++--------------
 2 files changed, 66 insertions(+), 45 deletions(-)

diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp
index a08283c4..2e011d7e 100644
--- a/src/commons/IndexCreator.cpp
+++ b/src/commons/IndexCreator.cpp
@@ -961,7 +961,7 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer,
             }
         } else { // Reverse complement
             reverseCompliment = seqIterator.reverseCompliment(seq->seq.s, seq->seq.l);
-
+            // Get extended ORFs
             prodigal->getPredictedGenes(reverseCompliment);
             prodigal->removeCompletelyOverlappingGenes();

diff --git a/src/commons/SeqIterator.cpp b/src/commons/SeqIterator.cpp
index d9677a93..dc49b367 100644
--- a/src/commons/SeqIterator.cpp
+++ b/src/commons/SeqIterator.cpp
@@ -628,56 +628,77 @@ void SeqIterator::getExtendedORFs(struct _gene *genes, struct _node *nodes, vect
         }
     }

-    //For the last gene
-    if (find(intergenicKmerList.begin(), intergenicKmerList.end(), leftKmerHash) !=
-        intergenicKmerList.end()) { //extension to left
-        if (!isReverse) { //forward
+    // For the last gene
+    // Extend to the end of the genome
+    isReverse = !(nodes[genes[numOfGene - 1].start_ndx].strand == 1);
+    rightEnd = length - 1;
+    if (isReverse) {
+        frame = (genes[numOfGene - 1].end - 1) % 3;
+        while (rightEnd % 3 != frame) rightEnd--;
+    }
+    // If left region is not covered, cover it.
+ leftEnd = genes[numOfGene - 1].begin - 1; + if (hasBeenExtendedToLeft) { + leftEnd = genes[numOfGene - 2].end - 1 - 22; + if (!isReverse) { frame = (genes[numOfGene - 1].begin - 1) % 3; - leftEnd = genes[numOfGene - 2].end - 1 - 22; while (leftEnd % 3 != frame) leftEnd++; - blocks.emplace_back(leftEnd, length - 1, 1); - blockIdx++; - } else { // reverse - frame = (genes[numOfGene - 1].end - 1) % 3; - rightEnd = length - 1; - while (rightEnd % 3 != frame) rightEnd--; - blocks.emplace_back(genes[numOfGene - 2].end - 22 - 1, rightEnd, -1); - blockIdx++; - } - } else { //extension to right - if (hasBeenExtendedToLeft) { - if (!isReverse) { //forward - frame = (genes[numOfGene - 1].begin - 1) % 3; - leftEnd = genes[numOfGene - 2].end - 1 - 22; - while (leftEnd % 3 != frame) leftEnd++; - blocks.emplace_back(leftEnd, length - 1, 1); - blockIdx++; - } else { - frame = (genes[numOfGene - 1].end - 1) % 3; - rightEnd = length - 1; - while (rightEnd % 3 != frame) rightEnd--; - blocks.emplace_back(genes[numOfGene - 2].end - 22 - 1, rightEnd, -1); - blockIdx++; - } - } else { - if (!isReverse) { - blocks.emplace_back(genes[numOfGene - 1].begin, length - 1, 1); - blockIdx++; - } else { - frame = (genes[numOfGene - 1].end - 1) % 3; - rightEnd = length - 1; - while (rightEnd % 3 != frame) rightEnd--; - blocks.emplace_back(genes[numOfGene - 1].begin - 1, rightEnd, -1); - blockIdx++; - } } - - //If current intergenic sequences is new, update intergenicKmerList. - if (find(intergenicKmerList.begin(), intergenicKmerList.end(), rightKmerHash) == intergenicKmerList.end()) { + } + blocks.emplace_back(leftEnd, rightEnd, isReverse ? -1 : 1); + if (find(intergenicKmerList.begin(), intergenicKmerList.end(), rightKmerHash) == intergenicKmerList.end()) { intergenicKmerList.push_back(rightKmerHash); - } } + // if (find(intergenicKmerList.begin(), intergenicKmerList.end(), leftKmerHash) != + // intergenicKmerList.end()) { //extension to left + // if (!isReverse) { //forward + // frame = (genes[numOfGene - 1].begin - 1) % 3; + // leftEnd = genes[numOfGene - 2].end - 1 - 22; + // while (leftEnd % 3 != frame) leftEnd++; + // blocks.emplace_back(leftEnd, length - 1, 1); + // blockIdx++; + // } else { // reverse + // frame = (genes[numOfGene - 1].end - 1) % 3; + // rightEnd = length - 1; + // while (rightEnd % 3 != frame) rightEnd--; + // blocks.emplace_back(genes[numOfGene - 2].end - 22 - 1, rightEnd, -1); + // blockIdx++; + // } + // } else { //extension to right + // if (hasBeenExtendedToLeft) { + // if (!isReverse) { //forward + // frame = (genes[numOfGene - 1].begin - 1) % 3; + // leftEnd = genes[numOfGene - 2].end - 1 - 22; + // while (leftEnd % 3 != frame) leftEnd++; + // blocks.emplace_back(leftEnd, length - 1, 1); + // blockIdx++; + // } else { + // frame = (genes[numOfGene - 1].end - 1) % 3; + // rightEnd = length - 1; + // while (rightEnd % 3 != frame) rightEnd--; + // blocks.emplace_back(genes[numOfGene - 2].end - 22 - 1, rightEnd, -1); + // blockIdx++; + // } + // } else { + // if (!isReverse) { + // blocks.emplace_back(genes[numOfGene - 1].begin, length - 1, 1); + // blockIdx++; + // } else { + // frame = (genes[numOfGene - 1].end - 1) % 3; + // rightEnd = length - 1; + // while (rightEnd % 3 != frame) rightEnd--; + // blocks.emplace_back(genes[numOfGene - 1].begin - 1, rightEnd, -1); + // blockIdx++; + // } + // } + + // //If current intergenic sequences is new, update intergenicKmerList. 
+    // if (find(intergenicKmerList.begin(), intergenicKmerList.end(), rightKmerHash) == intergenicKmerList.end()) {
+    //     intergenicKmerList.push_back(rightKmerHash);
+    //     }
+    // }
+
     free(newIntergenicKmer);
     free(leftKmer);
     free(rightKmer);

From ffe587bad3df2ffa2f15ca359724cc35acfe306c Mon Sep 17 00:00:00 2001
From: Jaebeom Kim
Date: Wed, 18 Oct 2023 16:46:33 +0900
Subject: [PATCH 41/65] remove unused variable

---
 src/commons/Taxonomer.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp
index 0f97a3dc..40f9302b 100644
--- a/src/commons/Taxonomer.cpp
+++ b/src/commons/Taxonomer.cpp
@@ -87,8 +87,6 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery,
     // Get the best genus for current query
     vector genusMatches;
     genusMatches.reserve(end - offset + 1);
-
-    int res;
     TaxonScore genusScore(0, 0, 0, 0);
     if (par.seqMode == 2) {
         if (par.spaceMask != "11111111"){

From bc74ffd99e7e81e2b733160408eebe839bc47e09 Mon Sep 17 00:00:00 2001
From: Jaebeom Kim
Date: Fri, 20 Oct 2023 17:38:10 +0900
Subject: [PATCH 42/65] first commit

---
 src/commons/KmerMatcher.cpp | 123 ++++++++++++++++++++++--------------
 1 file changed, 76 insertions(+), 47 deletions(-)

diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp
index 646bd1cf..df603657 100644
--- a/src/commons/KmerMatcher.cpp
+++ b/src/commons/KmerMatcher.cpp
@@ -159,57 +159,87 @@ int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer,
     // Each split has start and end points of query list + proper offset point of target k-mer list
     std::vector querySplits;
     uint64_t queryAA;
-
-    if (threads == 1) { //Single thread
-        querySplits.emplace_back(0, queryKmerNum - 1, queryKmerNum, diffIdxSplits.data[0]);
-    } else if (threads == 2) { //Two threads
-        size_t splitWidth = queryKmerNum / 2;
-        querySplits.emplace_back(0, splitWidth - 1, splitWidth, diffIdxSplits.data[0]);
-        for (size_t tSplitCnt = 0; tSplitCnt < numOfDiffIdxSplits_use; tSplitCnt++) {
-            queryAA = AminoAcidPart(queryKmerList[splitWidth].ADkmer);
-            if (queryAA <= AminoAcidPart(diffIdxSplits.data[tSplitCnt].ADkmer)) {
-                tSplitCnt = tSplitCnt - (tSplitCnt != 0);
-                querySplits.emplace_back(splitWidth, queryKmerNum - 1, queryKmerNum - splitWidth,
-                                         diffIdxSplits.data[tSplitCnt]);
+    size_t quotient = queryKmerNum / threads;
+    size_t remainder = queryKmerNum % threads;
+    size_t startIdx = 0;
+    size_t endIdx = 0; // endIdx is inclusive
+    for (size_t i = 0; i < threads; i++) {
+        endIdx = startIdx + quotient - 1;
+        if (remainder > 0) {
+            endIdx++;
+            remainder--;
+        }
+        bool needLastTargetBlock = true;
+        queryAA = AminoAcidPart(queryKmerList[startIdx].ADkmer);
+        for (size_t j = 0; j < numOfDiffIdxSplits_use; j ++) {
+            if (queryAA <= AminoAcidPart(diffIdxSplits.data[j].ADkmer)) {
+                j = j - (j != 0);
+                querySplits.emplace_back(startIdx, endIdx, endIdx - startIdx + 1, diffIdxSplits.data[j]);
+                needLastTargetBlock = false;
                 break;
             }
         }
-    } else { //More than two threads
-        // Devide query k-mers into blocks
-        size_t splitWidth = queryKmerNum / (threads - 1);
-        querySplits.emplace_back(0, splitWidth - 1, splitWidth, diffIdxSplits.data[0]);
-        size_t i = 1;
-        for (; (i < threads) && (splitWidth * i < queryKmerNum); i++) {
-            queryAA = AminoAcidPart(queryKmerList[splitWidth * i].ADkmer);
-            bool needLastTargetBlock = true;
-            for (size_t j = 0; j < numOfDiffIdxSplits_use; j++) {
-                if (queryAA <= AminoAcidPart(diffIdxSplits.data[j].ADkmer)) {
-                    j = j - (j != 0);
-                    if (i != threads - 1) {
-                        querySplits.emplace_back(splitWidth * i, splitWidth * (i + 1) - 1,
splitWidth, - diffIdxSplits.data[j]); - } else { - querySplits.emplace_back(splitWidth * i, queryKmerNum - 1, queryKmerNum - splitWidth * i, - diffIdxSplits.data[j]); - } - needLastTargetBlock = false; - break; - } - } - if (needLastTargetBlock) { - if (i != threads - 1) { // If it is not the last split - querySplits.emplace_back(splitWidth * i, splitWidth * (i + 1) - 1, splitWidth, - diffIdxSplits.data[numOfDiffIdxSplits_use - 2]); - } else { - querySplits.emplace_back(splitWidth * i, queryKmerNum - 1, queryKmerNum - splitWidth * i, - diffIdxSplits.data[numOfDiffIdxSplits_use - 2]); - } - } + if (needLastTargetBlock) { + querySplits.emplace_back(startIdx, endIdx, endIdx - startIdx + 1, diffIdxSplits.data[numOfDiffIdxSplits_use - 2]); } + startIdx = endIdx + 1; + } + + // if (threads == 1) { //Single thread + // querySplits.emplace_back(0, queryKmerNum - 1, queryKmerNum, diffIdxSplits.data[0]); + // } else if (threads == 2) { //Two threads + // size_t splitWidth = queryKmerNum / 2; + // querySplits.emplace_back(0, splitWidth - 1, splitWidth, diffIdxSplits.data[0]); + // for (size_t tSplitCnt = 0; tSplitCnt < numOfDiffIdxSplits_use; tSplitCnt++) { + // queryAA = AminoAcidPart(queryKmerList[splitWidth].ADkmer); + // if (queryAA <= AminoAcidPart(diffIdxSplits.data[tSplitCnt].ADkmer)) { + // tSplitCnt = tSplitCnt - (tSplitCnt != 0); + // querySplits.emplace_back(splitWidth, queryKmerNum - 1, queryKmerNum - splitWidth, + // diffIdxSplits.data[tSplitCnt]); + // break; + // } + // } + // } else { //More than two threads + // // Devide query k-mers into blocks + // size_t splitWidth = queryKmerNum / (threads - 1); + // querySplits.emplace_back(0, splitWidth - 1, splitWidth, diffIdxSplits.data[0]); + // size_t i = 1; + // for (; (i < threads) && (splitWidth * i < queryKmerNum); i++) { + // queryAA = AminoAcidPart(queryKmerList[splitWidth * i].ADkmer); + // bool needLastTargetBlock = true; + // for (size_t j = 0; j < numOfDiffIdxSplits_use; j++) { + // if (queryAA <= AminoAcidPart(diffIdxSplits.data[j].ADkmer)) { + // j = j - (j != 0); + // if (i != threads - 1) { + // querySplits.emplace_back(splitWidth * i, splitWidth * (i + 1) - 1, splitWidth, + // diffIdxSplits.data[j]); + // } else { + // querySplits.emplace_back(splitWidth * i, queryKmerNum - 1, queryKmerNum - splitWidth * i, + // diffIdxSplits.data[j]); + // } + // needLastTargetBlock = false; + // break; + // } + // } + // if (needLastTargetBlock) { + // if (i != threads - 1) { // If it is not the last split + // querySplits.emplace_back(splitWidth * i, splitWidth * (i + 1) - 1, splitWidth, + // diffIdxSplits.data[numOfDiffIdxSplits_use - 2]); + // } else { + // querySplits.emplace_back(splitWidth * i, queryKmerNum - 1, queryKmerNum - splitWidth * i, + // diffIdxSplits.data[numOfDiffIdxSplits_use - 2]); + // } + // } + // } + + // if (i != threads) { + // threads = querySplits.size(); + // } + // } - if (i != threads) { - threads = querySplits.size(); - } + // Print query splits + for (size_t i = 0; i < querySplits.size(); i++) { + cout << i << "\t" << querySplits[i].start << "\t" << querySplits[i].end << endl; } bool *splitCheckList = (bool *) malloc(sizeof(bool) * threads); @@ -411,7 +441,6 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI // cout << (int) getHammingDistanceSum(currentQuery, currentTargetKmer) << "\t"; // print_binary16(16, getHammings(currentQuery, currentTargetKmer)); cout << endl; // } - if (unlikely(BufferSize < diffIdxBufferIdx + 7)){ loadBuffer(diffIdxFp, diffIdxBuffer, 
diffIdxBufferIdx, BufferSize, ((int)(BufferSize - diffIdxBufferIdx)) * -1 ); From 9bfeab18b128e37679b412d24e39947975060b43 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Fri, 20 Oct 2023 17:56:02 +0900 Subject: [PATCH 43/65] print diffIdxSplits --- src/commons/IndexCreator.h | 8 ++++++++ src/commons/KmerMatcher.cpp | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/src/commons/IndexCreator.h b/src/commons/IndexCreator.h index 33bc1c66..48968fc5 100644 --- a/src/commons/IndexCreator.h +++ b/src/commons/IndexCreator.h @@ -166,6 +166,14 @@ class IndexCreator{ public: static void splitSequenceFile(vector & seqSegments, MmapedData seqFile); + static void printIndexSplitList(DiffIdxSplit * splitList) { + for (int i = 0; i < 4096; i++) { + cout << splitList[i].infoIdxOffset << " " << + splitList[i].diffIdxOffset << " " << + splitList[i].ADkmer << endl; + } + } + string getSeqSegmentsWithHead(vector & seqSegments, const string & seqFileName, const unordered_map & acc2taxid, diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp index df603657..778e79a5 100644 --- a/src/commons/KmerMatcher.cpp +++ b/src/commons/KmerMatcher.cpp @@ -1,4 +1,5 @@ #include "KmerMatcher.h" +#include "IndexCreator.h" #include "Kmer.h" #include "Mmap.h" #include @@ -145,6 +146,8 @@ int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer, } } + IndexCreator::printIndexSplitList(diffIdxSplits.data); + // Filter out meaningless target splits size_t numOfDiffIdxSplits = diffIdxSplits.fileSize / sizeof(DiffIdxSplit); size_t numOfDiffIdxSplits_use = numOfDiffIdxSplits; @@ -155,6 +158,11 @@ int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer, } } + + cout << numOfDiffIdxSplits_use << endl; + IndexCreator::printIndexSplitList(diffIdxSplits.data); + + // Divide query k-mer list into blocks for multi threading. 
     // Each split has start and end points of query list + proper offset point of target k-mer list
     std::vector querySplits;

From 802d070bb490f773d99a62b44c33028b39f3335d Mon Sep 17 00:00:00 2001
From: Jaebeom Kim
Date: Sat, 21 Oct 2023 09:14:56 +0900
Subject: [PATCH 44/65] fix error in IndexCreator::writeTargetFilesAndSplits()

---
 src/commons/IndexCreator.cpp | 32 ++++++++++++++++++++++++++------
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp
index 2e011d7e..cae63d6f 100644
--- a/src/commons/IndexCreator.cpp
+++ b/src/commons/IndexCreator.cpp
@@ -407,17 +407,34 @@ void IndexCreator::writeTargetFilesAndSplits(TargetKmer * kmerBuffer, size_t & k
     DiffIdxSplit splitList[par.splitNum];
     memset(splitList, 0, sizeof(DiffIdxSplit) * par.splitNum);
     size_t splitWidth = uniqKmerCnt / par.splitNum;
+    size_t remainder = uniqKmerCnt % par.splitNum;
     size_t splitCnt = 1;
+    size_t start = 0;
     for (size_t i = 1; i < (size_t) par.splitNum; i++) {
-        for (size_t j = uniqKmerIdx[0] + splitWidth * i; j + 1 < uniqKmerCnt; j++) {
-            if (AminoAcidPart(kmerBuffer[j].ADkmer) != AminoAcidPart(kmerBuffer[j + 1].ADkmer)) {
-                if (kmerBuffer[j].ADkmer != splitList[splitCnt - 1].ADkmer){
-                    splitList[splitCnt].ADkmer = kmerBuffer[j].ADkmer;
-                    splitCnt ++;
-                }
+        start = start + splitWidth;
+        if (remainder > 0) {
+            start++;
+            remainder--;
+        }
+        for (size_t j = start; j + 1 < start + splitWidth; j++) {
+            if (AminoAcidPart(kmerBuffer[uniqKmerIdx[j]].ADkmer)
+                != AminoAcidPart(kmerBuffer[uniqKmerIdx[j + 1]].ADkmer)) {
+                splitList[splitCnt].ADkmer = kmerBuffer[uniqKmerIdx[j + 1]].ADkmer;
+                cout << splitList[splitCnt].ADkmer << endl;
+                splitCnt++;
                 break;
             }
        }
+        // for (size_t j = uniqKmerIdx[0] + splitWidth * i; j + 1 < uniqKmerCnt; j++) { // here is a bug
+        //     if (AminoAcidPart(kmerBuffer[j].ADkmer) != AminoAcidPart(kmerBuffer[j + 1].ADkmer)) {
+        //         if (kmerBuffer[j].ADkmer != splitList[splitCnt - 1].ADkmer){
+        //             splitList[splitCnt].ADkmer = kmerBuffer[j].ADkmer;
+        //             cout << splitList[splitCnt].ADkmer << endl;
+        //             splitCnt ++;
+        //         }
+        //         break;
+        //     }
+        // }
     }

     FILE * diffIdxFile = fopen(diffIdxFileName.c_str(), "wb");
@@ -446,6 +463,8 @@ void IndexCreator::writeTargetFilesAndSplits(TargetKmer * kmerBuffer, size_t & k
             if((splitIdx < splitCnt) && (lastKmer == splitList[splitIdx].ADkmer)){
                 splitList[splitIdx].diffIdxOffset = totalDiffIdx;
                 splitList[splitIdx].infoIdxOffset = write;
+                cout << "Split " << splitIdx << " at " << splitList[splitIdx].infoIdxOffset << " " <<
+                splitList[splitIdx].diffIdxOffset << " " << splitList[splitIdx].ADkmer << endl;
                 splitIdx ++;
             }
         }
@@ -454,6 +473,7 @@ void IndexCreator::writeTargetFilesAndSplits(TargetKmer * kmerBuffer, size_t & k
     cout<<"written k-mer count: "<< write << endl;
     flushKmerBuf(diffIdxBuffer, diffIdxFile, localBufIdx);
+    printIndexSplitList(splitList);
     fwrite(splitList, sizeof(DiffIdxSplit), par.splitNum, diffIdxSplitFile);
     free(diffIdxBuffer);
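[Editor's note] Patches 42 and 44 above both replace fixed-width splitting with the same even-partition
idiom: divide n items into k contiguous blocks whose sizes differ by at most one, handing one extra item
to each of the first n % k blocks. Below is a minimal, self-contained sketch of that idiom; the function
name and types are illustrative only, not Metabuli code. Both patches additionally snap block boundaries
to k-mer group boundaries (patch 42 via diffIdxSplits, patch 44 via the amino-acid part of adjacent
k-mers); the sketch omits that step.

    #include <cstddef>
    #include <iostream>
    #include <utility>
    #include <vector>

    // Partition n items into at most k contiguous [start, end] blocks (end inclusive).
    // The first n % k blocks get one extra item, so block sizes differ by at most one.
    std::vector<std::pair<size_t, size_t>> evenPartition(size_t n, size_t k) {
        std::vector<std::pair<size_t, size_t>> blocks;
        if (k == 0) return blocks;
        size_t quotient = n / k;
        size_t remainder = n % k;
        size_t start = 0;
        for (size_t i = 0; i < k && start < n; i++) {
            size_t size = quotient + (remainder > 0 ? 1 : 0);
            if (remainder > 0) remainder--;
            size_t end = start + size - 1; // inclusive, like endIdx in patch 42
            blocks.emplace_back(start, end);
            start = end + 1;
        }
        return blocks;
    }

    int main() {
        for (const auto & b : evenPartition(10, 4)) { // prints 0..2, 3..5, 6..7, 8..9
            std::cout << b.first << " .. " << b.second << '\n';
        }
        return 0;
    }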
From 1119a09e8d7770429ed60c217a4e227acc9dc9bb Mon Sep 17 00:00:00 2001
From: Jaebeom Kim
Date: Sat, 21 Oct 2023 09:19:11 +0900
Subject: [PATCH 45/65] remove prints

---
 src/commons/FileMerger.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/commons/FileMerger.cpp b/src/commons/FileMerger.cpp
index 291c1316..46924337 100644
--- a/src/commons/FileMerger.cpp
+++ b/src/commons/FileMerger.cpp
@@ -257,6 +257,7 @@ void FileMerger::mergeTargetFiles(const LocalParameters & par, int numOfSplits)
     int offsetListIdx = 1;
     for(size_t os = 0; os < splitNum; os++){
         offsetList[os] = os * sizeOfSplit;
+        // cout << os * sizeOfSplit << endl;
     }
     offsetList[splitNum] = UINT64_MAX;
     DiffIdxSplit splitList[splitNum];

From 3f3bf9d410770c365c44170cd5b3bbafd92cb0f4 Mon Sep 17 00:00:00 2001
From: Jaebeom Kim
Date: Mon, 23 Oct 2023 17:22:43 +0900
Subject: [PATCH 46/65] remove prints and old code

---
 src/commons/IndexCreator.cpp | 14 ++------
 src/commons/KmerMatcher.cpp  | 64 ++----------------------------------
 2 files changed, 4 insertions(+), 74 deletions(-)

diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp
index cae63d6f..fe864baf 100644
--- a/src/commons/IndexCreator.cpp
+++ b/src/commons/IndexCreator.cpp
@@ -425,16 +425,6 @@ void IndexCreator::writeTargetFilesAndSplits(TargetKmer * kmerBuffer, size_t & k
                 break;
             }
         }
-        // for (size_t j = uniqKmerIdx[0] + splitWidth * i; j + 1 < uniqKmerCnt; j++) { // here is a bug
-        //     if (AminoAcidPart(kmerBuffer[j].ADkmer) != AminoAcidPart(kmerBuffer[j + 1].ADkmer)) {
-        //         if (kmerBuffer[j].ADkmer != splitList[splitCnt - 1].ADkmer){
-        //             splitList[splitCnt].ADkmer = kmerBuffer[j].ADkmer;
-        //             cout << splitList[splitCnt].ADkmer << endl;
-        //             splitCnt ++;
-        //         }
-        //         break;
-        //     }
-        // }
     }

     FILE * diffIdxFile = fopen(diffIdxFileName.c_str(), "wb");
@@ -463,8 +453,8 @@ void IndexCreator::writeTargetFilesAndSplits(TargetKmer * kmerBuffer, size_t & k
             if((splitIdx < splitCnt) && (lastKmer == splitList[splitIdx].ADkmer)){
                 splitList[splitIdx].diffIdxOffset = totalDiffIdx;
                 splitList[splitIdx].infoIdxOffset = write;
-                cout << "Split " << splitIdx << " at " << splitList[splitIdx].infoIdxOffset << " " <<
-                splitList[splitIdx].diffIdxOffset << " " << splitList[splitIdx].ADkmer << endl;
+                // cout << "Split " << splitIdx << " at " << splitList[splitIdx].infoIdxOffset << " " <<
+                // splitList[splitIdx].diffIdxOffset << " " << splitList[splitIdx].ADkmer << endl;
                 splitIdx ++;
             }
         }

diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp
index 778e79a5..9f8c975c 100644
--- a/src/commons/KmerMatcher.cpp
+++ b/src/commons/KmerMatcher.cpp
@@ -145,8 +145,6 @@ int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer,
             break;
         }
     }
-
-    IndexCreator::printIndexSplitList(diffIdxSplits.data);
     // Filter out meaningless target splits
     size_t numOfDiffIdxSplits = diffIdxSplits.fileSize / sizeof(DiffIdxSplit);
     size_t numOfDiffIdxSplits_use = numOfDiffIdxSplits;
@@ -158,11 +156,6 @@ int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer,
         }
     }
-
-    cout << numOfDiffIdxSplits_use << endl;
-    IndexCreator::printIndexSplitList(diffIdxSplits.data);
-
-
     // Divide query k-mer list into blocks for multi threading.
// Each split has start and end points of query list + proper offset point of target k-mer list std::vector querySplits; @@ -192,62 +185,9 @@ int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer, } startIdx = endIdx + 1; } - - // if (threads == 1) { //Single thread - // querySplits.emplace_back(0, queryKmerNum - 1, queryKmerNum, diffIdxSplits.data[0]); - // } else if (threads == 2) { //Two threads - // size_t splitWidth = queryKmerNum / 2; - // querySplits.emplace_back(0, splitWidth - 1, splitWidth, diffIdxSplits.data[0]); - // for (size_t tSplitCnt = 0; tSplitCnt < numOfDiffIdxSplits_use; tSplitCnt++) { - // queryAA = AminoAcidPart(queryKmerList[splitWidth].ADkmer); - // if (queryAA <= AminoAcidPart(diffIdxSplits.data[tSplitCnt].ADkmer)) { - // tSplitCnt = tSplitCnt - (tSplitCnt != 0); - // querySplits.emplace_back(splitWidth, queryKmerNum - 1, queryKmerNum - splitWidth, - // diffIdxSplits.data[tSplitCnt]); - // break; - // } - // } - // } else { //More than two threads - // // Devide query k-mers into blocks - // size_t splitWidth = queryKmerNum / (threads - 1); - // querySplits.emplace_back(0, splitWidth - 1, splitWidth, diffIdxSplits.data[0]); - // size_t i = 1; - // for (; (i < threads) && (splitWidth * i < queryKmerNum); i++) { - // queryAA = AminoAcidPart(queryKmerList[splitWidth * i].ADkmer); - // bool needLastTargetBlock = true; - // for (size_t j = 0; j < numOfDiffIdxSplits_use; j++) { - // if (queryAA <= AminoAcidPart(diffIdxSplits.data[j].ADkmer)) { - // j = j - (j != 0); - // if (i != threads - 1) { - // querySplits.emplace_back(splitWidth * i, splitWidth * (i + 1) - 1, splitWidth, - // diffIdxSplits.data[j]); - // } else { - // querySplits.emplace_back(splitWidth * i, queryKmerNum - 1, queryKmerNum - splitWidth * i, - // diffIdxSplits.data[j]); - // } - // needLastTargetBlock = false; - // break; - // } - // } - // if (needLastTargetBlock) { - // if (i != threads - 1) { // If it is not the last split - // querySplits.emplace_back(splitWidth * i, splitWidth * (i + 1) - 1, splitWidth, - // diffIdxSplits.data[numOfDiffIdxSplits_use - 2]); - // } else { - // querySplits.emplace_back(splitWidth * i, queryKmerNum - 1, queryKmerNum - splitWidth * i, - // diffIdxSplits.data[numOfDiffIdxSplits_use - 2]); - // } - // } - // } - - // if (i != threads) { - // threads = querySplits.size(); - // } - // } - // Print query splits - for (size_t i = 0; i < querySplits.size(); i++) { - cout << i << "\t" << querySplits[i].start << "\t" << querySplits[i].end << endl; + if (querySplits.size() != threads) { + threads = querySplits.size(); } bool *splitCheckList = (bool *) malloc(sizeof(bool) * threads); From b75d2ae74652c43a651d09901e33262c6a543a64 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Fri, 27 Oct 2023 15:27:58 +0900 Subject: [PATCH 47/65] Fix error in KmerMatcher.cpp: changed condition to start with getNextTargetKmer --- src/commons/KmerMatcher.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp index 9f8c975c..f99ccbd3 100644 --- a/src/commons/KmerMatcher.cpp +++ b/src/commons/KmerMatcher.cpp @@ -251,11 +251,13 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI loadBuffer(kmerInfoFp, kmerInfoBuffer, kmerInfoBufferIdx, BufferSize); fseek(diffIdxFp, 2 * (long) (diffIdxBufferIdx), SEEK_SET); loadBuffer(diffIdxFp, diffIdxBuffer, diffIdxBufferIdx, BufferSize); - - if (i == 0) { + + if (querySplits[i].diffIdxSplit.ADkmer == 0 && querySplits[i].diffIdxSplit.diffIdxOffset 
== 0 + && querySplits[i].diffIdxSplit.infoIdxOffset == 0) { currentTargetKmer = getNextTargetKmer(currentTargetKmer, diffIdxBuffer, diffIdxBufferIdx, diffIdxPos); } + currentQuery = UINT64_MAX; currentQueryAA = UINT64_MAX; From a7db9f9af6deb61e1b2838e073a71b55e32e20c3 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Mon, 30 Oct 2023 16:39:14 +0900 Subject: [PATCH 48/65] fixed the problem related to reads < 26 bp. Thank you Niko! --- src/commons/KmerExtractor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/commons/KmerExtractor.cpp b/src/commons/KmerExtractor.cpp index 4d656cdf..232cc89a 100644 --- a/src/commons/KmerExtractor.cpp +++ b/src/commons/KmerExtractor.cpp @@ -85,7 +85,7 @@ void KmerExtractor::fillQueryKmerBufferParallel(KSeqWrapper *kseq1, for (size_t i = 0; i < currentQueryNum; i ++) { size_t queryIdx = processedQueryNum - currentQueryNum + i; // Get k-mer count - auto kmerCnt = LocalUtil::getQueryKmerNumber(reads1[i].length(), spaceNum); + int kmerCnt = LocalUtil::getQueryKmerNumber(reads1[i].length(), spaceNum); // Ignore short read if (kmerCnt < 1) { continue; } From 49dc8090c333a9ebae7db10dd26ac90b43ae5719 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Mon, 30 Oct 2023 19:17:24 +0900 Subject: [PATCH 49/65] Choose the best species directly skipping genus selection --- src/commons/Taxonomer.cpp | 74 ++++++++++++++++++++++++++++++++++++++- src/commons/Taxonomer.h | 10 ++++++ 2 files changed, 83 insertions(+), 1 deletion(-) diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index 40f9302b..c60e66d0 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -1,5 +1,7 @@ #include "Taxonomer.h" +#include "Match.h" #include "NcbiTaxonomy.h" +#include #include @@ -327,6 +329,76 @@ TaxID Taxonomer::BFS(const unordered_map & cladeCnt, TaxID r } } +TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, + const Match *matchList, + size_t end, + size_t offset, + int queryLength, + const LocalParameters &par) { + TaxID currentSpecies; + vector filteredMatches; + vector> matchesForEachSpecies; + vector speciesScores; + TaxonScore bestScore; + size_t i = offset; + uint8_t curFrame; + vector curFrameMatches; + + while (i < end + 1) { + currentSpecies = matchList[i].speciesId; + // For current species + while ((i < end + 1) && currentSpecies == matchList[i].speciesId) { + curFrame = matchList[i].qInfo.frame; + curFrameMatches.clear(); + // For current frame + while ((i < end + 1) && currentSpecies == matchList[i].speciesId && curFrame == matchList[i].qInfo.frame) { + curFrameMatches.push_back(&matchList[i]); + i ++; + } + if (curFrameMatches.size() > 1) { + remainConsecutiveMatches(curFrameMatches, filteredMatches, currentSpecies, par); + } + } + // Construct a match combination using filtered matches of current species + // so that it can best cover the query, and score the combination + if (!filteredMatches.empty()) { + matchesForEachSpecies.push_back(filteredMatches); + speciesScores.push_back(scoreGenus(filteredMatches, queryLength)); + } + filteredMatches.clear(); + } + + // If there are no meaningful species + if (speciesScores.empty()) { + bestScore.score = 0; + return bestScore; + } + + TaxonScore maxScore = *max_element(speciesScores.begin(), speciesScores.end(), + [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); + + vector maxIdx; + for (size_t g = 0; g < speciesScores.size(); g++) { + if (speciesScores[g].score == maxScore.score) { + maxIdx.push_back(g); + } + } + bestScore = 
maxScore; + + for (unsigned long g : maxIdx) { + for (const Match * m : matchesForEachSpecies[g]) { + speciesMatches.push_back(*m); + } + } + + // More than one species + if (maxIdx.size() > 1) { + bestScore.taxId = 0; + } + + return bestScore; +} + TaxonScore Taxonomer::getBestGenusMatches(vector &genusMatches, const Match *matchList, size_t end, size_t offset, int readLength1, int readLength2, const LocalParameters & par) { TaxID currentGenus; @@ -1093,7 +1165,7 @@ TaxonScore Taxonomer::scoreSpecies(const vector &matches, int queryLength, int queryLength2) { - // Get the smallest hamming distance at each position of query + // Get the largest hamming distance at each position of query int aminoAcidNum_total = queryLength / 3 + queryLength2 / 3; int aminoAcidNum_read1 = queryLength / 3; auto *hammingsAtEachPos = new signed char[aminoAcidNum_total + 3]; diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h index 45687923..8a1e0999 100644 --- a/src/commons/Taxonomer.h +++ b/src/commons/Taxonomer.h @@ -62,6 +62,13 @@ class Taxonomer { const Match *matchList, vector & queryList, const LocalParameters &par); + + void chooseBestTaxon2(uint32_t currentQuery, + size_t offset, + size_t end, + const Match *matchList, + vector & queryList, + const LocalParameters &par); void remainConsecutiveMatches(vector & curFrameMatches, vector & filteredMatches, @@ -80,6 +87,9 @@ class Taxonomer { TaxonScore getBestGenusMatches(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, int readLength1, int readLength2, const LocalParameters &par); + TaxonScore getBestSpeciesMatches(vector &matchesForMajorityLCA, const Match *matchList, size_t end, + size_t offset, int queryLength, const LocalParameters &par); + TaxonScore getBestGenusMatches_spaced(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, int readLength1, int readLength2); TaxonScore getBestGenusMatches_spaced(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, From d0f53447c3161f0eb0a8ab3b71efec4cd4597bf9 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Tue, 31 Oct 2023 11:17:56 +0900 Subject: [PATCH 50/65] First running version --- src/commons/Taxonomer.cpp | 726 ++++++++++++++++++++++++-------------- src/commons/Taxonomer.h | 31 +- 2 files changed, 486 insertions(+), 271 deletions(-) diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index c60e66d0..b52f4447 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -21,6 +21,9 @@ Taxonomer::Taxonomer(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxon minCoveredPos = par.minCoveredPos; accessionLevel = par.accessionLevel; minSSMatch = par.minSSMatch; + minConsCnt = par.minConsCnt; + minConsCntEuk = par.minConsCntEuk; + eukaryotaTaxId = par.eukaryotaTaxId; } Taxonomer::~Taxonomer() { @@ -54,7 +57,7 @@ void Taxonomer::assignTaxonomy(const Match *matchList, { #pragma omp for schedule(dynamic, 1) for (size_t i = 0; i < blockIdx; ++i) { - chooseBestTaxon(matchBlocks[i].id, + chooseBestTaxon2(matchBlocks[i].id, matchBlocks[i].start, matchBlocks[i].end, matchList, @@ -71,6 +74,127 @@ void Taxonomer::assignTaxonomy(const Match *matchList, } +void Taxonomer::chooseBestTaxon2(uint32_t currentQuery, + size_t offset, + size_t end, + const Match *matchList, + vector & queryList, + const LocalParameters &par) { + TaxID selectedTaxon; + +// if (true) { +// cout << "# " << currentQuery << " " << queryList[currentQuery].name << endl; +// for (size_t i = offset; i < end + 1; i++) { +// 
cout << matchList[i].targetId << " " << matchList[i].qInfo.frame << " " << matchList[i].qInfo.pos << " " << int(matchList[i].hamming) << " " << int(matchList[i].redundancy) << endl; +// } +// } + + // Get the best species for current query + vector speciesMatches; + speciesMatches.reserve(end - offset + 1); + TaxonScore speciesScore(0, 0, 0, 0); + if (par.seqMode == 2) { + speciesScore = getBestSpeciesMatches(speciesMatches, matchList, end, offset, + queryList[currentQuery].queryLength, + queryList[currentQuery].queryLength2); + } else { + speciesScore = getBestSpeciesMatches(speciesMatches, matchList, end, offset, + queryList[currentQuery].queryLength); + } + +// if (true) { +// cout << "# " << currentQuery << " " << queryList[currentQuery].name << " filtered\n"; +// for (size_t i = 0; i < genusMatches.size(); i++) { +// cout << genusMatches[i].targetId << " " << genusMatches[i].qInfo.frame << " " << genusMatches[i].qInfo.pos << " " << int(genusMatches[i].hamming) << " " << int(genusMatches[i].redundancy) << endl; +// } +// cout << "Genus score: " << genusScore.score << "\n"; +// } + + // If there is no proper species for current query, it is un-classified. + if (speciesScore.score == 0 || speciesScore.coverage < par.minCoverage || speciesScore.score < par.minScore) { + queryList[currentQuery].isClassified = false; + queryList[currentQuery].classification = 0; + queryList[currentQuery].score = speciesScore.score; + queryList[currentQuery].coverage = speciesScore.coverage; + queryList[currentQuery].hammingDist = speciesScore.hammingDist; + queryList[currentQuery].newSpecies = false; + return; + } + + // If there are two or more good genus level candidates, find the LCA. + if (speciesScore.taxId == 0) { + vector genusList; + genusList.reserve(speciesMatches.size()); + for (auto & genusMatch : speciesMatches) { + genusList.push_back(genusMatch.genusId); + } + selectedTaxon = taxonomy->LCA(genusList)->taxId; + queryList[currentQuery].isClassified = true; + queryList[currentQuery].classification = selectedTaxon; + queryList[currentQuery].score = speciesScore.score; + queryList[currentQuery].coverage = speciesScore.coverage; + queryList[currentQuery].hammingDist = speciesScore.hammingDist; + for (auto & spMatch : speciesMatches) { + queryList[currentQuery].taxCnt[spMatch.targetId]++; + } + return; + } + + // If score is not enough, classify to the parent of the selected species + if (speciesScore.score < par.minSpScore) { + queryList[currentQuery].isClassified = true; + queryList[currentQuery].classification = taxonomy->taxonNode( + taxonomy->getTaxIdAtRank(speciesScore.taxId, "species"))->parentTaxId; + queryList[currentQuery].score = speciesScore.score; + queryList[currentQuery].coverage = speciesScore.coverage; + queryList[currentQuery].hammingDist = speciesScore.hammingDist; + for (auto & spMatch : speciesMatches) { + queryList[currentQuery].taxCnt[spMatch.targetId]++; + } + return; + } + + // Sort matches by the position of the query sequence +// sort(genusMatches.begin() + speciesMatchRange[selectedSpecies].first, +// genusMatches.begin() + speciesMatchRange[selectedSpecies].second, +// [](const Match & a, const Match & b) { +// if (a.qInfo.position / 3 == b.qInfo.position / 3) +// return a.hamming < b.hamming; +// else +// return a.qInfo.position / 3 < b.qInfo.position / 3; +// }); + + sort(speciesMatches.begin(), speciesMatches.end(), + [](const Match & a, const Match & b) { return a.qInfo.pos < b.qInfo.pos; }); + + + TaxID result = lowerRankClassification(speciesMatches, 
speciesScore.taxId); + + // Record matches of selected species + for (auto & spMatch : speciesMatches) { + queryList[currentQuery].taxCnt[spMatch.targetId]++; + } + + // Store classification results + queryList[currentQuery].isClassified = true; + queryList[currentQuery].classification = result; + queryList[currentQuery].score = speciesScore.score; + queryList[currentQuery].coverage = speciesScore.coverage; + queryList[currentQuery].hammingDist = speciesScore.hammingDist; + queryList[currentQuery].newSpecies = false; +// if (par.printLog) { +// cout << "# " << currentQuery << endl; +// for (size_t i = 0; i < genusMatches.size(); i++) { +// cout << i << " " << genusMatches[i].qInfo.pos << " " << +// genusMatches[i].targetId << " " << int(genusMatches[i].hamming) << endl; +// } +// cout << "Score: " << speciesScore.score << " " << selectedSpecies << " " +// << taxonomy->getString(taxonomy->taxonNode(selectedSpecies)->rankIdx) +// +// << endl; +// } +} + void Taxonomer::chooseBestTaxon(uint32_t currentQuery, size_t offset, size_t end, @@ -91,23 +215,12 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, genusMatches.reserve(end - offset + 1); TaxonScore genusScore(0, 0, 0, 0); if (par.seqMode == 2) { - if (par.spaceMask != "11111111"){ - genusScore = getBestGenusMatches_spaced(genusMatches, matchList, end, offset, - queryList[currentQuery].queryLength, - queryList[currentQuery].queryLength2); - } else { - genusScore = getBestGenusMatches(genusMatches, matchList, end, offset, + genusScore = getBestGenusMatches(genusMatches, matchList, end, offset, queryList[currentQuery].queryLength, - queryList[currentQuery].queryLength2, par); - } + queryList[currentQuery].queryLength2); } else { - if (par.spaceMask != "11111111") { - genusScore = getBestGenusMatches_spaced(genusMatches, matchList, end, offset, - queryList[currentQuery].queryLength); - } else { - genusScore = getBestGenusMatches(genusMatches, matchList, end, offset, - queryList[currentQuery].queryLength, par); - } + genusScore = getBestGenusMatches(genusMatches, matchList, end, offset, + queryList[currentQuery].queryLength); } // if (true) { @@ -207,12 +320,15 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, // return a.qInfo.position / 3 < b.qInfo.position / 3; // }); - sort(genusMatches.begin() + speciesMatchRange[selectedSpecies].first, - genusMatches.begin() + speciesMatchRange[selectedSpecies].second, - [](const Match & a, const Match & b) { return a.qInfo.pos > b.qInfo.pos; }); + vector::const_iterator first = genusMatches.begin() + speciesMatchRange[selectedSpecies].first; + vector::const_iterator last = genusMatches.begin() + speciesMatchRange[selectedSpecies].second; + vector speciesMatches(first, last); - TaxID result = lowerRankClassification(genusMatches, speciesMatchRange[selectedSpecies], selectedSpecies); + sort(speciesMatches.begin(), speciesMatches.end(), + [](const Match & a, const Match & b) { return a.qInfo.pos < b.qInfo.pos; }); + + TaxID result = lowerRankClassification(speciesMatches, selectedSpecies); // Record matches of selected species for (size_t i = speciesMatchRange[selectedSpecies].first; i < speciesMatchRange[selectedSpecies].second; i++) { @@ -239,17 +355,17 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, // } } -TaxID Taxonomer::lowerRankClassification(vector &matches, pair &matchRange, TaxID spTaxId) { - int i = matchRange.second - 1; +TaxID Taxonomer::lowerRankClassification(vector &matches, TaxID spTaxId) { + unordered_map taxCnt; + size_t matchNum = matches.size(); - while ( i >= 
matchRange.first ) { + for (size_t i = 0; i < matchNum; i++) { size_t currQuotient = matches[i].qInfo.pos / 3; uint8_t minHamming = matches[i].hamming; Match * minHammingMatch = & matches[i]; TaxID minHammingTaxId = minHammingMatch->targetId; - i --; - while ( (i >= matchRange.first) && (currQuotient == matches[i].qInfo.pos / 3) ) { + while ((i < matchNum) && (currQuotient == matches[i].qInfo.pos / 3)) { if (matches[i].hamming < minHamming) { minHamming = matches[i].hamming; minHammingMatch = & matches[i]; @@ -259,11 +375,33 @@ TaxID Taxonomer::lowerRankClassification(vector &matches, pair minHammingMatch->redundancy = true; matches[i].redundancy = true; } - i--; + i++; } - taxCnt[minHammingTaxId]++; + taxCnt[minHammingTaxId]++; } + // int i = matchRange.second - 1; + // while ( i >= matchRange.first ) { + // size_t currQuotient = matches[i].qInfo.pos / 3; + // uint8_t minHamming = matches[i].hamming; + // Match * minHammingMatch = & matches[i]; + // TaxID minHammingTaxId = minHammingMatch->targetId; + // i --; + // while ( (i >= matchRange.first) && (currQuotient == matches[i].qInfo.pos / 3) ) { + // if (matches[i].hamming < minHamming) { + // minHamming = matches[i].hamming; + // minHammingMatch = & matches[i]; + // minHammingTaxId = minHammingMatch->targetId; + // } else if (matches[i].hamming == minHamming) { + // minHammingTaxId = taxonomy->LCA(minHammingTaxId, matches[i].targetId); + // minHammingMatch->redundancy = true; + // matches[i].redundancy = true; + // } + // i--; + // } + // taxCnt[minHammingTaxId]++; + // } + unordered_map cladeCnt; getSpeciesCladeCounts(taxCnt, cladeCnt, spTaxId); @@ -333,8 +471,7 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, const Match *matchList, size_t end, size_t offset, - int queryLength, - const LocalParameters &par) { + int queryLength) { TaxID currentSpecies; vector filteredMatches; vector> matchesForEachSpecies; @@ -356,14 +493,84 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, i ++; } if (curFrameMatches.size() > 1) { - remainConsecutiveMatches(curFrameMatches, filteredMatches, currentSpecies, par); + remainConsecutiveMatches(curFrameMatches, filteredMatches, currentSpecies); } } // Construct a match combination using filtered matches of current species // so that it can best cover the query, and score the combination if (!filteredMatches.empty()) { matchesForEachSpecies.push_back(filteredMatches); - speciesScores.push_back(scoreGenus(filteredMatches, queryLength)); + speciesScores.push_back(scoreTaxon(filteredMatches, currentSpecies, queryLength)); + } + filteredMatches.clear(); + } + + // If there are no meaningful species + if (speciesScores.empty()) { + bestScore.score = 0; + return bestScore; + } + + TaxonScore maxScore = *max_element(speciesScores.begin(), speciesScores.end(), + [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); + + vector maxIdx; + for (size_t g = 0; g < speciesScores.size(); g++) { + if (speciesScores[g].score == maxScore.score) { + maxIdx.push_back(g); + } + } + bestScore = maxScore; + + for (unsigned long g : maxIdx) { + for (const Match * m : matchesForEachSpecies[g]) { + speciesMatches.push_back(*m); + } + } + + // More than one species + if (maxIdx.size() > 1) { + bestScore.taxId = 0; + } + + return bestScore; +} + +TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, + const Match *matchList, + size_t end, + size_t offset, + int readLength1, + int readLength2) { + TaxID currentSpecies; + vector filteredMatches; + vector> 
matchesForEachSpecies; + vector speciesScores; + TaxonScore bestScore; + size_t i = offset; + uint8_t curFrame; + vector curFrameMatches; + + while (i < end + 1) { + currentSpecies = matchList[i].speciesId; + // For current species + while ((i < end + 1) && currentSpecies == matchList[i].speciesId) { + curFrame = matchList[i].qInfo.frame; + curFrameMatches.clear(); + // For current frame + while ((i < end + 1) && currentSpecies == matchList[i].speciesId && curFrame == matchList[i].qInfo.frame) { + curFrameMatches.push_back(&matchList[i]); + i ++; + } + if (curFrameMatches.size() > 1) { + remainConsecutiveMatches(curFrameMatches, filteredMatches, currentSpecies); + } + } + // Construct a match combination using filtered matches of current species + // so that it can best cover the query, and score the combination + if (!filteredMatches.empty()) { + matchesForEachSpecies.push_back(filteredMatches); + speciesScores.push_back(scoreTaxon(filteredMatches, currentSpecies, readLength1, readLength2)); } filteredMatches.clear(); } @@ -400,7 +607,7 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, } TaxonScore Taxonomer::getBestGenusMatches(vector &genusMatches, const Match *matchList, size_t end, - size_t offset, int readLength1, int readLength2, const LocalParameters & par) { + size_t offset, int readLength1, int readLength2) { TaxID currentGenus; TaxID currentSpecies; @@ -433,7 +640,7 @@ TaxonScore Taxonomer::getBestGenusMatches(vector &genusMatches, const Mat i ++; } if (curFrameMatches.size() > 1) { - remainConsecutiveMatches(curFrameMatches, filteredMatches, currentGenus, par); + remainConsecutiveMatches(curFrameMatches, filteredMatches, currentGenus); } } } @@ -442,7 +649,7 @@ TaxonScore Taxonomer::getBestGenusMatches(vector &genusMatches, const Mat // so that it can best cover the query, and score the combination if (!filteredMatches.empty()) { matchesForEachGenus.push_back(filteredMatches); - genusScores.push_back(scoreGenus(filteredMatches, readLength1, readLength2)); + genusScores.push_back(scoreTaxon(filteredMatches, currentGenus, readLength1, readLength2)); } filteredMatches.clear(); } @@ -488,8 +695,7 @@ TaxonScore Taxonomer::getBestGenusMatches(vector &genusMatches, const Mat void Taxonomer::remainConsecutiveMatches(vector & curFrameMatches, vector & filteredMatches, - TaxID genusId, - const LocalParameters & par) { + TaxID genusId) { size_t i = 0; size_t end = curFrameMatches.size(); vector> curPosMatches; // @@ -537,9 +743,9 @@ void Taxonomer::remainConsecutiveMatches(vector & curFrameMatches // } // Iterate linkedMatches to get filteredMatches - int MIN_DEPTH = par.minConsCnt - 1; - if (taxonomy->IsAncestor(par.eukaryotaTaxId, genusId)) { - MIN_DEPTH = par.minConsCntEuk - 1; + int MIN_DEPTH = minConsCnt - 1; + if (taxonomy->IsAncestor(eukaryotaTaxId, genusId)) { + MIN_DEPTH = minConsCntEuk - 1; } unordered_set used; vector filteredMatchIdx; @@ -597,116 +803,116 @@ size_t Taxonomer::DFS(size_t curMatchIdx, const map> & li return maxDepth; } -TaxonScore Taxonomer::getBestGenusMatches_spaced(vector &genusMatches, const Match *matchList, size_t end, - size_t offset, int readLength1, int readLength2) { - TaxID currentGenus; - TaxID currentSpecies; - - vector tempMatchContainer; - vector filteredMatches; - vector> matchesForEachGenus; - vector conservedWithinGenus; - vector genusScores; - TaxonScore bestScore; - size_t i = offset; - bool lastIn; - while (i + 1 < end + 1) { - currentGenus = matchList[i].genusId; - // For current genus - while ((i + 1 < end + 1) && 
currentGenus == matchList[i].genusId) { -// currentSpecies = taxId2speciesId[matchList[i].targetId]; - currentSpecies = matchList[i].speciesId; - // For current species - // Filter un-consecutive matches (probably random matches) - lastIn = false; - int distance = 0; - int diffPosCntOfCurrRange = 1; - int dnaDist = 0; - - // For the same species - while ((i + 1 < end + 1) && currentSpecies == matchList[i + 1].speciesId) { - distance = matchList[i+1].qInfo.pos / 3 - matchList[i].qInfo.pos / 3; - dnaDist = matchList[i+1].qInfo.pos - matchList[i].qInfo.pos; - if (distance == 0) { // At the same position - tempMatchContainer.push_back(matchList + i); - } else if (dnaDist < (8 + spaceNum + maxGap) * 3) { // Overlapping - lastIn = true; - tempMatchContainer.push_back(matchList + i); - diffPosCntOfCurrRange ++; - } else { // Not consecutive --> End range - if (lastIn){ - tempMatchContainer.push_back(matchList + i); - if (diffPosCntOfCurrRange >= minCoveredPos) { - filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), - tempMatchContainer.end()); - } - } - lastIn = false; - // Initialize range info - tempMatchContainer.clear(); - diffPosCntOfCurrRange = 1; - } - i++; - } - - // Met next species - if (lastIn) { - tempMatchContainer.push_back(matchList + i); - if (diffPosCntOfCurrRange >= minCoveredPos) { - filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), - tempMatchContainer.end()); - } - } - tempMatchContainer.clear(); - i++; - } - - // Construct a match combination using filtered matches of current genus - // so that it can best cover the query, and score the combination - if (!filteredMatches.empty()) { - genusScores.push_back(scoreGenus(filteredMatches, readLength1, readLength2)); - } - filteredMatches.clear(); - } +// TaxonScore Taxonomer::getBestGenusMatches_spaced(vector &genusMatches, const Match *matchList, size_t end, +// size_t offset, int readLength1, int readLength2) { +// TaxID currentGenus; +// TaxID currentSpecies; + +// vector tempMatchContainer; +// vector filteredMatches; +// vector> matchesForEachGenus; +// vector conservedWithinGenus; +// vector genusScores; +// TaxonScore bestScore; +// size_t i = offset; +// bool lastIn; +// while (i + 1 < end + 1) { +// currentGenus = matchList[i].genusId; +// // For current genus +// while ((i + 1 < end + 1) && currentGenus == matchList[i].genusId) { +// // currentSpecies = taxId2speciesId[matchList[i].targetId]; +// currentSpecies = matchList[i].speciesId; +// // For current species +// // Filter un-consecutive matches (probably random matches) +// lastIn = false; +// int distance = 0; +// int diffPosCntOfCurrRange = 1; +// int dnaDist = 0; + +// // For the same species +// while ((i + 1 < end + 1) && currentSpecies == matchList[i + 1].speciesId) { +// distance = matchList[i+1].qInfo.pos / 3 - matchList[i].qInfo.pos / 3; +// dnaDist = matchList[i+1].qInfo.pos - matchList[i].qInfo.pos; +// if (distance == 0) { // At the same position +// tempMatchContainer.push_back(matchList + i); +// } else if (dnaDist < (8 + spaceNum + maxGap) * 3) { // Overlapping +// lastIn = true; +// tempMatchContainer.push_back(matchList + i); +// diffPosCntOfCurrRange ++; +// } else { // Not consecutive --> End range +// if (lastIn){ +// tempMatchContainer.push_back(matchList + i); +// if (diffPosCntOfCurrRange >= minCoveredPos) { +// filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), +// tempMatchContainer.end()); +// } +// } +// lastIn = false; +// // Initialize range info +// 
tempMatchContainer.clear(); +// diffPosCntOfCurrRange = 1; +// } +// i++; +// } + +// // Met next species +// if (lastIn) { +// tempMatchContainer.push_back(matchList + i); +// if (diffPosCntOfCurrRange >= minCoveredPos) { +// filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), +// tempMatchContainer.end()); +// } +// } +// tempMatchContainer.clear(); +// i++; +// } - // If there are no meaningful genus - if (genusScores.empty()) { - bestScore.score = 0; - return bestScore; - } +// // Construct a match combination using filtered matches of current genus +// // so that it can best cover the query, and score the combination +// if (!filteredMatches.empty()) { +// genusScores.push_back(scoreTaxon(filteredMatches, readLength1, readLength2)); +// } +// filteredMatches.clear(); +// } + +// // If there are no meaningful genus +// if (genusScores.empty()) { +// bestScore.score = 0; +// return bestScore; +// } + +// TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), +// [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); + +// vector maxIdx; +// for (size_t g = 0; g < genusScores.size(); g++) { +// if (genusScores[g].score > maxScore.score * 0.95f) { +// maxIdx.push_back(g); +// } +// } +// bestScore = maxScore; - TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), - [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); +// for (unsigned long g : maxIdx) { +// for (const Match * m : matchesForEachGenus[g]) { +// genusMatches.push_back(*m); +// } +// } - vector maxIdx; - for (size_t g = 0; g < genusScores.size(); g++) { - if (genusScores[g].score > maxScore.score * 0.95f) { - maxIdx.push_back(g); - } - } - bestScore = maxScore; +// // More than one genus +// if (maxIdx.size() > 1) { +// bestScore.taxId = 0; +// return bestScore; +// } +// return bestScore; - for (unsigned long g : maxIdx) { - for (const Match * m : matchesForEachGenus[g]) { - genusMatches.push_back(*m); - } - } - - // More than one genus - if (maxIdx.size() > 1) { - bestScore.taxId = 0; - return bestScore; - } - return bestScore; - - //Three cases - //1. one genus - //2. more than one genus - //4. no genus -} +// //Three cases +// //1. one genus +// //2. more than one genus +// //4. no genus +// } TaxonScore Taxonomer::getBestGenusMatches(vector &genusMatches, const Match *matchList, size_t end, - size_t offset, int queryLength, const LocalParameters & par) { + size_t offset, int queryLength) { TaxID currentGenus; TaxID currentSpecies; @@ -735,7 +941,7 @@ TaxonScore Taxonomer::getBestGenusMatches(vector &genusMatches, const Mat i ++; } if (curFrameMatches.size() > 1) { - remainConsecutiveMatches(curFrameMatches, filteredMatches, currentGenus, par); + remainConsecutiveMatches(curFrameMatches, filteredMatches, currentGenus); } } } @@ -745,7 +951,7 @@ TaxonScore Taxonomer::getBestGenusMatches(vector &genusMatches, const Mat if (!filteredMatches.empty()) { matchesForEachGenus.push_back(filteredMatches); - genusScores.push_back(scoreGenus(filteredMatches, queryLength)); + genusScores.push_back(scoreTaxon(filteredMatches, currentGenus, queryLength)); } filteredMatches.clear(); } @@ -787,116 +993,117 @@ TaxonScore Taxonomer::getBestGenusMatches(vector &genusMatches, const Mat //4. 
no genus } -TaxonScore Taxonomer::getBestGenusMatches_spaced(vector &genusMatches, const Match *matchList, size_t end, - size_t offset, int readLength) { - TaxID currentGenus; - TaxID currentSpecies; - - vector tempMatchContainer; - vector filteredMatches; - vector> matchesForEachGenus; - vector conservedWithinGenus; - vector genusScores; - TaxonScore bestScore; - size_t i = offset; - bool lastIn; - size_t speciesMatchCnt; - while (i + 1 < end + 1) { - currentGenus = matchList[i].genusId; - // For current genus - while ((i + 1 < end + 1) && currentGenus == matchList[i].genusId) { - currentSpecies = matchList[i].speciesId; - // For current species - // Filter un-consecutive matches (probably random matches) - lastIn = false; - int distance = 0; - int diffPosCntOfCurrRange = 1; - int dnaDist = 0; - - // For the same species - while ((i + 1 < end + 1) && currentSpecies == matchList[i + 1].speciesId) { - distance = matchList[i + 1].qInfo.pos / 3 - matchList[i].qInfo.pos / 3; - dnaDist = matchList[i + 1].qInfo.pos - matchList[i].qInfo.pos; - if (distance == 0) { // At the same position - tempMatchContainer.push_back(matchList + i); - } else if (dnaDist < (8 + spaceNum + maxGap) * 3) { // Overlapping - lastIn = true; - tempMatchContainer.push_back(matchList + i); - diffPosCntOfCurrRange++; - } else { // Not consecutive --> End range - if (lastIn) { - tempMatchContainer.push_back(matchList + i); - if (diffPosCntOfCurrRange >= minCoveredPos) { - filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), - tempMatchContainer.end()); - } - } - lastIn = false; - // Initialize range info - tempMatchContainer.clear(); - diffPosCntOfCurrRange = 1; - } - i++; - } - - // Met next species - if (lastIn) { - tempMatchContainer.push_back(matchList + i); - if (diffPosCntOfCurrRange >= minCoveredPos) { - filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), - tempMatchContainer.end()); - } - } - tempMatchContainer.clear(); - i++; - } - - // Construct a match combination using filtered matches of current genus - // so that it can best cover the query, and score the combination - if (!filteredMatches.empty()) { - genusScores.push_back(scoreGenus(filteredMatches, readLength)); - } - filteredMatches.clear(); - } - - // If there are no meaningful genus - if (genusScores.empty()) { - bestScore.score = 0; - return bestScore; - } - - TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), - [](const TaxonScore &a, const TaxonScore &b) { return a.score < b.score; }); - - vector maxIdx; - for (size_t g = 0; g < genusScores.size(); g++) { - if (genusScores[g].score > maxScore.score * 0.95f) { - maxIdx.push_back(g); - } - } - bestScore = maxScore; - - for (unsigned long g: maxIdx) { - genusMatches.insert(genusMatches.end(), - matchesForEachGenus[g].begin(), - matchesForEachGenus[g].end()); - } - - // More than one genus - if (maxIdx.size() > 1) { - bestScore.taxId = 0; - return bestScore; - } - return bestScore; - - //Three cases - //1. one genus - //2. more than one genus - //4. 
no genus -} +// TaxonScore Taxonomer::getBestGenusMatches_spaced(vector &genusMatches, const Match *matchList, size_t end, +// size_t offset, int readLength) { +// TaxID currentGenus; +// TaxID currentSpecies; + +// vector tempMatchContainer; +// vector filteredMatches; +// vector> matchesForEachGenus; +// vector conservedWithinGenus; +// vector genusScores; +// TaxonScore bestScore; +// size_t i = offset; +// bool lastIn; +// size_t speciesMatchCnt; +// while (i + 1 < end + 1) { +// currentGenus = matchList[i].genusId; +// // For current genus +// while ((i + 1 < end + 1) && currentGenus == matchList[i].genusId) { +// currentSpecies = matchList[i].speciesId; +// // For current species +// // Filter un-consecutive matches (probably random matches) +// lastIn = false; +// int distance = 0; +// int diffPosCntOfCurrRange = 1; +// int dnaDist = 0; + +// // For the same species +// while ((i + 1 < end + 1) && currentSpecies == matchList[i + 1].speciesId) { +// distance = matchList[i + 1].qInfo.pos / 3 - matchList[i].qInfo.pos / 3; +// dnaDist = matchList[i + 1].qInfo.pos - matchList[i].qInfo.pos; +// if (distance == 0) { // At the same position +// tempMatchContainer.push_back(matchList + i); +// } else if (dnaDist < (8 + spaceNum + maxGap) * 3) { // Overlapping +// lastIn = true; +// tempMatchContainer.push_back(matchList + i); +// diffPosCntOfCurrRange++; +// } else { // Not consecutive --> End range +// if (lastIn) { +// tempMatchContainer.push_back(matchList + i); +// if (diffPosCntOfCurrRange >= minCoveredPos) { +// filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), +// tempMatchContainer.end()); +// } +// } +// lastIn = false; +// // Initialize range info +// tempMatchContainer.clear(); +// diffPosCntOfCurrRange = 1; +// } +// i++; +// } + +// // Met next species +// if (lastIn) { +// tempMatchContainer.push_back(matchList + i); +// if (diffPosCntOfCurrRange >= minCoveredPos) { +// filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), +// tempMatchContainer.end()); +// } +// } +// tempMatchContainer.clear(); +// i++; +// } -TaxonScore Taxonomer::scoreGenus(vector &filteredMatches, - int queryLength) { +// // Construct a match combination using filtered matches of current genus +// // so that it can best cover the query, and score the combination +// if (!filteredMatches.empty()) { +// genusScores.push_back(scoreTaxon(filteredMatches, readLength)); +// } +// filteredMatches.clear(); +// } + +// // If there are no meaningful genus +// if (genusScores.empty()) { +// bestScore.score = 0; +// return bestScore; +// } + +// TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), +// [](const TaxonScore &a, const TaxonScore &b) { return a.score < b.score; }); + +// vector maxIdx; +// for (size_t g = 0; g < genusScores.size(); g++) { +// if (genusScores[g].score > maxScore.score * 0.95f) { +// maxIdx.push_back(g); +// } +// } +// bestScore = maxScore; + +// for (unsigned long g: maxIdx) { +// genusMatches.insert(genusMatches.end(), +// matchesForEachGenus[g].begin(), +// matchesForEachGenus[g].end()); +// } + +// // More than one genus +// if (maxIdx.size() > 1) { +// bestScore.taxId = 0; +// return bestScore; +// } +// return bestScore; + +// //Three cases +// //1. one genus +// //2. more than one genus +// //4. 
no genus +// } + +TaxonScore Taxonomer::scoreTaxon(vector &filteredMatches, + TaxID taxId, + int queryLength) { // Calculate Hamming distance & covered length int coveredPosCnt = 0; uint16_t currHammings; @@ -948,12 +1155,13 @@ TaxonScore Taxonomer::scoreGenus(vector &filteredMatches, float score = ((float) coveredLength - hammingSum) / (float) queryLength; float coverage = (float) (coveredLength) / (float) (queryLength); - return {filteredMatches[0]->genusId, score, coverage, (int) hammingSum}; + return {taxId, score, coverage, (int) hammingSum}; } -TaxonScore Taxonomer::scoreGenus(vector &filteredMatches, - int readLength1, - int readLength2) { +TaxonScore Taxonomer::scoreTaxon(vector &filteredMatches, + TaxID taxId, + int readLength1, + int readLength2) { // Calculate Hamming distance & covered length uint16_t currHammings; @@ -1024,7 +1232,7 @@ TaxonScore Taxonomer::scoreGenus(vector &filteredMatches, float coverage = (float) (coveredLength_read1 + coveredLength_read2) / (float) (readLength1 + readLength2); // matchesForEachGenus.push_back(move(filteredMatches)); - return {filteredMatches[0]->genusId, score, coverage, (int) hammingSum}; + return {taxId, score, coverage, (int) hammingSum}; } TaxonScore Taxonomer::chooseSpecies(const vector &matches, diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h index 8a1e0999..a756db73 100644 --- a/src/commons/Taxonomer.h +++ b/src/commons/Taxonomer.h @@ -32,6 +32,9 @@ class Taxonomer { int minCoveredPos; int accessionLevel; int minSSMatch; + int minConsCnt; + int minConsCntEuk; + int eukaryotaTaxId; struct MatchBlock { MatchBlock(size_t start, size_t end, int id) : start(start), end(end), id(id) {} @@ -72,8 +75,7 @@ class Taxonomer { void remainConsecutiveMatches(vector & curFrameMatches, vector & filteredMatches, - TaxID genusId, - const LocalParameters & par); + TaxID genusId); size_t DFS(size_t curMatchIdx, const map>& linkedMatches, vector& fiteredMatchIdx, size_t depth, size_t MIN_DEPTH, unordered_set& used, @@ -82,23 +84,28 @@ class Taxonomer { static bool isConsecutive(const Match * match1, const Match * match2); TaxonScore getBestGenusMatches(vector &matchesForMajorityLCA, const Match *matchList, size_t end, - size_t offset, int queryLength, const LocalParameters &par); + size_t offset, int queryLength); TaxonScore getBestGenusMatches(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, - int readLength1, int readLength2, const LocalParameters &par); + int readLength1, int readLength2); TaxonScore getBestSpeciesMatches(vector &matchesForMajorityLCA, const Match *matchList, size_t end, - size_t offset, int queryLength, const LocalParameters &par); + size_t offset, int queryLength); + + TaxonScore getBestSpeciesMatches(vector &matchesForMajorityLCA, const Match *matchList, size_t end, + size_t offset, int readLength1, int readLength2); - TaxonScore getBestGenusMatches_spaced(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, - int readLength1, int readLength2); - TaxonScore getBestGenusMatches_spaced(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, - int readLength1); + // TaxonScore getBestGenusMatches_spaced(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, + // int readLength1, int readLength2); + // TaxonScore getBestGenusMatches_spaced(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, + // int readLength1); - TaxonScore scoreGenus(vector &filteredMatches, + TaxonScore 
scoreTaxon(vector &filteredMatches, + TaxID taxId, int queryLength); - TaxonScore scoreGenus(vector &filteredMatches, + TaxonScore scoreTaxon(vector &filteredMatches, + TaxID taxId, int readLength1, int readLength2); @@ -129,7 +136,7 @@ class Taxonomer { int queryLength, int queryLength2); - TaxID lowerRankClassification(vector &matches, pair &matchRange, TaxID speciesID); + TaxID lowerRankClassification(vector &matches, TaxID speciesID); void getSpeciesCladeCounts(const unordered_map & taxCnt, unordered_map & cladeCnt, From 4ce6f0aecaf664395e814482ed4643a64cc2e56c Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Tue, 31 Oct 2023 17:31:05 +0900 Subject: [PATCH 51/65] use minimum Hamming dist. instead of maximum --- src/commons/Taxonomer.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index b52f4447..8d4bf351 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -362,7 +362,7 @@ TaxID Taxonomer::lowerRankClassification(vector &matches, TaxID spTaxId) for (size_t i = 0; i < matchNum; i++) { size_t currQuotient = matches[i].qInfo.pos / 3; - uint8_t minHamming = matches[i].hamming; + uint8_t minHamming = 0; //matches[i].hamming; Match * minHammingMatch = & matches[i]; TaxID minHammingTaxId = minHammingMatch->targetId; while ((i < matchNum) && (currQuotient == matches[i].qInfo.pos / 3)) { @@ -1114,7 +1114,7 @@ TaxonScore Taxonomer::scoreTaxon(vector &filteredMatches, // Get the largest hamming distance at each position of query auto *hammingsAtEachPos = new signed char[aminoAcidNum + 1]; - memset(hammingsAtEachPos, -1, (aminoAcidNum + 1)); + memset(hammingsAtEachPos, 24, (aminoAcidNum + 1)); while (f < matchNum) { currPos = filteredMatches[f]->qInfo.pos / 3; currHammings = filteredMatches[f]->rightEndHamming; @@ -1171,27 +1171,27 @@ TaxonScore Taxonomer::scoreTaxon(vector &filteredMatches, size_t matchNum = filteredMatches.size(); size_t f = 0; - // Get the largest hamming distance at each position of query + // Get the smallest hamming distance at each position of query auto *hammingsAtEachPos = new signed char[aminoAcidNum_total + 3]; - memset(hammingsAtEachPos, -1, (aminoAcidNum_total + 3)); + memset(hammingsAtEachPos, 24, (aminoAcidNum_total + 3)); while (f < matchNum) { currPos = (int) filteredMatches[f]->qInfo.pos / 3; currHammings = filteredMatches[f]->rightEndHamming; - if (GET_2_BITS(currHammings) > hammingsAtEachPos[currPos + unmaskedPos[0]]) + if (GET_2_BITS(currHammings) < hammingsAtEachPos[currPos + unmaskedPos[0]]) hammingsAtEachPos[currPos + unmaskedPos[0]] = GET_2_BITS(currHammings); - if (GET_2_BITS(currHammings >> 2) > hammingsAtEachPos[currPos + unmaskedPos[1]]) + if (GET_2_BITS(currHammings >> 2) < hammingsAtEachPos[currPos + unmaskedPos[1]]) hammingsAtEachPos[currPos + unmaskedPos[1]] = GET_2_BITS(currHammings >> 2); - if (GET_2_BITS(currHammings >> 4) > hammingsAtEachPos[currPos + unmaskedPos[2]]) + if (GET_2_BITS(currHammings >> 4) < hammingsAtEachPos[currPos + unmaskedPos[2]]) hammingsAtEachPos[currPos + unmaskedPos[2]] = GET_2_BITS(currHammings >> 4); - if (GET_2_BITS(currHammings >> 6) > hammingsAtEachPos[currPos + unmaskedPos[3]]) + if (GET_2_BITS(currHammings >> 6) < hammingsAtEachPos[currPos + unmaskedPos[3]]) hammingsAtEachPos[currPos + unmaskedPos[3]] = GET_2_BITS(currHammings >> 6); - if (GET_2_BITS(currHammings >> 8) > hammingsAtEachPos[currPos + unmaskedPos[4]]) + if (GET_2_BITS(currHammings >> 8) < hammingsAtEachPos[currPos + 
unmaskedPos[4]])
             hammingsAtEachPos[currPos + unmaskedPos[4]] = GET_2_BITS(currHammings >> 8);
-        if (GET_2_BITS(currHammings >> 10) > hammingsAtEachPos[currPos + unmaskedPos[5]])
+        if (GET_2_BITS(currHammings >> 10) < hammingsAtEachPos[currPos + unmaskedPos[5]])
             hammingsAtEachPos[currPos + unmaskedPos[5]] = GET_2_BITS(currHammings >> 10);
-        if (GET_2_BITS(currHammings >> 12) > hammingsAtEachPos[currPos + unmaskedPos[6]])
+        if (GET_2_BITS(currHammings >> 12) < hammingsAtEachPos[currPos + unmaskedPos[6]])
             hammingsAtEachPos[currPos + unmaskedPos[6]] = GET_2_BITS(currHammings >> 12);
-        if (GET_2_BITS(currHammings >> 14) > hammingsAtEachPos[currPos + unmaskedPos[7]])
+        if (GET_2_BITS(currHammings >> 14) < hammingsAtEachPos[currPos + unmaskedPos[7]])
             hammingsAtEachPos[currPos + unmaskedPos[7]] = GET_2_BITS(currHammings >> 14);
         f++;
     }

From cf407d729ba8c813ed56121de5a42c8528cfa515 Mon Sep 17 00:00:00 2001
From: Jaebeom Kim 
Date: Tue, 31 Oct 2023 22:01:20 +0900
Subject: [PATCH 52/65] util/mapping2taxon.cpp

---
 src/util/mapping2taxon.cpp | 103 +++++++++++++++++++++++++++++++++++++
 1 file changed, 103 insertions(+)
 create mode 100644 src/util/mapping2taxon.cpp

diff --git a/src/util/mapping2taxon.cpp b/src/util/mapping2taxon.cpp
new file mode 100644
index 00000000..d7aedfdd
--- /dev/null
+++ b/src/util/mapping2taxon.cpp
@@ -0,0 +1,103 @@
+#include <iostream>
+#include <string>
+#include <vector>
+#include <unordered_map>
+#include "Command.h"
+#include "LocalParameters.h"
+#include "NcbiTaxonomy.h"
+#include "common.h"
+#include "fstream"
+#include <sstream>
+#include <algorithm>
+
+using namespace std;
+
+struct read2taxon {
+    string read;
+    TaxID taxon;
+};
+
+int parseTaxId_metamaps(const string & mappingRes) {
+    vector<string> tokens = Util::split(mappingRes, " ");
+    return stoi(Util::split(tokens[5], "|")[2]);
+}
+
+// It takes a mapping result of MetaMaps.
+// The mapping result includes multiple mappings per read, each with a mapping score.
+// The function returns the taxon ID of the best-scoring mapping.
+// If multiple mappings share the best score, it returns their LCA.
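+//
+// A hypothetical invocation (the subcommand and file names below are
+// illustrative, not taken from this patch):
+//
+//     mapping2taxon sample.metamaps.mapping taxonomy/
+//
+// One "<readID>\t<taxID>" line per read is then written to
+// "sample.metamaps.mapping.reads2taxon".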
+int mapping2taxon(int argc, const char **argv, const Command &command) {
+    LocalParameters &par = LocalParameters::getLocalInstance();
+    par.parseParameters(argc, argv, command, false, Parameters::PARSE_ALLOW_EMPTY, 0);
+    string mappingFile = par.filenames[0];
+    string taxonomyDir = par.filenames[1];
+    string output = mappingFile + ".reads2taxon";
+    ofstream out(output);
+
+    vector<read2taxon> read2taxon;
+    NcbiTaxonomy *taxonomy = loadTaxonomy("", taxonomyDir);
+    cout << "Taxonomy loaded" << endl;
+
+    // Iterate through mapping file
+    ifstream mapping(mappingFile);
+    string line;
+    vector<TaxID> taxIds;
+    string previousRead = "";
+    double bestScore = -2;
+    TaxID bestTaxId = -1;
+    bool lastStored = false;
+
+    while (getline(mapping, line)) {
+        vector<string> tokens = Util::split(line, " ");
+        string currentRead = tokens[0];
+        if (currentRead == previousRead) { // Same read
+            // Get score
+            stringstream scoreString(tokens[13]);
+            double curScore = 0;
+            scoreString >> curScore;
+
+            if (curScore > bestScore) {
+                taxIds.clear();
+                bestScore = curScore;
+                bestTaxId = parseTaxId_metamaps(line);
+                taxIds.push_back(bestTaxId);
+            } else if (curScore == bestScore) {
+                taxIds.push_back(parseTaxId_metamaps(line));
+                bestTaxId = taxonomy->LCA(taxIds)->taxId;
+            }
+            lastStored = false;
+        } else { // New read
+            // Store results for previous read
+            // out << previousRead << "\t" << bestTaxId << endl;
+            read2taxon.push_back({previousRead, bestTaxId});
+            lastStored = true;
+
+            // Initialize variables
+            previousRead = currentRead;
+            taxIds.clear();
+
+            // Get score
+            stringstream scoreString(tokens[13]);
+            double curScore = 0;
+            scoreString >> curScore;
+
+            // Update variables
+            bestScore = curScore;
+            bestTaxId = parseTaxId_metamaps(line);
+            taxIds.push_back(bestTaxId);
+        }
+    }
+
+    if (!lastStored) {
+        // out << previousRead << "\t" << bestTaxId << endl;
+        read2taxon.push_back({previousRead, bestTaxId});
+    }
+
+    // Write to file
+    cout << "Writing to file" << endl;
+    for (size_t i = 1; i < read2taxon.size(); i++) {
+        out << read2taxon[i].read << "\t" << read2taxon[i].taxon << "\n";
+    }
+
+    return 0;
+}

From 3bb9c2c8de092252bb7e9c8ee67ae1c52db116ae Mon Sep 17 00:00:00 2001
From: Jaebeom Kim 
Date: Tue, 31 Oct 2023 23:13:41 +0900
Subject: [PATCH 53/65] max DNA Hamming dist. 
is set to 6

---
 src/commons/KmerMatcher.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp
index f99ccbd3..8a13ceb1 100644
--- a/src/commons/KmerMatcher.cpp
+++ b/src/commons/KmerMatcher.cpp
@@ -516,7 +516,7 @@ void KmerMatcher::compareDna(uint64_t query,
 
   // Select target k-mers that passed hamming criteria
   for (size_t h = 0; h < size; h++) {
-    if (hammingSums[h] <= minHammingSum + hammingMargin) {
+    if (hammingSums[h] <= 6) { // was: minHammingSum + hammingMargin
       selectedMatches.push_back(h);
       selectedHammingSum.push_back(hammingSums[h]);
       if (frame < 3) {

From 3533ff4493296b15f20debb829d6e0cc3fadd33a Mon Sep 17 00:00:00 2001
From: Jaebeom Kim 
Date: Tue, 31 Oct 2023 23:24:53 +0900
Subject: [PATCH 54/65] fix error in scoreTaxon: -1 --> 24

---
 src/commons/Taxonomer.cpp | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp
index 8d4bf351..2e28be47 100644
--- a/src/commons/Taxonomer.cpp
+++ b/src/commons/Taxonomer.cpp
@@ -356,7 +356,6 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery,
 }
 
 TaxID Taxonomer::lowerRankClassification(vector<Match> &matches, TaxID spTaxId) {
-    unordered_map<TaxID, unsigned int> taxCnt;
     size_t matchNum = matches.size();
 
@@ -1112,27 +1111,27 @@ TaxonScore Taxonomer::scoreTaxon(vector<const Match *> &filteredMatches,
     size_t matchNum = filteredMatches.size();
     size_t f = 0;
 
-    // Get the largest hamming distance at each position of query
+    // Get the smallest hamming distance at each position of query
     auto *hammingsAtEachPos = new signed char[aminoAcidNum + 1];
     memset(hammingsAtEachPos, 24, (aminoAcidNum + 1));
     while (f < matchNum) {
         currPos = filteredMatches[f]->qInfo.pos / 3;
         currHammings = filteredMatches[f]->rightEndHamming;
-        if (GET_2_BITS(currHammings) > hammingsAtEachPos[currPos + unmaskedPos[0]])
+        if (GET_2_BITS(currHammings) < hammingsAtEachPos[currPos + unmaskedPos[0]])
             hammingsAtEachPos[currPos + unmaskedPos[0]] = GET_2_BITS(currHammings);
-        if (GET_2_BITS(currHammings >> 2) > hammingsAtEachPos[currPos + unmaskedPos[1]])
+        if (GET_2_BITS(currHammings >> 2) < hammingsAtEachPos[currPos + unmaskedPos[1]])
             hammingsAtEachPos[currPos + unmaskedPos[1]] = GET_2_BITS(currHammings >> 2);
-        if (GET_2_BITS(currHammings >> 4) > hammingsAtEachPos[currPos + unmaskedPos[2]])
+        if (GET_2_BITS(currHammings >> 4) < hammingsAtEachPos[currPos + unmaskedPos[2]])
            hammingsAtEachPos[currPos + unmaskedPos[2]] = GET_2_BITS(currHammings >> 4);
-        if (GET_2_BITS(currHammings >> 6) > hammingsAtEachPos[currPos + unmaskedPos[3]])
+        if (GET_2_BITS(currHammings >> 6) < hammingsAtEachPos[currPos + unmaskedPos[3]])
             hammingsAtEachPos[currPos + unmaskedPos[3]] = GET_2_BITS(currHammings >> 6);
-        if (GET_2_BITS(currHammings >> 8) > hammingsAtEachPos[currPos + unmaskedPos[4]])
+        if (GET_2_BITS(currHammings >> 8) < hammingsAtEachPos[currPos + unmaskedPos[4]])
             hammingsAtEachPos[currPos + unmaskedPos[4]] = GET_2_BITS(currHammings >> 8);
-        if (GET_2_BITS(currHammings >> 10) > hammingsAtEachPos[currPos + unmaskedPos[5]])
+        if (GET_2_BITS(currHammings >> 10) < hammingsAtEachPos[currPos + unmaskedPos[5]])
            hammingsAtEachPos[currPos + unmaskedPos[5]] = GET_2_BITS(currHammings >> 10);
-        if (GET_2_BITS(currHammings >> 12) > hammingsAtEachPos[currPos + unmaskedPos[6]])
+        if (GET_2_BITS(currHammings >> 12) < hammingsAtEachPos[currPos + unmaskedPos[6]])
            hammingsAtEachPos[currPos + unmaskedPos[6]] = GET_2_BITS(currHammings >> 12);
-        if (GET_2_BITS(currHammings >> 14) > 
hammingsAtEachPos[currPos + unmaskedPos[7]]) + if (GET_2_BITS(currHammings >> 14) < hammingsAtEachPos[currPos + unmaskedPos[7]]) hammingsAtEachPos[currPos + unmaskedPos[7]] = GET_2_BITS(currHammings >> 14); f++; } @@ -1142,7 +1141,7 @@ TaxonScore Taxonomer::scoreTaxon(vector &filteredMatches, for (int h = 0; h < aminoAcidNum; h++) { if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist. coveredPosCnt++; - } else if (hammingsAtEachPos[h] != -1) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively + } else if (hammingsAtEachPos[h] != 24) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively hammingSum += 1.0f + (0.5f * hammingsAtEachPos[h]); coveredPosCnt++; } @@ -1205,7 +1204,7 @@ TaxonScore Taxonomer::scoreTaxon(vector &filteredMatches, if (h < aminoAcidNum_read1) { if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist. coveredPosCnt_read1++; - } else if (hammingsAtEachPos[h] != -1) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively + } else if (hammingsAtEachPos[h] != 24) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively hammingSum += 1.0f + (0.5f * (float) hammingsAtEachPos[h]); coveredPosCnt_read1++; } @@ -1214,7 +1213,7 @@ TaxonScore Taxonomer::scoreTaxon(vector &filteredMatches, else { if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist. coveredPosCnt_read2++; - } else if (hammingsAtEachPos[h] != -1) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively + } else if (hammingsAtEachPos[h] != 24) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively hammingSum += 1.0f + (0.5f * (float) hammingsAtEachPos[h]); coveredPosCnt_read2++; } From 0741eb4da5c73278e95e735e6fd403cf1a4bdf60 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Tue, 7 Nov 2023 19:51:21 +0900 Subject: [PATCH 55/65] first running version --- src/commons/Classifier.cpp | 3 +- src/commons/KmerExtractor.cpp | 2 +- src/commons/Match.h | 19 +- src/commons/Taxonomer.cpp | 1003 ++++++++++++++++++++------------- src/commons/Taxonomer.h | 53 +- 5 files changed, 682 insertions(+), 398 deletions(-) diff --git a/src/commons/Classifier.cpp b/src/commons/Classifier.cpp index 58130092..46eee5c6 100644 --- a/src/commons/Classifier.cpp +++ b/src/commons/Classifier.cpp @@ -102,7 +102,8 @@ void Classifier::startClassify(const LocalParameters &par) { kmerMatcher->matchKmers(&kmerBuffer, &matchBuffer); kmerMatcher->sortMatches(&matchBuffer); - // Classify queries based on the matches + // Classify queries based on the matches. 
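+        // (Uncommenting the next line forces single-threaded classification;
+        // presumably left here as a debugging aid.)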
+ // omp_set_num_threads(1); taxonomer->assignTaxonomy(matchBuffer.buffer, matchBuffer.startIndexOfReserve, queryList, par); processedSeqCnt += queryReadSplit[splitIdx].end - queryReadSplit[splitIdx].start; cout << "The number of processed sequences: " << processedSeqCnt << " (" << (double) processedSeqCnt / (double) numOfSeq << ")" << endl; diff --git a/src/commons/KmerExtractor.cpp b/src/commons/KmerExtractor.cpp index 232cc89a..30addeb5 100644 --- a/src/commons/KmerExtractor.cpp +++ b/src/commons/KmerExtractor.cpp @@ -196,7 +196,7 @@ void KmerExtractor::fillQueryKmerBufferParallel_paired(KSeqWrapper *kseq1, // Process Read 2 seqIterator2.sixFrameTranslation(maskedSeq2, (int) reads2[i].length()); seqIterator2.fillQueryKmerBuffer(maskedSeq2, (int) reads2[i].length(), kmerBuffer, posToWrite, - (uint32_t) queryIdx, queryList[queryIdx].queryLength); + (uint32_t) queryIdx, queryList[queryIdx].queryLength+3); if (maskMode) { delete[] maskedSeq1; diff --git a/src/commons/Match.h b/src/commons/Match.h index 436eb0bb..f47881ef 100644 --- a/src/commons/Match.h +++ b/src/commons/Match.h @@ -2,7 +2,9 @@ #define ADCLASSIFIER2_MATCH_H #include "Kmer.h" +#include #include +#include "BitManipulateMacros.h" struct Match { // 24 byte Match(){} @@ -26,7 +28,22 @@ struct Match { // 24 byte void printMatch() const { std::cout << qInfo.sequenceID << " " << qInfo.pos << " " << qInfo.frame << " " - << targetId << " " << genusId << " " << speciesId << " " << rightEndHamming << " " << (int)hamming << std::endl; + << targetId << " " << genusId << " " << speciesId << " " << rightEndHamming << " " << (int)hamming << " " << getScore() << std::endl; + } + + + float getScore(float score = 0.0f, int cnt = 0) const { + int currentHamming = GET_2_BITS(rightEndHamming >> cnt * 2); + if (currentHamming == 0) { + score += 3.0f; + } else { + score += 2.0f - 0.5f * currentHamming; + } + if (cnt == 7) { + return score; + } else { + return getScore(score, cnt + 1); + } } }; diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index 2e28be47..a3a3b3b5 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -1,6 +1,7 @@ #include "Taxonomer.h" #include "Match.h" #include "NcbiTaxonomy.h" +#include #include #include @@ -80,7 +81,6 @@ void Taxonomer::chooseBestTaxon2(uint32_t currentQuery, const Match *matchList, vector & queryList, const LocalParameters &par) { - TaxID selectedTaxon; // if (true) { // cout << "# " << currentQuery << " " << queryList[currentQuery].name << endl; @@ -92,7 +92,7 @@ void Taxonomer::chooseBestTaxon2(uint32_t currentQuery, // Get the best species for current query vector speciesMatches; speciesMatches.reserve(end - offset + 1); - TaxonScore speciesScore(0, 0, 0, 0); + TaxonScore speciesScore(0, 0, 0, 0, 0); if (par.seqMode == 2) { speciesScore = getBestSpeciesMatches(speciesMatches, matchList, end, offset, queryList[currentQuery].queryLength, @@ -121,22 +121,23 @@ void Taxonomer::chooseBestTaxon2(uint32_t currentQuery, return; } - // If there are two or more good genus level candidates, find the LCA. - if (speciesScore.taxId == 0) { - vector genusList; - genusList.reserve(speciesMatches.size()); - for (auto & genusMatch : speciesMatches) { - genusList.push_back(genusMatch.genusId); - } - selectedTaxon = taxonomy->LCA(genusList)->taxId; + // If there are two or more good species level candidates, find the LCA. 
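+    // (For example, if two species of the same genus tie for the best score,
+    // the read is assigned their lowest common ancestor, typically that genus.)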
+ if (speciesScore.LCA) { + // cout << "LCA" << endl; + // vector genusList; + // genusList.reserve(speciesMatches.size()); + // for (auto & genusMatch : speciesMatches) { + // genusList.push_back(genusMatch.genusId); + // } + // selectedTaxon = taxonomy->LCA(genusList)->taxId; queryList[currentQuery].isClassified = true; - queryList[currentQuery].classification = selectedTaxon; + queryList[currentQuery].classification = speciesScore.taxId; queryList[currentQuery].score = speciesScore.score; queryList[currentQuery].coverage = speciesScore.coverage; queryList[currentQuery].hammingDist = speciesScore.hammingDist; - for (auto & spMatch : speciesMatches) { - queryList[currentQuery].taxCnt[spMatch.targetId]++; - } + // for (auto & spMatch : speciesMatches) { + // queryList[currentQuery].taxCnt[spMatch.targetId]++; + // } return; } @@ -164,9 +165,10 @@ void Taxonomer::chooseBestTaxon2(uint32_t currentQuery, // return a.qInfo.position / 3 < b.qInfo.position / 3; // }); - sort(speciesMatches.begin(), speciesMatches.end(), - [](const Match & a, const Match & b) { return a.qInfo.pos < b.qInfo.pos; }); + // sort(speciesMatches.begin(), speciesMatches.end(), + // [](const Match & a, const Match & b) { return a.qInfo.pos < b.qInfo.pos; }); + cout << "7 " << currentQuery << endl; TaxID result = lowerRankClassification(speciesMatches, speciesScore.taxId); @@ -182,6 +184,7 @@ void Taxonomer::chooseBestTaxon2(uint32_t currentQuery, queryList[currentQuery].coverage = speciesScore.coverage; queryList[currentQuery].hammingDist = speciesScore.hammingDist; queryList[currentQuery].newSpecies = false; + cout << "8" << currentQuery << endl; // if (par.printLog) { // cout << "# " << currentQuery << endl; // for (size_t i = 0; i < genusMatches.size(); i++) { @@ -195,188 +198,192 @@ void Taxonomer::chooseBestTaxon2(uint32_t currentQuery, // } } -void Taxonomer::chooseBestTaxon(uint32_t currentQuery, - size_t offset, - size_t end, - const Match *matchList, - vector & queryList, - const LocalParameters &par) { - TaxID selectedTaxon; - -// if (true) { -// cout << "# " << currentQuery << " " << queryList[currentQuery].name << endl; -// for (size_t i = offset; i < end + 1; i++) { -// cout << matchList[i].targetId << " " << matchList[i].qInfo.frame << " " << matchList[i].qInfo.pos << " " << int(matchList[i].hamming) << " " << int(matchList[i].redundancy) << endl; -// } -// } +// void Taxonomer::chooseBestTaxon(uint32_t currentQuery, +// size_t offset, +// size_t end, +// const Match *matchList, +// vector & queryList, +// const LocalParameters &par) { +// TaxID selectedTaxon; + +// // if (true) { +// // cout << "# " << currentQuery << " " << queryList[currentQuery].name << endl; +// // for (size_t i = offset; i < end + 1; i++) { +// // cout << matchList[i].targetId << " " << matchList[i].qInfo.frame << " " << matchList[i].qInfo.pos << " " << int(matchList[i].hamming) << " " << int(matchList[i].redundancy) << endl; +// // } +// // } + +// // Get the best genus for current query +// vector genusMatches; +// genusMatches.reserve(end - offset + 1); +// TaxonScore genusScore(0, 0, 0, 0); +// if (par.seqMode == 2) { +// genusScore = getBestGenusMatches(genusMatches, matchList, end, offset, +// queryList[currentQuery].queryLength, +// queryList[currentQuery].queryLength2); +// } else { +// genusScore = getBestGenusMatches(genusMatches, matchList, end, offset, +// queryList[currentQuery].queryLength); +// } - // Get the best genus for current query - vector genusMatches; - genusMatches.reserve(end - offset + 1); - TaxonScore 
genusScore(0, 0, 0, 0); - if (par.seqMode == 2) { - genusScore = getBestGenusMatches(genusMatches, matchList, end, offset, - queryList[currentQuery].queryLength, - queryList[currentQuery].queryLength2); - } else { - genusScore = getBestGenusMatches(genusMatches, matchList, end, offset, - queryList[currentQuery].queryLength); - } +// // if (true) { +// // cout << "# " << currentQuery << " " << queryList[currentQuery].name << " filtered\n"; +// // for (size_t i = 0; i < genusMatches.size(); i++) { +// // cout << genusMatches[i].targetId << " " << genusMatches[i].qInfo.frame << " " << genusMatches[i].qInfo.pos << " " << int(genusMatches[i].hamming) << " " << int(genusMatches[i].redundancy) << endl; +// // } +// // cout << "Genus score: " << genusScore.score << "\n"; +// // } + +// // If there is no proper genus for current query, it is un-classified. +// if (genusScore.score == 0 || genusScore.coverage < par.minCoverage || genusScore.score < par.minScore) { +// queryList[currentQuery].isClassified = false; +// queryList[currentQuery].classification = 0; +// queryList[currentQuery].score = genusScore.score; +// queryList[currentQuery].coverage = genusScore.coverage; +// queryList[currentQuery].hammingDist = genusScore.hammingDist; +// queryList[currentQuery].newSpecies = false; +// return; +// } -// if (true) { -// cout << "# " << currentQuery << " " << queryList[currentQuery].name << " filtered\n"; -// for (size_t i = 0; i < genusMatches.size(); i++) { -// cout << genusMatches[i].targetId << " " << genusMatches[i].qInfo.frame << " " << genusMatches[i].qInfo.pos << " " << int(genusMatches[i].hamming) << " " << int(genusMatches[i].redundancy) << endl; +// // If there are two or more good genus level candidates, find the LCA. +// if (genusScore.taxId == 0) { +// vector genusList; +// genusList.reserve(genusMatches.size()); +// for (auto & genusMatch : genusMatches) { +// genusList.push_back(genusMatch.genusId); // } -// cout << "Genus score: " << genusScore.score << "\n"; -// } - - // If there is no proper genus for current query, it is un-classified. - if (genusScore.score == 0 || genusScore.coverage < par.minCoverage || genusScore.score < par.minScore) { - queryList[currentQuery].isClassified = false; - queryList[currentQuery].classification = 0; - queryList[currentQuery].score = genusScore.score; - queryList[currentQuery].coverage = genusScore.coverage; - queryList[currentQuery].hammingDist = genusScore.hammingDist; - queryList[currentQuery].newSpecies = false; - return; - } - - // If there are two or more good genus level candidates, find the LCA. 
- if (genusScore.taxId == 0) { - vector genusList; - genusList.reserve(genusMatches.size()); - for (auto & genusMatch : genusMatches) { - genusList.push_back(genusMatch.genusId); - } - selectedTaxon = taxonomy->LCA(genusList)->taxId; - queryList[currentQuery].isClassified = true; - queryList[currentQuery].classification = selectedTaxon; - queryList[currentQuery].score = genusScore.score; - queryList[currentQuery].coverage = genusScore.coverage; - queryList[currentQuery].hammingDist = genusScore.hammingDist; - for (auto & genusMatch : genusMatches) { - queryList[currentQuery].taxCnt[genusMatch.targetId]++; - } - return; - } +// selectedTaxon = taxonomy->LCA(genusList)->taxId; +// queryList[currentQuery].isClassified = true; +// queryList[currentQuery].classification = selectedTaxon; +// queryList[currentQuery].score = genusScore.score; +// queryList[currentQuery].coverage = genusScore.coverage; +// queryList[currentQuery].hammingDist = genusScore.hammingDist; +// for (auto & genusMatch : genusMatches) { +// queryList[currentQuery].taxCnt[genusMatch.targetId]++; +// } +// return; +// } - // Choose the species with the highest coverage. - TaxID selectedSpecies; - TaxonScore speciesScore; - vector species; - unordered_map> speciesMatchRange; - if (par.seqMode == 2) { - speciesScore = chooseSpecies(genusMatches, - queryList[currentQuery].queryLength, - queryList[currentQuery].queryLength2, - species, - speciesMatchRange); - } else { - speciesScore = chooseSpecies(genusMatches, - queryList[currentQuery].queryLength, - species, - speciesMatchRange); - } +// // Choose the species with the highest coverage. +// TaxID selectedSpecies; +// TaxonScore speciesScore; +// vector species; +// unordered_map> speciesMatchRange; +// if (par.seqMode == 2) { +// speciesScore = chooseSpecies(genusMatches, +// queryList[currentQuery].queryLength, +// queryList[currentQuery].queryLength2, +// species, +// speciesMatchRange); +// } else { +// speciesScore = chooseSpecies(genusMatches, +// queryList[currentQuery].queryLength, +// species, +// speciesMatchRange); +// } - // Classify to LCA if more than one species are selected - if (species.size() > 1) { - queryList[currentQuery].isClassified = true; - queryList[currentQuery].classification = taxonomy->LCA(species)->taxId; - queryList[currentQuery].score = genusScore.score; - queryList[currentQuery].coverage = genusScore.coverage; - queryList[currentQuery].hammingDist = genusScore.hammingDist; - for (auto & genusMatch : genusMatches) { - queryList[currentQuery].taxCnt[genusMatch.targetId]++; - } - return; - } +// // Classify to LCA if more than one species are selected +// if (species.size() > 1) { +// queryList[currentQuery].isClassified = true; +// queryList[currentQuery].classification = taxonomy->LCA(species)->taxId; +// queryList[currentQuery].score = genusScore.score; +// queryList[currentQuery].coverage = genusScore.coverage; +// queryList[currentQuery].hammingDist = genusScore.hammingDist; +// for (auto & genusMatch : genusMatches) { +// queryList[currentQuery].taxCnt[genusMatch.targetId]++; +// } +// return; +// } - // If score is not enough, classify to the parent of the selected species - if (speciesScore.score < par.minSpScore) { - queryList[currentQuery].isClassified = true; - queryList[currentQuery].classification = taxonomy->taxonNode( - taxonomy->getTaxIdAtRank(species[0], "species"))->parentTaxId; - queryList[currentQuery].score = genusScore.score; - queryList[currentQuery].coverage = genusScore.coverage; - queryList[currentQuery].hammingDist = 
genusScore.hammingDist; - for (auto & genusMatch : genusMatches) { - if(genusMatch.speciesId == species[0]){ - queryList[currentQuery].taxCnt[genusMatch.targetId]++; - } - } - return; - } +// // If score is not enough, classify to the parent of the selected species +// if (speciesScore.score < par.minSpScore) { +// queryList[currentQuery].isClassified = true; +// queryList[currentQuery].classification = taxonomy->taxonNode( +// taxonomy->getTaxIdAtRank(species[0], "species"))->parentTaxId; +// queryList[currentQuery].score = genusScore.score; +// queryList[currentQuery].coverage = genusScore.coverage; +// queryList[currentQuery].hammingDist = genusScore.hammingDist; +// for (auto & genusMatch : genusMatches) { +// if(genusMatch.speciesId == species[0]){ +// queryList[currentQuery].taxCnt[genusMatch.targetId]++; +// } +// } +// return; +// } - // Sort matches by the position of the query sequence - selectedSpecies = species[0]; -// sort(genusMatches.begin() + speciesMatchRange[selectedSpecies].first, -// genusMatches.begin() + speciesMatchRange[selectedSpecies].second, -// [](const Match & a, const Match & b) { -// if (a.qInfo.position / 3 == b.qInfo.position / 3) -// return a.hamming < b.hamming; -// else -// return a.qInfo.position / 3 < b.qInfo.position / 3; -// }); +// // Sort matches by the position of the query sequence +// selectedSpecies = species[0]; +// // sort(genusMatches.begin() + speciesMatchRange[selectedSpecies].first, +// // genusMatches.begin() + speciesMatchRange[selectedSpecies].second, +// // [](const Match & a, const Match & b) { +// // if (a.qInfo.position / 3 == b.qInfo.position / 3) +// // return a.hamming < b.hamming; +// // else +// // return a.qInfo.position / 3 < b.qInfo.position / 3; +// // }); - vector::const_iterator first = genusMatches.begin() + speciesMatchRange[selectedSpecies].first; - vector::const_iterator last = genusMatches.begin() + speciesMatchRange[selectedSpecies].second; - vector speciesMatches(first, last); +// vector::const_iterator first = genusMatches.begin() + speciesMatchRange[selectedSpecies].first; +// vector::const_iterator last = genusMatches.begin() + speciesMatchRange[selectedSpecies].second; +// vector speciesMatches(first, last); - sort(speciesMatches.begin(), speciesMatches.end(), - [](const Match & a, const Match & b) { return a.qInfo.pos < b.qInfo.pos; }); +// sort(speciesMatches.begin(), speciesMatches.end(), +// [](const Match & a, const Match & b) { return a.qInfo.pos < b.qInfo.pos; }); - TaxID result = lowerRankClassification(speciesMatches, selectedSpecies); +// TaxID result = lowerRankClassification(speciesMatches, selectedSpecies); - // Record matches of selected species - for (size_t i = speciesMatchRange[selectedSpecies].first; i < speciesMatchRange[selectedSpecies].second; i++) { - queryList[currentQuery].taxCnt[genusMatches[i].targetId]++; - } +// // Record matches of selected species +// for (size_t i = speciesMatchRange[selectedSpecies].first; i < speciesMatchRange[selectedSpecies].second; i++) { +// queryList[currentQuery].taxCnt[genusMatches[i].targetId]++; +// } - // Store classification results - queryList[currentQuery].isClassified = true; - queryList[currentQuery].classification = result; - queryList[currentQuery].score = speciesScore.score; - queryList[currentQuery].coverage = speciesScore.coverage; - queryList[currentQuery].hammingDist = speciesScore.hammingDist; - queryList[currentQuery].newSpecies = false; -// if (par.printLog) { -// cout << "# " << currentQuery << endl; -// for (size_t i = 0; i < 
genusMatches.size(); i++) { -// cout << i << " " << genusMatches[i].qInfo.pos << " " << -// genusMatches[i].targetId << " " << int(genusMatches[i].hamming) << endl; -// } -// cout << "Score: " << speciesScore.score << " " << selectedSpecies << " " -// << taxonomy->getString(taxonomy->taxonNode(selectedSpecies)->rankIdx) -// -// << endl; -// } -} +// // Store classification results +// queryList[currentQuery].isClassified = true; +// queryList[currentQuery].classification = result; +// queryList[currentQuery].score = speciesScore.score; +// queryList[currentQuery].coverage = speciesScore.coverage; +// queryList[currentQuery].hammingDist = speciesScore.hammingDist; +// queryList[currentQuery].newSpecies = false; +// // if (par.printLog) { +// // cout << "# " << currentQuery << endl; +// // for (size_t i = 0; i < genusMatches.size(); i++) { +// // cout << i << " " << genusMatches[i].qInfo.pos << " " << +// // genusMatches[i].targetId << " " << int(genusMatches[i].hamming) << endl; +// // } +// // cout << "Score: " << speciesScore.score << " " << selectedSpecies << " " +// // << taxonomy->getString(taxonomy->taxonNode(selectedSpecies)->rankIdx) +// // +// // << endl; +// // } +// } TaxID Taxonomer::lowerRankClassification(vector &matches, TaxID spTaxId) { unordered_map taxCnt; size_t matchNum = matches.size(); + // cout << spTaxId << endl; + for (size_t i = 0; i < matchNum; i++) { - size_t currQuotient = matches[i].qInfo.pos / 3; - uint8_t minHamming = 0; //matches[i].hamming; - Match * minHammingMatch = & matches[i]; - TaxID minHammingTaxId = minHammingMatch->targetId; - while ((i < matchNum) && (currQuotient == matches[i].qInfo.pos / 3)) { - if (matches[i].hamming < minHamming) { - minHamming = matches[i].hamming; - minHammingMatch = & matches[i]; - minHammingTaxId = minHammingMatch->targetId; - } else if (matches[i].hamming == minHamming) { - minHammingTaxId = taxonomy->LCA(minHammingTaxId, matches[i].targetId); - minHammingMatch->redundancy = true; - matches[i].redundancy = true; - } - i++; - } - taxCnt[minHammingTaxId]++; + // cout << matches[i].targetId << endl; + taxCnt[matches[i].targetId] ++; + // size_t currQuotient = matches[i].qInfo.pos / 3; + // uint8_t minHamming = 0; //matches[i].hamming; + // Match * minHammingMatch = & matches[i]; + // TaxID minHammingTaxId = minHammingMatch->targetId; + // while ((i < matchNum) && (currQuotient == matches[i].qInfo.pos / 3)) { + // if (matches[i].hamming < minHamming) { + // minHamming = matches[i].hamming; + // minHammingMatch = & matches[i]; + // minHammingTaxId = minHammingMatch->targetId; + // } else if (matches[i].hamming == minHamming) { + // minHammingTaxId = taxonomy->LCA(minHammingTaxId, matches[i].targetId); + // minHammingMatch->redundancy = true; + // matches[i].redundancy = true; + // } + // i++; + // } + // taxCnt[minHammingTaxId]++; } // int i = matchRange.second - 1; @@ -402,10 +409,16 @@ TaxID Taxonomer::lowerRankClassification(vector &matches, TaxID spTaxId) // } unordered_map cladeCnt; + // cout << "8" << endl; getSpeciesCladeCounts(taxCnt, cladeCnt, spTaxId); - + // // print cladeCnt + // for (auto it = cladeCnt.begin(); it != cladeCnt.end(); it++) { + // cout << it->first << " " << it->second.taxCount << " " << it->second.cladeCount << endl; + // } + // cout << "9" << endl; if (accessionLevel == 2) { // Don't do accession-level classification // Remove leaf nodes + // cout << "10" << endl; for (auto it = cladeCnt.begin(); it != cladeCnt.end(); it++) { TaxonNode const * taxon = taxonomy->taxonNode(it->first); if 
(strcmp(taxonomy->getString(taxon->rankIdx), "") == 0) { @@ -415,8 +428,10 @@ TaxID Taxonomer::lowerRankClassification(vector &matches, TaxID spTaxId) it->first)); } } + return BFS(cladeCnt, spTaxId); } else { + // cout << "10-2" << endl; return BFS(cladeCnt, spTaxId); } } @@ -479,6 +494,7 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, size_t i = offset; uint8_t curFrame; vector curFrameMatches; + vector matchPaths; while (i < end + 1) { currentSpecies = matchList[i].speciesId; @@ -492,7 +508,7 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, i ++; } if (curFrameMatches.size() > 1) { - remainConsecutiveMatches(curFrameMatches, filteredMatches, currentSpecies); + remainConsecutiveMatches(curFrameMatches, matchPaths, currentSpecies); } } // Construct a match combination using filtered matches of current species @@ -549,6 +565,10 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, size_t i = offset; uint8_t curFrame; vector curFrameMatches; + vector matchPaths; + unordered_map species2score; + unordered_map> species2matchPaths; + float bestSpScore = 0; while (i < end + 1) { currentSpecies = matchList[i].speciesId; @@ -562,139 +582,247 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, i ++; } if (curFrameMatches.size() > 1) { - remainConsecutiveMatches(curFrameMatches, filteredMatches, currentSpecies); + // cout << "1" << endl; + remainConsecutiveMatches(curFrameMatches, matchPaths, currentSpecies); } } - // Construct a match combination using filtered matches of current species + // Combine MatchPaths // so that it can best cover the query, and score the combination - if (!filteredMatches.empty()) { - matchesForEachSpecies.push_back(filteredMatches); - speciesScores.push_back(scoreTaxon(filteredMatches, currentSpecies, readLength1, readLength2)); + if (!matchPaths.empty()) { + // Initialize species2matchPaths + species2matchPaths[currentSpecies].emplace_back(0, 0, 0, 0); + // cout << "2" << endl; + cout << currentSpecies << endl; + float score = combineMatchPaths(matchPaths, species2matchPaths[currentSpecies], readLength1 + readLength2); + cout << endl; + species2score[currentSpecies] = score; + if (score > bestSpScore) { + bestSpScore = score; + } } - filteredMatches.clear(); + matchPaths.clear(); } // If there are no meaningful species - if (speciesScores.empty()) { + if (species2score.empty()) { bestScore.score = 0; return bestScore; } - - TaxonScore maxScore = *max_element(speciesScores.begin(), speciesScores.end(), - [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); - - vector maxIdx; - for (size_t g = 0; g < speciesScores.size(); g++) { - if (speciesScores[g].score == maxScore.score) { - maxIdx.push_back(g); + // cout << "4" << endl; + vector maxSpecies; + for (auto & spScore : species2score) { + cout << spScore.first << " " << spScore.second << endl; + if (spScore.second == bestSpScore) { + maxSpecies.push_back(spScore.first); } } - bestScore = maxScore; - - for (unsigned long g : maxIdx) { - for (const Match * m : matchesForEachSpecies[g]) { - speciesMatches.push_back(*m); + // cout << "5" << endl; + // More than one species --> LCA + if (maxSpecies.size() > 1) { + bestScore.LCA = true; + bestScore.taxId = taxonomy->LCA(maxSpecies)->taxId; + for (auto & sp : maxSpecies) { + bestScore.score += species2score[sp]; } + bestScore.score /= maxSpecies.size(); + return bestScore; } - - // More than one species - if (maxIdx.size() > 1) { - bestScore.taxId = 0; + + // One species + 
bestScore.taxId = maxSpecies[0];
+    bestScore.score = species2score[maxSpecies[0]];
+    float coveredLength = 0.f;
+    int hammingDist = 0;
+    for (auto & matchPath : species2matchPaths[maxSpecies[0]]) {
+        // cout << "here" << endl;
+        coveredLength += matchPath.end - matchPath.start + 1;
+        hammingDist += matchPath.hammingDist;
+        for (auto match : matchPath.matches) {
+            // cout << match->targetId << endl;
+            // match->printMatch();
+            speciesMatches.push_back(*match);
+            // speciesMatches.back().printMatch();
+        }
     }
+    bestScore.coverage = coveredLength / (readLength1 + readLength2);
+    bestScore.hammingDist = hammingDist;
+//    cout << "6" << endl;
     return bestScore;
 }
 
-TaxonScore Taxonomer::getBestGenusMatches(vector<Match> &genusMatches, const Match *matchList, size_t end,
-                                          size_t offset, int readLength1, int readLength2) {
-    TaxID currentGenus;
-    TaxID currentSpecies;
-
-    vector<const Match *> filteredMatches;
-    vector<vector<const Match *>> matchesForEachGenus;
-    vector<TaxonScore> genusScores;
-    TaxonScore bestScore;
-    size_t i = offset;
-    uint8_t curFrame;
-    vector<const Match *> curFrameMatches;
-    while (i < end + 1) {
-//        currentGenus = taxId2genusId[matchList[i].targetId];
-        currentGenus = matchList[i].genusId;
-        // For current genus
-        while ((i < end + 1) && currentGenus == matchList[i].genusId) {
-//            currentSpecies = taxId2speciesId[matchList[i].targetId];
-            currentSpecies = matchList[i].speciesId;
-//            if (par.printLog) {
-//                cout << currentGenus << " " << currentSpecies << endl;
-//            }
-            // For current species
-            while ((i < end + 1) && currentSpecies == matchList[i].speciesId) {
-                curFrame = matchList[i].qInfo.frame;
-                curFrameMatches.clear();
-
-                // For current frame
-                while ((i < end + 1) && currentSpecies == matchList[i].speciesId
-                       && curFrame == matchList[i].qInfo.frame) {
-                    curFrameMatches.push_back(&matchList[i]);
-                    i ++;
-                }
-                if (curFrameMatches.size() > 1) {
-                    remainConsecutiveMatches(curFrameMatches, filteredMatches, currentGenus);
+float Taxonomer::combineMatchPaths(vector<MatchPath> & matchPaths,
+                                   vector<MatchPath> & combinedMatchPaths,
+                                   int readLength) {
+    combinedMatchPaths.clear();
+    // Sort matchPaths by their score
+    sort(matchPaths.begin(), matchPaths.end(),
+         [](const MatchPath & a, const MatchPath & b) { return a.score > b.score;});
+
+    // Combine matchPaths
+    // 1. Add the matchPath with the highest score to combinedMatchPaths
+    // 2. Add the matchPath with the highest score that is not overlapped with the matchPath in combinedMatchPaths
+    // 3. Repeat 2 until no matchPath can be added
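+    // For intuition, a toy run of this greedy selection (illustrative values only):
+    // sorted paths A[0, 104] score 30, B[84, 188] score 27, C[120, 200] score 25.
+    // A is added first; B overlaps A and is skipped; C starts after A ends and is
+    // added, so the returned score is (30 + 25) / readLength.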
+    for (size_t i = 0; i < matchPaths.size(); i++) {
+        cout << matchPaths[i].start << " " << matchPaths[i].end << " " << matchPaths[i].score << " " << matchPaths[i].matches.back()->targetId << " " << matchPaths[i].matches.back()->qInfo.frame << endl;
+        if (combinedMatchPaths.empty()) {
+            combinedMatchPaths.push_back(matchPaths[i]);
+            // for (auto & match : matchPaths[i].matches) {
+            //     match->printMatch();
+            // }
+        } else {
+            bool isOverlapped = false;
+            for (size_t j = 0; j < combinedMatchPaths.size(); j++) {
+                if (!isMatchPathNotOverlapped(matchPaths[i], combinedMatchPaths[j])) {
+                    isOverlapped = true;
+                    break;
+                } else {
+                    // cout << matchPaths[i].start << " " << matchPaths[i].end << endl;
+                    // cout << combinedMatchPaths[j].start << " " << combinedMatchPaths[j].end << endl << endl;;
+                }
+            }
+            if (!isOverlapped) {
+                combinedMatchPaths.push_back(matchPaths[i]);
+                combinedMatchPaths.back().matches = matchPaths[i].matches;
+                // cout << matchPaths[i].start << " " << matchPaths[i].end << " " << matchPaths[i].score << endl;
+                // for (auto & match : matchPaths[i].matches) {
+                //     match->printMatch();
+                // }
+            }
+        }
+    }
-
-    // Construct a match combination using filtered matches of current genus
-    // so that it can best cover the query, and score the combination
-    if (!filteredMatches.empty()) {
-        matchesForEachGenus.push_back(filteredMatches);
-        genusScores.push_back(scoreTaxon(filteredMatches, currentGenus, readLength1, readLength2));
-    }
-    filteredMatches.clear();
-    }
+    // cout << endl;

-    // If there are no meaningful genus
-    if (genusScores.empty()) {
-        bestScore.score = 0;
-        return bestScore;
-    }
-    TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(),
-                                       [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; });
-    vector<size_t> maxIdx;
-    for (size_t g = 0; g < genusScores.size(); g++) {
-        if (genusScores[g].score > maxScore.score * 0.95f) {
-            maxIdx.push_back(g);
-        }
-    }
-    bestScore = maxScore;
+    // Calculate the score of combinedMatchPaths
+    float score = 0;
+    for (auto & matchPath : combinedMatchPaths) {
+        score += matchPath.score;
+    }
+    return score / readLength;
+}

-    for (unsigned long g : maxIdx) {
-        for (const Match * m : matchesForEachGenus[g]) {
-            genusMatches.push_back(*m);
-        }
-    }
+bool Taxonomer::isMatchPathNotOverlapped(const MatchPath & matchPath1,
+                                         const MatchPath & matchPath2) {
+    return (matchPath1.end < matchPath2.start) || (matchPath2.end < matchPath1.start);
+}
\
+// if (matchPath1.start > matchPath2.start) {
+//     return isMatchPathOverlapped(matchPath2, matchPath1, readLength);
+// }
+// if (matchPath1.end < matchPath2.start) {
+//     return false;
+// }
+// if (matchPath1.endPos >= matchPath2.startPos) {
+//     if (matchPath1.endPos <= matchPath2.endPos) {
+//         return true;
+//     } else {
+//         if (matchPath1.startPos + readLength - 1 >= matchPath2.startPos) {
+//             return true;
+//         } else {
+//             return false;
+//         }
+//     }
+// }
+// return false;
+// }

-    // More than one genus
-    if (maxIdx.size() > 1) {
-        bestScore.taxId = 0;
-        return bestScore;
-    }
-    return bestScore;

-    //Three cases
-    //1. one genus
-    //2. more than one genus
-    //4.
no genus -} +// vector filteredMatches; +// vector> matchesForEachGenus; +// vector genusScores; +// TaxonScore bestScore; +// size_t i = offset; +// uint8_t curFrame; +// vector curFrameMatches; +// while (i < end + 1) { +// // currentGenus = taxId2genusId[matchList[i].targetId]; +// currentGenus = matchList[i].genusId; +// // For current genus +// while ((i < end + 1) && currentGenus == matchList[i].genusId) { +// // currentSpecies = taxId2speciesId[matchList[i].targetId]; +// currentSpecies = matchList[i].speciesId; +// // if (par.printLog) { +// // cout << currentGenus << " " << currentSpecies << endl; +// // } +// // For current species +// while ((i < end + 1) && currentSpecies == matchList[i].speciesId) { +// curFrame = matchList[i].qInfo.frame; +// curFrameMatches.clear(); + +// // For current frame +// while ((i < end + 1) && currentSpecies == matchList[i].speciesId +// && curFrame == matchList[i].qInfo.frame) { +// curFrameMatches.push_back(&matchList[i]); +// i ++; +// } +// if (curFrameMatches.size() > 1) { +// remainConsecutiveMatches(curFrameMatches, filteredMatches, currentGenus); +// } +// } +// } -void Taxonomer::remainConsecutiveMatches(vector & curFrameMatches, - vector & filteredMatches, - TaxID genusId) { +// // Construct a match combination using filtered matches of current genus +// // so that it can best cover the query, and score the combination +// if (!filteredMatches.empty()) { +// matchesForEachGenus.push_back(filteredMatches); +// genusScores.push_back(scoreTaxon(filteredMatches, currentGenus, readLength1, readLength2)); +// } +// filteredMatches.clear(); +// } + +// // If there are no meaningful genus +// if (genusScores.empty()) { +// bestScore.score = 0; +// return bestScore; +// } + +// TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), +// [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); + +// vector maxIdx; +// for (size_t g = 0; g < genusScores.size(); g++) { +// if (genusScores[g].score > maxScore.score * 0.95f) { +// maxIdx.push_back(g); +// } +// } +// bestScore = maxScore; + +// for (unsigned long g : maxIdx) { +// for (const Match * m : matchesForEachGenus[g]) { +// genusMatches.push_back(*m); +// } +// } + + + +// // More than one genus +// if (maxIdx.size() > 1) { +// bestScore.taxId = 0; +// return bestScore; +// } + +// return bestScore; + +// //Three cases +// //1. one genus +// //2. more than one genus +// //4. 
no genus +// } + +void Taxonomer::remainConsecutiveMatches(const vector & curFrameMatches, + vector & matchPaths, + TaxID genusId) { size_t i = 0; size_t end = curFrameMatches.size(); vector> curPosMatches; // @@ -741,18 +869,61 @@ void Taxonomer::remainConsecutiveMatches(vector & curFrameMatches // } // } - // Iterate linkedMatches to get filteredMatches - int MIN_DEPTH = minConsCnt - 1; + // Iterate linkedMatches to get filteredMatches + //(ignore matches not enoughly consecutive) + size_t MIN_DEPTH = minConsCnt - 1; if (taxonomy->IsAncestor(eukaryotaTaxId, genusId)) { MIN_DEPTH = minConsCntEuk - 1; } unordered_set used; - vector filteredMatchIdx; - unordered_map idx2depth; + unordered_map idx2depthScore; + // unordered_map edges; + unordered_map edges; + for (const auto& entry : linkedMatches) { if (!used.count(entry.first)) { used.insert(entry.first); - DFS(entry.first, linkedMatches, filteredMatchIdx, 0, MIN_DEPTH, used, idx2depth); + depthScore bestPath{}; + size_t bestNextIdx = 0; + float curScore = curFrameMatches[entry.first]->getScore(); + // cout << curFrameMatches[entry.first] + for (size_t j = 0; j < entry.second.size(); j++) { + used.insert(entry.second[j]); + depthScore curPath = DFS(curFrameMatches, + entry.second[j], + linkedMatches, + 1, + MIN_DEPTH, + used, + idx2depthScore, + edges, + curScore, 0); + if (curPath.score > bestPath.score && curPath.depth > MIN_DEPTH) { + bestNextIdx = entry.second[j]; + bestPath = curPath; + } + } + // Store the best path + if (bestPath.depth > MIN_DEPTH) { + // cout << entry.first << endl; + // curFrameMatches[entry.first]->printMatch(); + matchPaths.emplace_back(curFrameMatches[entry.first]->qInfo.pos, // start coordinate on query + curFrameMatches[entry.first]->qInfo.pos + bestPath.depth * 3 + 20, // end coordinate on query + bestPath.score, bestPath.hammingDist); + const Match * curMatch = curFrameMatches[entry.first]; + edges[curMatch] = curFrameMatches[bestNextIdx]; + matchPaths.back().matches.push_back(curMatch); + // curMatch = edges[curMatch]; + // edges2[curFrameMatches[entry.first]] = curFrameMatches[bestNextIdx]; + // Retrieve the best path + // cout << bestPath.depth << endl; + while (edges.find(curMatch) != edges.end()) { + // cout << curMatch << " "; + matchPaths.back().matches.push_back(edges[curMatch]); + curMatch = edges[curMatch]; + } + // cout << endl; + } } } @@ -763,43 +934,92 @@ void Taxonomer::remainConsecutiveMatches(vector & curFrameMatches // } // cout << endl; // } - - for (auto &idx: filteredMatchIdx) { - filteredMatches.push_back(curFrameMatches[idx]); - } } +// size_t Taxonomer::DFS(size_t curMatchIdx, const map> & linkedMatches, +// vector& filteredMatches, size_t depth, size_t MIN_DEPTH, unordered_set& used, +// unordered_map & idx2depth) { +// depth++; +// size_t maxDepth = 0; +// size_t returnDepth = 0; +// if (linkedMatches.find(curMatchIdx) == linkedMatches.end()) { +// // reached a leaf node +// idx2depth[curMatchIdx] = depth; +// if (depth > MIN_DEPTH) { +// filteredMatches.push_back(curMatchIdx); +// } +// return depth; +// } else { // not a leaf node +// for (auto &nextMatchIdx: linkedMatches.at(curMatchIdx)) { +// used.insert(nextMatchIdx); +// if (idx2depth.find(nextMatchIdx) != idx2depth.end()) { +// returnDepth = idx2depth[nextMatchIdx]; +// maxDepth = max(maxDepth, returnDepth); +// continue; +// } +// returnDepth = DFS(nextMatchIdx, linkedMatches, filteredMatches, depth, MIN_DEPTH, used, idx2depth); +// maxDepth = max(maxDepth, returnDepth); +// } +// if (maxDepth > MIN_DEPTH) { +// 
filteredMatches.push_back(curMatchIdx); +// idx2depth[curMatchIdx] = maxDepth; +// } +// } +// return maxDepth; +// } -size_t Taxonomer::DFS(size_t curMatchIdx, const map> & linkedMatches, - vector& filteredMatches, size_t depth, size_t MIN_DEPTH, unordered_set& used, - unordered_map & idx2depth) { +// return: end +depthScore Taxonomer::DFS(const vector &matches, + size_t curMatchIdx, + const map> &linkedMatches, + size_t depth, size_t MIN_DEPTH, + unordered_set &used, + unordered_map &idx2depthScore, + unordered_map & edges, float score, int hammingDist) { depth++; - size_t maxDepth = 0; - size_t returnDepth = 0; - if (linkedMatches.find(curMatchIdx) == linkedMatches.end()) { - // reached a leaf node - idx2depth[curMatchIdx] = depth; - if (depth > MIN_DEPTH) { - filteredMatches.push_back(curMatchIdx); + depthScore bestDepthScore = depthScore(0, 0, 0); + depthScore returnDepthScore; + if (linkedMatches.find(curMatchIdx) == linkedMatches.end()) { // reached a leaf node + uint8_t lastEndHamming = (matches[curMatchIdx]->rightEndHamming >> 14); + if (lastEndHamming == 0) { + score += 3.0f; + } else { + score += 2.0f - 0.5f * lastEndHamming; } - return depth; + idx2depthScore[curMatchIdx] = depthScore(depth, score, hammingDist + lastEndHamming); + return depthScore(depth, score, hammingDist + lastEndHamming); } else { // not a leaf node + uint8_t lastEndHamming = (matches[curMatchIdx]->rightEndHamming >> 14); + if (lastEndHamming == 0) { + score += 3.0f; + } else { + score += 2.0f - 0.5f * lastEndHamming; + } for (auto &nextMatchIdx: linkedMatches.at(curMatchIdx)) { used.insert(nextMatchIdx); - if (idx2depth.find(nextMatchIdx) != idx2depth.end()) { - returnDepth = idx2depth[nextMatchIdx]; - maxDepth = max(maxDepth, returnDepth); + + // Reuse the depth score of nextMatchIdx if it has been calculated + if (idx2depthScore.find(nextMatchIdx) != idx2depthScore.end()) { + returnDepthScore = idx2depthScore[nextMatchIdx]; + if (returnDepthScore.score > bestDepthScore.score + && returnDepthScore.depth > MIN_DEPTH) { + bestDepthScore = returnDepthScore; + edges[matches[curMatchIdx]] = matches[nextMatchIdx]; + } continue; } - returnDepth = DFS(nextMatchIdx, linkedMatches, filteredMatches, depth, MIN_DEPTH, used, idx2depth); - maxDepth = max(maxDepth, returnDepth); - } - if (maxDepth > MIN_DEPTH) { - filteredMatches.push_back(curMatchIdx); - idx2depth[curMatchIdx] = maxDepth; + returnDepthScore = DFS(matches, nextMatchIdx, linkedMatches, depth, MIN_DEPTH, used, idx2depthScore, edges, score, hammingDist + lastEndHamming); + if (returnDepthScore.score > bestDepthScore.score + && returnDepthScore.depth > MIN_DEPTH) { + bestDepthScore = returnDepthScore; + edges[matches[curMatchIdx]] = matches[nextMatchIdx]; + } + } + if (bestDepthScore.depth > MIN_DEPTH) { + idx2depthScore[curMatchIdx] = bestDepthScore; } } - return maxDepth; + return bestDepthScore; } // TaxonScore Taxonomer::getBestGenusMatches_spaced(vector &genusMatches, const Match *matchList, size_t end, @@ -910,87 +1130,87 @@ size_t Taxonomer::DFS(size_t curMatchIdx, const map> & li // //4. 
no genus // } -TaxonScore Taxonomer::getBestGenusMatches(vector &genusMatches, const Match *matchList, size_t end, - size_t offset, int queryLength) { - TaxID currentGenus; - TaxID currentSpecies; +// TaxonScore Taxonomer::getBestGenusMatches(vector &genusMatches, const Match *matchList, size_t end, +// size_t offset, int queryLength) { +// TaxID currentGenus; +// TaxID currentSpecies; - vector filteredMatches; - vector> matchesForEachGenus; - vector genusScores; - TaxonScore bestScore; - size_t i = offset; - uint8_t curFrame; - vector curFrameMatches; - while (i < end + 1) { - currentGenus = matchList[i].genusId; - // For current genus - while ((i < end + 1) && currentGenus == matchList[i].genusId) { - currentSpecies = matchList[i].speciesId; - - // For current species - while ((i < end + 1) && currentSpecies == matchList[i].speciesId) { - curFrame = matchList[i].qInfo.frame; - curFrameMatches.clear(); - - // For current frame - while ((i < end + 1) && currentSpecies == matchList[i].speciesId - && curFrame == matchList[i].qInfo.frame) { - curFrameMatches.push_back(&matchList[i]); - i ++; - } - if (curFrameMatches.size() > 1) { - remainConsecutiveMatches(curFrameMatches, filteredMatches, currentGenus); - } - } - } +// vector filteredMatches; +// vector> matchesForEachGenus; +// vector genusScores; +// TaxonScore bestScore; +// size_t i = offset; +// uint8_t curFrame; +// vector curFrameMatches; +// while (i < end + 1) { +// currentGenus = matchList[i].genusId; +// // For current genus +// while ((i < end + 1) && currentGenus == matchList[i].genusId) { +// currentSpecies = matchList[i].speciesId; - // Construct a match combination using filtered matches of current genus - // so that it can best cover the query, and score the combination +// // For current species +// while ((i < end + 1) && currentSpecies == matchList[i].speciesId) { +// curFrame = matchList[i].qInfo.frame; +// curFrameMatches.clear(); + +// // For current frame +// while ((i < end + 1) && currentSpecies == matchList[i].speciesId +// && curFrame == matchList[i].qInfo.frame) { +// curFrameMatches.push_back(&matchList[i]); +// i ++; +// } +// if (curFrameMatches.size() > 1) { +// remainConsecutiveMatches(curFrameMatches, filteredMatches, currentGenus); +// } +// } +// } - if (!filteredMatches.empty()) { - matchesForEachGenus.push_back(filteredMatches); - genusScores.push_back(scoreTaxon(filteredMatches, currentGenus, queryLength)); - } - filteredMatches.clear(); - } +// // Construct a match combination using filtered matches of current genus +// // so that it can best cover the query, and score the combination - // If there are no meaningful genus - if (genusScores.empty()) { - bestScore.score = 0; - return bestScore; - } +// if (!filteredMatches.empty()) { +// matchesForEachGenus.push_back(filteredMatches); +// genusScores.push_back(scoreTaxon(filteredMatches, currentGenus, queryLength)); +// } +// filteredMatches.clear(); +// } - TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), - [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); +// // If there are no meaningful genus +// if (genusScores.empty()) { +// bestScore.score = 0; +// return bestScore; +// } - vector maxIdx; - for (size_t g = 0; g < genusScores.size(); g++) { - if (genusScores[g].score > maxScore.score * 0.95f) { - maxIdx.push_back(g); - } - } +// TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), +// [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); - 
bestScore = maxScore; +// vector maxIdx; +// for (size_t g = 0; g < genusScores.size(); g++) { +// if (genusScores[g].score > maxScore.score * 0.95f) { +// maxIdx.push_back(g); +// } +// } - for (unsigned long g : maxIdx) { - for (const Match * m : matchesForEachGenus[g]) { - genusMatches.push_back(*m); - } - } +// bestScore = maxScore; - // More than one genus - if (maxIdx.size() > 1) { - bestScore.taxId = 0; - return bestScore; - } - return bestScore; +// for (unsigned long g : maxIdx) { +// for (const Match * m : matchesForEachGenus[g]) { +// genusMatches.push_back(*m); +// } +// } - //Three cases - //1. one genus - //2. more than one genus - //4. no genus -} +// // More than one genus +// if (maxIdx.size() > 1) { +// bestScore.taxId = 0; +// return bestScore; +// } +// return bestScore; + +// //Three cases +// //1. one genus +// //2. more than one genus +// //4. no genus +// } // TaxonScore Taxonomer::getBestGenusMatches_spaced(vector &genusMatches, const Match *matchList, size_t end, // size_t offset, int readLength) { @@ -1154,7 +1374,7 @@ TaxonScore Taxonomer::scoreTaxon(vector &filteredMatches, float score = ((float) coveredLength - hammingSum) / (float) queryLength; float coverage = (float) (coveredLength) / (float) (queryLength); - return {taxId, score, coverage, (int) hammingSum}; + return {taxId, score, coverage, (int) hammingSum, 0}; } TaxonScore Taxonomer::scoreTaxon(vector &filteredMatches, @@ -1163,7 +1383,6 @@ TaxonScore Taxonomer::scoreTaxon(vector &filteredMatches, int readLength2) { // Calculate Hamming distance & covered length - uint16_t currHammings; int aminoAcidNum_total = ((int) readLength1 / 3) + ((int) readLength2 / 3); int aminoAcidNum_read1 = ((int) readLength1 / 3); int currPos; @@ -1174,8 +1393,18 @@ TaxonScore Taxonomer::scoreTaxon(vector &filteredMatches, auto *hammingsAtEachPos = new signed char[aminoAcidNum_total + 3]; memset(hammingsAtEachPos, 24, (aminoAcidNum_total + 3)); while (f < matchNum) { + uint8_t minHammingDist = 24; + uint16_t currHammings = 0; currPos = (int) filteredMatches[f]->qInfo.pos / 3; - currHammings = filteredMatches[f]->rightEndHamming; + // Find the closest match at current position + while ((f < matchNum) && currPos == (int) filteredMatches[f]->qInfo.pos / 3) { + if (filteredMatches[f]->hamming < minHammingDist) { + minHammingDist = filteredMatches[f]->hamming; + currHammings = filteredMatches[f]->rightEndHamming; + } + f++; + } + // Update hamming distance at each position if (GET_2_BITS(currHammings) < hammingsAtEachPos[currPos + unmaskedPos[0]]) hammingsAtEachPos[currPos + unmaskedPos[0]] = GET_2_BITS(currHammings); if (GET_2_BITS(currHammings >> 2) < hammingsAtEachPos[currPos + unmaskedPos[1]]) @@ -1231,7 +1460,7 @@ TaxonScore Taxonomer::scoreTaxon(vector &filteredMatches, float coverage = (float) (coveredLength_read1 + coveredLength_read2) / (float) (readLength1 + readLength2); // matchesForEachGenus.push_back(move(filteredMatches)); - return {taxId, score, coverage, (int) hammingSum}; + return {taxId, score, coverage, (int) hammingSum, 0}; } TaxonScore Taxonomer::chooseSpecies(const vector &matches, @@ -1363,7 +1592,7 @@ TaxonScore Taxonomer::scoreSpecies(const vector &matches, float score = ((float)coveredLength - hammingSum) / (float) queryLength; float coverage = (float) coveredLength / (float) (queryLength); - return {0, score, coverage, hammingDist}; + return {0, score, coverage, hammingDist, 0}; } TaxonScore Taxonomer::scoreSpecies(const vector &matches, @@ -1442,9 +1671,11 @@ TaxonScore Taxonomer::scoreSpecies(const 
vector &matches, float score = ((float) (coveredLength_read1 + coveredLength_read2) - hammingSum) / (float) (queryLength + queryLength2); float coverage = (float) (coveredLength_read1 + coveredLength_read2) / (float) (queryLength + queryLength2); - return {0, score, coverage, hammingDist}; + return {0, score, coverage, hammingDist, 0}; } bool Taxonomer::isConsecutive(const Match * match1, const Match * match2) { + // match1 87654321 -> 08765432 + // match2 98765432 -> 08765432 return (match1->rightEndHamming >> 2) == (match2->rightEndHamming & 0x3FFF); } \ No newline at end of file diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h index a756db73..95365b9b 100644 --- a/src/commons/Taxonomer.h +++ b/src/commons/Taxonomer.h @@ -14,11 +14,31 @@ struct TaxonScore { float score; float coverage; int hammingDist; - TaxonScore(TaxID taxId, float score, float coverage, int hammingDist) : - taxId(taxId), score(score), coverage(coverage), hammingDist(hammingDist) {} - TaxonScore() : taxId(0), score(0.0f), coverage(0.0f), hammingDist(0) {} + bool LCA; + TaxonScore(TaxID taxId, float score, float coverage, int hammingDist, bool LCA) : + taxId(taxId), score(score), coverage(coverage), hammingDist(hammingDist), LCA(LCA) {} + TaxonScore() : taxId(0), score(0.0f), coverage(0.0f), hammingDist(0), LCA(false) {} }; +struct depthScore { + depthScore(size_t depth, float score, int hammingDist) : depth(depth), score(score), hammingDist(hammingDist) {} + depthScore() : depth(0), score(0.f), hammingDist(0) {} + size_t depth; + float score; + int hammingDist; +}; + +struct MatchPath { + MatchPath(size_t start, size_t end, float score, int hammingDist) : start(start), end(end), score(score), hammingDist(hammingDist) {} + MatchPath() : start(0), end(0), score(0.f), hammingDist(0) {} + size_t start; + size_t end; + float score; + int hammingDist; + vector matches; +}; + + class Taxonomer { private: NcbiTaxonomy * taxonomy; @@ -73,13 +93,28 @@ class Taxonomer { vector & queryList, const LocalParameters &par); - void remainConsecutiveMatches(vector & curFrameMatches, - vector & filteredMatches, + void remainConsecutiveMatches(const vector & curFrameMatches, + vector & matchPaths, TaxID genusId); - - size_t DFS(size_t curMatchIdx, const map>& linkedMatches, - vector& fiteredMatchIdx, size_t depth, size_t MIN_DEPTH, unordered_set& used, - unordered_map & idx2depth); + + float combineMatchPaths(vector & matchPaths, + vector & combinedMatchPaths, + int readLength); + + bool isMatchPathNotOverlapped(const MatchPath & matchPath1, + const MatchPath & matchPath2); + + depthScore DFS(const vector &matches, size_t curMatchIdx, + const map> &linkedMatches, + size_t depth, size_t MIN_DEPTH, unordered_set &used, + unordered_map &idx2depthScore, + unordered_map & edges, float score, int hammingDist); + // depthScore DFS(const vector & curFrameMatches, + // size_t curMatchIdx, + // const map>& linkedMatches, + // size_t depth, size_t MIN_DEPTH, unordered_set& used, + // unordered_map & idx2depth, + // size_t startPos, vector & matchPaths); static bool isConsecutive(const Match * match1, const Match * match2); From 645a212c1bcfca1ebc3e99f1b91beb792ffb51b7 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Tue, 7 Nov 2023 22:59:38 +0900 Subject: [PATCH 56/65] don't extract k-mers span different orfs --- src/commons/SeqIterator.cpp | 22 +++++++++++----------- src/commons/Taxonomer.cpp | 16 ++++++++-------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/commons/SeqIterator.cpp b/src/commons/SeqIterator.cpp 
index dc49b367..46cf14a8 100644 --- a/src/commons/SeqIterator.cpp +++ b/src/commons/SeqIterator.cpp @@ -539,11 +539,11 @@ void SeqIterator::getExtendedORFs(struct _gene *genes, struct _node *nodes, vect frame = (genes[0].begin - 1) % 3; leftEnd = 0; while (leftEnd % 3 != frame) leftEnd++; - blocks.emplace_back(leftEnd, genes[1].begin - 1 + 22, 1); + blocks.emplace_back(leftEnd, genes[1].begin - 2, 1); blockIdx++; } else { frame = (genes[0].end - 1) % 3; - rightEnd = genes[1].begin - 1 + 22; + rightEnd = genes[1].begin - 2; while (rightEnd % 3 != frame) rightEnd--; blocks.emplace_back(0, rightEnd, -1); blockIdx++; @@ -583,12 +583,12 @@ void SeqIterator::getExtendedORFs(struct _gene *genes, struct _node *nodes, vect } else { if (!isReverse) { //forward frame = (genes[geneIdx].begin - 1) % 3; - leftEnd = genes[geneIdx - 1].end - 1 - 22; + leftEnd = genes[geneIdx - 1].end; while (leftEnd % 3 != frame) leftEnd++; blocks.emplace_back(leftEnd, genes[geneIdx].end - 1, 1); blockIdx++; } else { // reverse - blocks.emplace_back(genes[geneIdx - 1].end - 22 - 1, genes[geneIdx].end - 1, -1); + blocks.emplace_back(genes[geneIdx - 1].end, genes[geneIdx].end - 1, -1); blockIdx++; } } @@ -597,24 +597,24 @@ void SeqIterator::getExtendedORFs(struct _gene *genes, struct _node *nodes, vect if (hasBeenExtendedToLeft) { if (!isReverse) { //forward frame = (genes[geneIdx].begin - 1) % 3; - leftEnd = genes[geneIdx - 1].end - 1 - 22; + leftEnd = genes[geneIdx - 1].end; while (leftEnd % 3 != frame) leftEnd++; - blocks.emplace_back(leftEnd, genes[geneIdx + 1].begin - 1 + 22, 1); + blocks.emplace_back(leftEnd, genes[geneIdx + 1].begin - 2, 1); blockIdx++; } else { frame = (genes[geneIdx].end - 1) % 3; - rightEnd = genes[geneIdx + 1].begin - 1 + 22; + rightEnd = genes[geneIdx + 1].begin - 2; while (rightEnd % 3 != frame) rightEnd--; - blocks.emplace_back(genes[geneIdx - 1].end - 1 - 22, rightEnd, -1); + blocks.emplace_back(genes[geneIdx - 1].end, rightEnd, -1); blockIdx++; } } else { if (!isReverse) { //forward - blocks.emplace_back(genes[geneIdx].begin - 1, genes[geneIdx + 1].begin - 1 + 22, 1); + blocks.emplace_back(genes[geneIdx].begin - 1, genes[geneIdx + 1].begin - 2, 1); blockIdx++; } else { frame = (genes[geneIdx].end - 1) % 3; - rightEnd = genes[geneIdx + 1].begin - 1 + 22; + rightEnd = genes[geneIdx + 1].begin - 2; while (rightEnd % 3 != frame) rightEnd--; blocks.emplace_back(genes[geneIdx].begin - 1, rightEnd, -1); blockIdx++; @@ -639,7 +639,7 @@ void SeqIterator::getExtendedORFs(struct _gene *genes, struct _node *nodes, vect // If left region is not covered, cover it. 
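+        // e.g. (illustrative) if the previous gene ends at base 400 and the frame is 1,
+        // the old boundary 400 - 1 - 22 = 377 reached back into that gene, whereas the
+        // new boundary starts at 400 and is only advanced until leftEnd % 3 == frame,
+        // so extracted k-mers no longer span two ORFs.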
leftEnd = genes[numOfGene - 1].begin - 1; if (hasBeenExtendedToLeft) { - leftEnd = genes[numOfGene - 2].end - 1 - 22; + leftEnd = genes[numOfGene - 2].end; if (!isReverse) { frame = (genes[numOfGene - 1].begin - 1) % 3; while (leftEnd % 3 != frame) leftEnd++; diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index a3a3b3b5..6c5fb707 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -168,7 +168,7 @@ void Taxonomer::chooseBestTaxon2(uint32_t currentQuery, // sort(speciesMatches.begin(), speciesMatches.end(), // [](const Match & a, const Match & b) { return a.qInfo.pos < b.qInfo.pos; }); - cout << "7 " << currentQuery << endl; + // cout << "7 " << currentQuery << endl; TaxID result = lowerRankClassification(speciesMatches, speciesScore.taxId); @@ -184,7 +184,7 @@ void Taxonomer::chooseBestTaxon2(uint32_t currentQuery, queryList[currentQuery].coverage = speciesScore.coverage; queryList[currentQuery].hammingDist = speciesScore.hammingDist; queryList[currentQuery].newSpecies = false; - cout << "8" << currentQuery << endl; + // cout << "8" << currentQuery << endl; // if (par.printLog) { // cout << "# " << currentQuery << endl; // for (size_t i = 0; i < genusMatches.size(); i++) { @@ -592,9 +592,9 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, // Initialize species2matchPaths species2matchPaths[currentSpecies].emplace_back(0, 0, 0, 0); // cout << "2" << endl; - cout << currentSpecies << endl; + // cout << currentSpecies << endl; float score = combineMatchPaths(matchPaths, species2matchPaths[currentSpecies], readLength1 + readLength2); - cout << endl; + // cout << endl; species2score[currentSpecies] = score; if (score > bestSpScore) { bestSpScore = score; @@ -611,7 +611,7 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, // cout << "4" << endl; vector maxSpecies; for (auto & spScore : species2score) { - cout << spScore.first << " " << spScore.second << endl; + // cout << spScore.first << " " << spScore.second << endl; if (spScore.second == bestSpScore) { maxSpecies.push_back(spScore.first); } @@ -664,10 +664,10 @@ float Taxonomer::combineMatchPaths(vector & matchPaths, // 2. Add the matchPath with the highest score that is not overlapped with the matchPath in combinedMatchPaths // 3. 
Repeat 2 until no matchPath can be added for (size_t i = 0; i < matchPaths.size(); i++) { - cout << matchPaths[i].start << " " << matchPaths[i].end << " " << matchPaths[i].score << " " << matchPaths[i].matches.back()->targetId << " " << matchPaths[i].matches.back()->qInfo.frame <targetId << " " << matchPaths[i].matches.back()->qInfo.frame <printMatch(); @@ -685,7 +685,7 @@ float Taxonomer::combineMatchPaths(vector & matchPaths, } if (!isOverlapped) { combinedMatchPaths.push_back(matchPaths[i]); - combinedMatchPaths.back().matches = matchPaths[i].matches; + // combinedMatchPaths.back().matches = matchPaths[i].matches; // cout << matchPaths[i].start << " " << matchPaths[i].end << " " << matchPaths[i].score << endl; // for (auto & match : matchPaths[i].matches) { // match->printMatch(); From ce5b4324ac0cb26cc98540508cf97375f20353eb Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Wed, 8 Nov 2023 23:40:53 +0900 Subject: [PATCH 57/65] merge two linked match paths --- src/commons/Taxonomer.cpp | 87 +++++++++++++++++++++++++------ src/commons/Taxonomer.h | 11 ++-- src/util/mapping2taxon.cpp | 103 +++++++++++++++++++++++++++++++++++++ 3 files changed, 182 insertions(+), 19 deletions(-) create mode 100644 src/util/mapping2taxon.cpp diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index 6c5fb707..e3c49b2b 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -1,4 +1,5 @@ #include "Taxonomer.h" +#include "BitManipulateMacros.h" #include "Match.h" #include "NcbiTaxonomy.h" #include @@ -593,7 +594,7 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, species2matchPaths[currentSpecies].emplace_back(0, 0, 0, 0); // cout << "2" << endl; // cout << currentSpecies << endl; - float score = combineMatchPaths(matchPaths, species2matchPaths[currentSpecies], readLength1 + readLength2); + float score = combineMatchPaths(matchPaths, species2matchPaths[currentSpecies], readLength1 + readLength2, matchList); // cout << endl; species2score[currentSpecies] = score; if (score > bestSpScore) { @@ -653,7 +654,7 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, float Taxonomer::combineMatchPaths(vector & matchPaths, vector & combinedMatchPaths, - int readLength) { + int readLength, const Match * matchList) { combinedMatchPaths.clear(); // Sort matchPaths by the their score sort(matchPaths.begin(), matchPaths.end(), @@ -675,13 +676,16 @@ float Taxonomer::combineMatchPaths(vector & matchPaths, } else { bool isOverlapped = false; for (size_t j = 0; j < combinedMatchPaths.size(); j++) { - if (!isMatchPathNotOverlapped(matchPaths[i], combinedMatchPaths[j])) { + if (isMatchPathOverlapped(matchPaths[i], combinedMatchPaths[j])) { // overlap! 
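+                // e.g. (toy coordinates) paths [0, 104] and [84, 188] overlap by
+                // min(104, 188) - max(0, 84) + 1 = 21 bases, i.e. the last metamer of
+                // the earlier path and the first of the later one are a single codon
+                // apart; isMatchPathLinked() below then decides whether to merge them.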
+ if (isMatchPathLinked(matchPaths[i], combinedMatchPaths[j])) { + // merge two linked matchPaths by editing the combinedMatchPaths[j] + mergeMatchPaths(matchPaths[i], combinedMatchPaths[j]); + break; + } else { + break; + } isOverlapped = true; - break; - } else { - // cout << matchPaths[i].start << " " << matchPaths[i].end << endl; - // cout << combinedMatchPaths[j].start << " " << combinedMatchPaths[j].end << endl << endl;; - } + } } if (!isOverlapped) { combinedMatchPaths.push_back(matchPaths[i]); @@ -693,10 +697,6 @@ float Taxonomer::combineMatchPaths(vector & matchPaths, } } } - // cout << endl; - - - // Calculate the score of combinedMatchPaths float score = 0; for (auto & matchPath : combinedMatchPaths) { @@ -705,12 +705,59 @@ float Taxonomer::combineMatchPaths(vector & matchPaths, return score / readLength; } -bool Taxonomer::isMatchPathNotOverlapped(const MatchPath & matchPath1, - const MatchPath & matchPath2) { - return (matchPath1.end < matchPath2.start) || (matchPath2.end < matchPath1.start); +bool Taxonomer::isMatchPathLinked(const MatchPath & matchPath1, const MatchPath & matchPath2) { + int overlappedLength = min(matchPath1.end, matchPath2.end) - max(matchPath1.start, matchPath2.start) + 1; + if (!(20 < overlappedLength && overlappedLength < 24)) { + return false; + } + const Match * last; + const Match * first; + if (matchPath1.start < matchPath2.start) { + last = matchPath1.matches.back(); + first = matchPath2.matches.front(); + } else { + last = matchPath2.matches.back(); + first = matchPath1.matches.front(); + } + if (overlappedLength == 21) { + return isConsecutive(last, first); + } else { + return isConsecutive_diffFrame(last, first); + } + return false; +} + +bool Taxonomer::isMatchPathOverlapped(const MatchPath & matchPath1, + const MatchPath & matchPath2) { + return !((matchPath1.end < matchPath2.start) || (matchPath2.end < matchPath1.start)); +} +// 87654321 +void Taxonomer::mergeMatchPaths(const MatchPath & source, MatchPath & target) { + if (source.start < target.start) { + target.start = source.start; + uint8_t lastEndHamming = GET_2_BITS(target.matches.front()->rightEndHamming); + target.hammingDist += source.hammingDist - (source.matches.back()->hamming - lastEndHamming); + target.score += source.score - source.matches.back()->getScore(); + if (lastEndHamming == 0) { + target.score += 3.0f; + } else { + target.score += 2.0f - 0.5f * lastEndHamming; + } + target.matches.insert(target.matches.begin(), source.matches.begin(), source.matches.end() - 1); + } else { + target.end = source.end; + uint8_t lastEndHamming = GET_2_BITS(source.matches.front()->rightEndHamming >> 14); + target.hammingDist += source.hammingDist - (source.matches.front()->hamming - lastEndHamming); + target.score += source.score - source.matches.front()->getScore(); + if (lastEndHamming == 0) { + target.score += 3.0f; + } else { + target.score += 2.0f - 0.5f * lastEndHamming; + } + target.matches.insert(target.matches.end(), source.matches.begin() + 1, source.matches.end()); + } } -\ // if (matchPath1.start > matchPath2.start) { // return isMatchPathOverlapped(matchPath2, matchPath1, readLength); @@ -1678,4 +1725,12 @@ bool Taxonomer::isConsecutive(const Match * match1, const Match * match2) { // match1 87654321 -> 08765432 // match2 98765432 -> 08765432 return (match1->rightEndHamming >> 2) == (match2->rightEndHamming & 0x3FFF); +} + +bool Taxonomer::isConsecutive_diffFrame(const Match * match1, const Match * match2) { + // int hamming1 = match1->hamming - 
GET_2_BITS(match1->rightEndHamming);
+    // int hamming2 = match2->hamming - GET_2_BITS(match2->rightEndHamming >> 14);
+    // match1 87654321 -> 08765432
+    // match2 98765432 -> 08765432
+    return (match1->hamming - GET_2_BITS(match1->rightEndHamming)) == (match2->hamming - GET_2_BITS(match2->rightEndHamming >> 14));
 }
\ No newline at end of file
diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h
index 95365b9b..722ced8b 100644
--- a/src/commons/Taxonomer.h
+++ b/src/commons/Taxonomer.h
@@ -99,10 +99,13 @@ class Taxonomer {
 
     float combineMatchPaths(vector<MatchPath> & matchPaths,
                             vector<MatchPath> & combinedMatchPaths,
-                            int readLength);
+                            int readLength, const Match * matchList);
 
-    bool isMatchPathNotOverlapped(const MatchPath & matchPath1,
-                                  const MatchPath & matchPath2);
+    bool isMatchPathOverlapped(const MatchPath & matchPath1, const MatchPath & matchPath2);
+
+    bool isMatchPathLinked(const MatchPath & matchPath1, const MatchPath & matchPath2);
+
+    void mergeMatchPaths(const MatchPath & source, MatchPath & target);
 
     depthScore DFS(const vector<const Match *> &matches, size_t curMatchIdx,
                    const map<size_t, vector<size_t>> &linkedMatches,
                    size_t depth, size_t MIN_DEPTH, unordered_set<size_t> &used,
                    unordered_map<size_t, depthScore> &idx2depthScore,
                    unordered_map<const Match *, const Match *> & edges, float score, int hammingDist);
@@ -118,6 +121,8 @@ class Taxonomer {
 
     static bool isConsecutive(const Match * match1, const Match * match2);
 
+    static bool isConsecutive_diffFrame(const Match * match1, const Match * match2);
+
     TaxonScore getBestGenusMatches(vector<Match> &matchesForMajorityLCA, const Match *matchList, size_t end,
                                    size_t offset, int queryLength);
 
diff --git a/src/util/mapping2taxon.cpp b/src/util/mapping2taxon.cpp
new file mode 100644
index 00000000..e27928c9
--- /dev/null
+++ b/src/util/mapping2taxon.cpp
@@ -0,0 +1,103 @@
+#include <iostream>
+#include <string>
+#include <vector>
+#include <unordered_map>
+#include "Command.h"
+#include "LocalParameters.h"
+#include "NcbiTaxonomy.h"
+#include "common.h"
+#include "fstream"
+#include <sstream>
+#include <algorithm>
+
+using namespace std;
+
+struct read2taxon {
+    string read;
+    TaxID taxon;
+};
+
+int parseTaxId_metamaps(const string & mappingRes) {
+    vector<string> tokens = Util::split(mappingRes, " ");
+    return stoi(Util::split(tokens[5], "|")[2]);
+}
+
+// It takes a mapping result of Metamaps.
+// The mapping result includes multiple mappings for a read, which have mapping scores.
+// The function returns the taxon ID of the best mapping.
+// If there are multiple mappings with the same best score, it returns the LCA of them.
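+// For a hypothetical input line (whitespace-separated; per the parsing above, the 6th
+// field carries the taxon as 'name|x|taxid' and the 14th the mapping score), e.g.:
+//   read1 0 1500 0 + ref|x|1280 2800000 100 900 1400 60 255 40 0.97
+// parseTaxId_metamaps() returns 1280, and the loop below keeps, per read, the taxon
+// of the highest-scoring mapping, LCA-merging equal-scoring candidates.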
+int mapping2taxon(int argc, const char **argv, const Command &command) { + LocalParameters &par = LocalParameters::getLocalInstance(); + par.parseParameters(argc, argv, command, false, Parameters::PARSE_ALLOW_EMPTY, 0); + string mappingFile = par.filenames[0]; + string taxonomyDir = par.filenames[1]; + string output = mappingFile + ".reads2taxon"; + ofstream out(output); + + vector read2taxon; + NcbiTaxonomy *taxonomy = loadTaxonomy("", taxonomyDir); + cout << "Taxonomy loaded" << endl; + + // Iterate through mapping file + ifstream mapping(mappingFile); + string line; + vector taxIds; + string previousRead = ""; + double bestScore = -2; + TaxID bestTaxId = -1; + bool lastStored = false; + + while (getline(mapping, line)) { + vector tokens = Util::split(line, " "); + string currentRead = tokens[0]; + if (currentRead == previousRead) { // Same read + // Get score + stringstream scoreString(tokens[13]); + double curScore = 0; + scoreString >> curScore; + + if (curScore > bestScore) { + taxIds.clear(); + bestScore = curScore; + bestTaxId = parseTaxId_metamaps(line); + taxIds.push_back(bestTaxId); + } else if (curScore == bestScore) { + taxIds.push_back(parseTaxId_metamaps(line)); + bestTaxId = taxonomy->LCA(taxIds)->taxId; + } + lastStored = false; + } else { // New read + // Store results for previous read + // out << previousRead << "\t" << bestTaxId << endl; + read2taxon.push_back({previousRead, bestTaxId}); + lastStored = true; + + // Initialize variables + previousRead = currentRead; + taxIds.clear(); + + // Get score + stringstream scoreString(tokens[13]); + double curScore = 0; + scoreString >> curScore; + + // Update variables + bestScore = curScore; + bestTaxId = parseTaxId_metamaps(line); + taxIds.push_back(bestTaxId); + } + } + + if (!lastStored) { + // out << previousRead << "\t" << bestTaxId << endl; + read2taxon.push_back({previousRead, bestTaxId}); + } + + // Write to file + cout << "Writing to file" << endl; + for (size_t i = 1; i < read2taxon.size(); i++) { + out << read2taxon[i].read << "\t" << read2taxon[i].taxon << "\n"; + } + + return 0; +} \ No newline at end of file From 61c9fff1d75a769818e2e82eb38d0d2ae3fe7fb0 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Thu, 9 Nov 2023 15:36:02 +0900 Subject: [PATCH 58/65] 1. Trimming overlapping match paths 2. Gather all the matches of selected species for lower rank classification 3. 
hamming <= min(minHamming, 6) --- src/commons/KmerMatcher.cpp | 8 +- src/commons/Match.h | 9 +- src/commons/Taxonomer.cpp | 887 ++++++------------------------------ src/commons/Taxonomer.h | 11 +- 4 files changed, 135 insertions(+), 780 deletions(-) diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp index 8a13ceb1..084b919b 100644 --- a/src/commons/KmerMatcher.cpp +++ b/src/commons/KmerMatcher.cpp @@ -292,7 +292,6 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI // } matches[matchCnt] = {queryKmerList[j].info, candidateKmerInfos[idx].sequenceID, - taxId2genusId[candidateKmerInfos[idx].sequenceID], taxId2speciesId[candidateKmerInfos[idx].sequenceID], selectedHammings[k], selectedHammingSum[k], @@ -334,7 +333,6 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI // } matches[matchCnt] = {queryKmerList[j].info, candidateKmerInfos[idx].sequenceID, - taxId2genusId[candidateKmerInfos[idx].sequenceID], taxId2speciesId[candidateKmerInfos[idx].sequenceID], selectedHammings[k], selectedHammingSum[k], @@ -430,7 +428,6 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI // } matches[matchCnt] = {queryKmerList[j].info, candidateKmerInfos[idx].sequenceID, - taxId2genusId[candidateKmerInfos[idx].sequenceID], taxId2speciesId[candidateKmerInfos[idx].sequenceID], selectedHammings[k], selectedHammingSum[k], @@ -516,7 +513,7 @@ void KmerMatcher::compareDna(uint64_t query, // Select target k-mers that passed hamming criteria for (size_t h = 0; h < size; h++) { - if (hammingSums[h] <= 6) {// minHammingSum + hammingMargin) { + if (hammingSums[h] <= min(minHammingSum * 2, 6)) { selectedMatches.push_back(h); selectedHammingSum.push_back(hammingSums[h]); if (frame < 3) { @@ -534,9 +531,6 @@ bool KmerMatcher::compareMatches(const Match& a, const Match& b) { if (a.qInfo.sequenceID != b.qInfo.sequenceID) return a.qInfo.sequenceID < b.qInfo.sequenceID; - if (a.genusId != b.genusId) - return a.genusId < b.genusId; - if (a.speciesId != b.speciesId) return a.speciesId < b.speciesId; diff --git a/src/commons/Match.h b/src/commons/Match.h index f47881ef..9bc1fb44 100644 --- a/src/commons/Match.h +++ b/src/commons/Match.h @@ -6,21 +6,19 @@ #include #include "BitManipulateMacros.h" -struct Match { // 24 byte +struct Match { // 20 byte Match(){} Match(QueryKmerInfo qInfo, int targetId, - TaxID genusId, TaxID speciesId, uint16_t eachHamming, uint8_t hamming, bool redundancy): - qInfo(qInfo), targetId(targetId), genusId(genusId), speciesId(speciesId), + qInfo(qInfo), targetId(targetId), speciesId(speciesId), rightEndHamming(eachHamming), hamming(hamming), redundancy(redundancy) { } QueryKmerInfo qInfo; // 8 TaxID targetId; // 4 taxonomy id infact - TaxID genusId; // 4 TaxID speciesId; // 4 uint16_t rightEndHamming; // 2 uint8_t hamming; // 1 @@ -28,10 +26,9 @@ struct Match { // 24 byte void printMatch() const { std::cout << qInfo.sequenceID << " " << qInfo.pos << " " << qInfo.frame << " " - << targetId << " " << genusId << " " << speciesId << " " << rightEndHamming << " " << (int)hamming << " " << getScore() << std::endl; + << targetId << " " << speciesId << " " << rightEndHamming << " " << (int)hamming << " " << getScore() << std::endl; } - float getScore(float score = 0.0f, int cnt = 0) const { int currentHamming = GET_2_BITS(rightEndHamming >> cnt * 2); if (currentHamming == 0) { diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index e3c49b2b..c0760d8c 100644 --- a/src/commons/Taxonomer.cpp 
+++ b/src/commons/Taxonomer.cpp @@ -2,6 +2,7 @@ #include "BitManipulateMacros.h" #include "Match.h" #include "NcbiTaxonomy.h" +#include "printBinary.h" #include #include #include @@ -59,7 +60,7 @@ void Taxonomer::assignTaxonomy(const Match *matchList, { #pragma omp for schedule(dynamic, 1) for (size_t i = 0; i < blockIdx; ++i) { - chooseBestTaxon2(matchBlocks[i].id, + chooseBestTaxon(matchBlocks[i].id, matchBlocks[i].start, matchBlocks[i].end, matchList, @@ -76,12 +77,12 @@ void Taxonomer::assignTaxonomy(const Match *matchList, } -void Taxonomer::chooseBestTaxon2(uint32_t currentQuery, - size_t offset, - size_t end, - const Match *matchList, - vector & queryList, - const LocalParameters &par) { +void Taxonomer::chooseBestTaxon(uint32_t currentQuery, + size_t offset, + size_t end, + const Match *matchList, + vector & queryList, + const LocalParameters &par) { // if (true) { // cout << "# " << currentQuery << " " << queryList[currentQuery].name << endl; @@ -89,7 +90,6 @@ void Taxonomer::chooseBestTaxon2(uint32_t currentQuery, // cout << matchList[i].targetId << " " << matchList[i].qInfo.frame << " " << matchList[i].qInfo.pos << " " << int(matchList[i].hamming) << " " << int(matchList[i].redundancy) << endl; // } // } - // Get the best species for current query vector speciesMatches; speciesMatches.reserve(end - offset + 1); @@ -124,21 +124,11 @@ void Taxonomer::chooseBestTaxon2(uint32_t currentQuery, // If there are two or more good species level candidates, find the LCA. if (speciesScore.LCA) { - // cout << "LCA" << endl; - // vector genusList; - // genusList.reserve(speciesMatches.size()); - // for (auto & genusMatch : speciesMatches) { - // genusList.push_back(genusMatch.genusId); - // } - // selectedTaxon = taxonomy->LCA(genusList)->taxId; queryList[currentQuery].isClassified = true; queryList[currentQuery].classification = speciesScore.taxId; queryList[currentQuery].score = speciesScore.score; queryList[currentQuery].coverage = speciesScore.coverage; queryList[currentQuery].hammingDist = speciesScore.hammingDist; - // for (auto & spMatch : speciesMatches) { - // queryList[currentQuery].taxCnt[spMatch.targetId]++; - // } return; } @@ -156,20 +146,9 @@ void Taxonomer::chooseBestTaxon2(uint32_t currentQuery, return; } - // Sort matches by the position of the query sequence -// sort(genusMatches.begin() + speciesMatchRange[selectedSpecies].first, -// genusMatches.begin() + speciesMatchRange[selectedSpecies].second, -// [](const Match & a, const Match & b) { -// if (a.qInfo.position / 3 == b.qInfo.position / 3) -// return a.hamming < b.hamming; -// else -// return a.qInfo.position / 3 < b.qInfo.position / 3; -// }); - - // sort(speciesMatches.begin(), speciesMatches.end(), - // [](const Match & a, const Match & b) { return a.qInfo.pos < b.qInfo.pos; }); - - // cout << "7 " << currentQuery << endl; + // Sort matches by the coordinate of the query + sort(speciesMatches.begin(), speciesMatches.end(), + [](const Match & a, const Match & b) { return a.qInfo.pos < b.qInfo.pos; }); TaxID result = lowerRankClassification(speciesMatches, speciesScore.taxId); @@ -185,7 +164,6 @@ void Taxonomer::chooseBestTaxon2(uint32_t currentQuery, queryList[currentQuery].coverage = speciesScore.coverage; queryList[currentQuery].hammingDist = speciesScore.hammingDist; queryList[currentQuery].newSpecies = false; - // cout << "8" << currentQuery << endl; // if (par.printLog) { // cout << "# " << currentQuery << endl; // for (size_t i = 0; i < genusMatches.size(); i++) { @@ -199,227 +177,36 @@ void 
Taxonomer::chooseBestTaxon2(uint32_t currentQuery, // } } -// void Taxonomer::chooseBestTaxon(uint32_t currentQuery, -// size_t offset, -// size_t end, -// const Match *matchList, -// vector & queryList, -// const LocalParameters &par) { -// TaxID selectedTaxon; - -// // if (true) { -// // cout << "# " << currentQuery << " " << queryList[currentQuery].name << endl; -// // for (size_t i = offset; i < end + 1; i++) { -// // cout << matchList[i].targetId << " " << matchList[i].qInfo.frame << " " << matchList[i].qInfo.pos << " " << int(matchList[i].hamming) << " " << int(matchList[i].redundancy) << endl; -// // } -// // } - -// // Get the best genus for current query -// vector genusMatches; -// genusMatches.reserve(end - offset + 1); -// TaxonScore genusScore(0, 0, 0, 0); -// if (par.seqMode == 2) { -// genusScore = getBestGenusMatches(genusMatches, matchList, end, offset, -// queryList[currentQuery].queryLength, -// queryList[currentQuery].queryLength2); -// } else { -// genusScore = getBestGenusMatches(genusMatches, matchList, end, offset, -// queryList[currentQuery].queryLength); -// } - -// // if (true) { -// // cout << "# " << currentQuery << " " << queryList[currentQuery].name << " filtered\n"; -// // for (size_t i = 0; i < genusMatches.size(); i++) { -// // cout << genusMatches[i].targetId << " " << genusMatches[i].qInfo.frame << " " << genusMatches[i].qInfo.pos << " " << int(genusMatches[i].hamming) << " " << int(genusMatches[i].redundancy) << endl; -// // } -// // cout << "Genus score: " << genusScore.score << "\n"; -// // } - -// // If there is no proper genus for current query, it is un-classified. -// if (genusScore.score == 0 || genusScore.coverage < par.minCoverage || genusScore.score < par.minScore) { -// queryList[currentQuery].isClassified = false; -// queryList[currentQuery].classification = 0; -// queryList[currentQuery].score = genusScore.score; -// queryList[currentQuery].coverage = genusScore.coverage; -// queryList[currentQuery].hammingDist = genusScore.hammingDist; -// queryList[currentQuery].newSpecies = false; -// return; -// } - -// // If there are two or more good genus level candidates, find the LCA. -// if (genusScore.taxId == 0) { -// vector genusList; -// genusList.reserve(genusMatches.size()); -// for (auto & genusMatch : genusMatches) { -// genusList.push_back(genusMatch.genusId); -// } -// selectedTaxon = taxonomy->LCA(genusList)->taxId; -// queryList[currentQuery].isClassified = true; -// queryList[currentQuery].classification = selectedTaxon; -// queryList[currentQuery].score = genusScore.score; -// queryList[currentQuery].coverage = genusScore.coverage; -// queryList[currentQuery].hammingDist = genusScore.hammingDist; -// for (auto & genusMatch : genusMatches) { -// queryList[currentQuery].taxCnt[genusMatch.targetId]++; -// } -// return; -// } - -// // Choose the species with the highest coverage. 
-// TaxID selectedSpecies; -// TaxonScore speciesScore; -// vector species; -// unordered_map> speciesMatchRange; -// if (par.seqMode == 2) { -// speciesScore = chooseSpecies(genusMatches, -// queryList[currentQuery].queryLength, -// queryList[currentQuery].queryLength2, -// species, -// speciesMatchRange); -// } else { -// speciesScore = chooseSpecies(genusMatches, -// queryList[currentQuery].queryLength, -// species, -// speciesMatchRange); -// } - - -// // Classify to LCA if more than one species are selected -// if (species.size() > 1) { -// queryList[currentQuery].isClassified = true; -// queryList[currentQuery].classification = taxonomy->LCA(species)->taxId; -// queryList[currentQuery].score = genusScore.score; -// queryList[currentQuery].coverage = genusScore.coverage; -// queryList[currentQuery].hammingDist = genusScore.hammingDist; -// for (auto & genusMatch : genusMatches) { -// queryList[currentQuery].taxCnt[genusMatch.targetId]++; -// } -// return; -// } - -// // If score is not enough, classify to the parent of the selected species -// if (speciesScore.score < par.minSpScore) { -// queryList[currentQuery].isClassified = true; -// queryList[currentQuery].classification = taxonomy->taxonNode( -// taxonomy->getTaxIdAtRank(species[0], "species"))->parentTaxId; -// queryList[currentQuery].score = genusScore.score; -// queryList[currentQuery].coverage = genusScore.coverage; -// queryList[currentQuery].hammingDist = genusScore.hammingDist; -// for (auto & genusMatch : genusMatches) { -// if(genusMatch.speciesId == species[0]){ -// queryList[currentQuery].taxCnt[genusMatch.targetId]++; -// } -// } -// return; -// } - -// // Sort matches by the position of the query sequence -// selectedSpecies = species[0]; -// // sort(genusMatches.begin() + speciesMatchRange[selectedSpecies].first, -// // genusMatches.begin() + speciesMatchRange[selectedSpecies].second, -// // [](const Match & a, const Match & b) { -// // if (a.qInfo.position / 3 == b.qInfo.position / 3) -// // return a.hamming < b.hamming; -// // else -// // return a.qInfo.position / 3 < b.qInfo.position / 3; -// // }); - -// vector::const_iterator first = genusMatches.begin() + speciesMatchRange[selectedSpecies].first; -// vector::const_iterator last = genusMatches.begin() + speciesMatchRange[selectedSpecies].second; -// vector speciesMatches(first, last); - - -// sort(speciesMatches.begin(), speciesMatches.end(), -// [](const Match & a, const Match & b) { return a.qInfo.pos < b.qInfo.pos; }); - -// TaxID result = lowerRankClassification(speciesMatches, selectedSpecies); - -// // Record matches of selected species -// for (size_t i = speciesMatchRange[selectedSpecies].first; i < speciesMatchRange[selectedSpecies].second; i++) { -// queryList[currentQuery].taxCnt[genusMatches[i].targetId]++; -// } - -// // Store classification results -// queryList[currentQuery].isClassified = true; -// queryList[currentQuery].classification = result; -// queryList[currentQuery].score = speciesScore.score; -// queryList[currentQuery].coverage = speciesScore.coverage; -// queryList[currentQuery].hammingDist = speciesScore.hammingDist; -// queryList[currentQuery].newSpecies = false; -// // if (par.printLog) { -// // cout << "# " << currentQuery << endl; -// // for (size_t i = 0; i < genusMatches.size(); i++) { -// // cout << i << " " << genusMatches[i].qInfo.pos << " " << -// // genusMatches[i].targetId << " " << int(genusMatches[i].hamming) << endl; -// // } -// // cout << "Score: " << speciesScore.score << " " << selectedSpecies << " " -// // << 
taxonomy->getString(taxonomy->taxonNode(selectedSpecies)->rankIdx) -// // -// // << endl; -// // } -// } - TaxID Taxonomer::lowerRankClassification(vector &matches, TaxID spTaxId) { unordered_map taxCnt; size_t matchNum = matches.size(); - // cout << spTaxId << endl; - for (size_t i = 0; i < matchNum; i++) { // cout << matches[i].targetId << endl; - taxCnt[matches[i].targetId] ++; - // size_t currQuotient = matches[i].qInfo.pos / 3; - // uint8_t minHamming = 0; //matches[i].hamming; - // Match * minHammingMatch = & matches[i]; - // TaxID minHammingTaxId = minHammingMatch->targetId; - // while ((i < matchNum) && (currQuotient == matches[i].qInfo.pos / 3)) { - // if (matches[i].hamming < minHamming) { - // minHamming = matches[i].hamming; - // minHammingMatch = & matches[i]; - // minHammingTaxId = minHammingMatch->targetId; - // } else if (matches[i].hamming == minHamming) { - // minHammingTaxId = taxonomy->LCA(minHammingTaxId, matches[i].targetId); - // minHammingMatch->redundancy = true; - // matches[i].redundancy = true; - // } - // i++; - // } - // taxCnt[minHammingTaxId]++; + // taxCnt[matches[i].targetId] ++; + size_t currQuotient = matches[i].qInfo.pos / 3; + uint8_t minHamming = matches[i].hamming; + Match * minHammingMatch = & matches[i]; + TaxID minHammingTaxId = minHammingMatch->targetId; + while ((i < matchNum) && (currQuotient == matches[i].qInfo.pos / 3)) { + if (matches[i].hamming < minHamming) { + minHamming = matches[i].hamming; + minHammingMatch = & matches[i]; + minHammingTaxId = minHammingMatch->targetId; + } else if (matches[i].hamming == minHamming) { + minHammingTaxId = taxonomy->LCA(minHammingTaxId, matches[i].targetId); + minHammingMatch->redundancy = true; + matches[i].redundancy = true; + } + i++; + } + taxCnt[minHammingTaxId]++; } - // int i = matchRange.second - 1; - // while ( i >= matchRange.first ) { - // size_t currQuotient = matches[i].qInfo.pos / 3; - // uint8_t minHamming = matches[i].hamming; - // Match * minHammingMatch = & matches[i]; - // TaxID minHammingTaxId = minHammingMatch->targetId; - // i --; - // while ( (i >= matchRange.first) && (currQuotient == matches[i].qInfo.pos / 3) ) { - // if (matches[i].hamming < minHamming) { - // minHamming = matches[i].hamming; - // minHammingMatch = & matches[i]; - // minHammingTaxId = minHammingMatch->targetId; - // } else if (matches[i].hamming == minHamming) { - // minHammingTaxId = taxonomy->LCA(minHammingTaxId, matches[i].targetId); - // minHammingMatch->redundancy = true; - // matches[i].redundancy = true; - // } - // i--; - // } - // taxCnt[minHammingTaxId]++; - // } - unordered_map cladeCnt; - // cout << "8" << endl; getSpeciesCladeCounts(taxCnt, cladeCnt, spTaxId); - // // print cladeCnt - // for (auto it = cladeCnt.begin(); it != cladeCnt.end(); it++) { - // cout << it->first << " " << it->second.taxCount << " " << it->second.cladeCount << endl; - // } - // cout << "9" << endl; if (accessionLevel == 2) { // Don't do accession-level classification // Remove leaf nodes - // cout << "10" << endl; for (auto it = cladeCnt.begin(); it != cladeCnt.end(); it++) { TaxonNode const * taxon = taxonomy->taxonNode(it->first); if (strcmp(taxonomy->getString(taxon->rankIdx), "") == 0) { @@ -429,10 +216,8 @@ TaxID Taxonomer::lowerRankClassification(vector &matches, TaxID spTaxId) it->first)); } } - return BFS(cladeCnt, spTaxId); } else { - // cout << "10-2" << endl; return BFS(cladeCnt, spTaxId); } } @@ -441,8 +226,6 @@ void Taxonomer::getSpeciesCladeCounts(const unordered_map & unordered_map & cladeCount, TaxID 
speciesTaxID) { for (auto it = taxCnt.begin(); it != taxCnt.end(); ++it) { -// cladeCount[it->first].taxCount = it->second; -// cladeCount[it->first].cladeCount += it->second; TaxonNode const * taxon = taxonomy->taxonNode(it->first); cladeCount[taxon->taxId].taxCount = it->second; cladeCount[taxon->taxId].cladeCount += it->second; @@ -487,21 +270,22 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, size_t end, size_t offset, int queryLength) { - TaxID currentSpecies; - vector filteredMatches; vector> matchesForEachSpecies; - vector speciesScores; TaxonScore bestScore; - size_t i = offset; - uint8_t curFrame; vector curFrameMatches; vector matchPaths; + unordered_map species2score; + unordered_map> species2matchPaths; + float bestSpScore = 0; + unordered_map> speciesMatchRange; - while (i < end + 1) { - currentSpecies = matchList[i].speciesId; + size_t i = offset; + while (i < end + 1) { + TaxID currentSpecies = matchList[i].speciesId; + size_t start = i; // For current species while ((i < end + 1) && currentSpecies == matchList[i].speciesId) { - curFrame = matchList[i].qInfo.frame; + uint8_t curFrame = matchList[i].qInfo.frame; curFrameMatches.clear(); // For current frame while ((i < end + 1) && currentSpecies == matchList[i].speciesId && curFrame == matchList[i].qInfo.frame) { @@ -512,44 +296,61 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, remainConsecutiveMatches(curFrameMatches, matchPaths, currentSpecies); } } - // Construct a match combination using filtered matches of current species - // so that it can best cover the query, and score the combination - if (!filteredMatches.empty()) { - matchesForEachSpecies.push_back(filteredMatches); - speciesScores.push_back(scoreTaxon(filteredMatches, currentSpecies, queryLength)); + speciesMatchRange[currentSpecies] = make_pair(start, i); + // Combine MatchPaths + if (!matchPaths.empty()) { + // cout << currentSpecies << endl; + float score = combineMatchPaths(matchPaths, species2matchPaths[currentSpecies], queryLength); + // cout << endl; + if (score > 1.0f) {score = 1.0f;} + species2score[currentSpecies] = score; + if (score > bestSpScore) { + bestSpScore = score; + } } - filteredMatches.clear(); + matchPaths.clear(); } // If there are no meaningful species - if (speciesScores.empty()) { + if (species2score.empty()) { bestScore.score = 0; return bestScore; } - TaxonScore maxScore = *max_element(speciesScores.begin(), speciesScores.end(), - [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); - - vector maxIdx; - for (size_t g = 0; g < speciesScores.size(); g++) { - if (speciesScores[g].score == maxScore.score) { - maxIdx.push_back(g); + vector maxSpecies; + for (auto & spScore : species2score) { + if (spScore.second > bestSpScore * 0.99) { + maxSpecies.push_back(spScore.first); } } - bestScore = maxScore; - for (unsigned long g : maxIdx) { - for (const Match * m : matchesForEachSpecies[g]) { - speciesMatches.push_back(*m); + // More than one species --> LCA + if (maxSpecies.size() > 1) { + bestScore.LCA = true; + bestScore.taxId = taxonomy->LCA(maxSpecies)->taxId; + for (auto & sp : maxSpecies) { + bestScore.score += species2score[sp]; } + bestScore.score /= maxSpecies.size(); + return bestScore; } - // More than one species - if (maxIdx.size() > 1) { - bestScore.taxId = 0; + // One species + bestScore.taxId = maxSpecies[0]; + bestScore.score = species2score[maxSpecies[0]]; + float coveredLength = 0.f; + int hammingDist = 0; + for (auto & matchPath : 
species2matchPaths[maxSpecies[0]]) { + coveredLength += matchPath.end - matchPath.start + 1; + hammingDist += matchPath.hammingDist; + for (size_t i = speciesMatchRange[bestScore.taxId].first; i < speciesMatchRange[bestScore.taxId].second; i++) { + speciesMatches.push_back(matchList[i]); + } } - - return bestScore; + bestScore.coverage = coveredLength / queryLength; + bestScore.hammingDist = hammingDist; + + return bestScore; } TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, @@ -558,24 +359,22 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, size_t offset, int readLength1, int readLength2) { - TaxID currentSpecies; - vector filteredMatches; vector> matchesForEachSpecies; - vector speciesScores; - TaxonScore bestScore; - size_t i = offset; - uint8_t curFrame; + TaxonScore bestScore; vector curFrameMatches; vector matchPaths; unordered_map species2score; unordered_map> species2matchPaths; float bestSpScore = 0; - - while (i < end + 1) { - currentSpecies = matchList[i].speciesId; + unordered_map> speciesMatchRange; + + size_t i = offset; + while (i < end + 1) { + TaxID currentSpecies = matchList[i].speciesId; + size_t start = i; // For current species while ((i < end + 1) && currentSpecies == matchList[i].speciesId) { - curFrame = matchList[i].qInfo.frame; + uint8_t curFrame = matchList[i].qInfo.frame; curFrameMatches.clear(); // For current frame while ((i < end + 1) && currentSpecies == matchList[i].speciesId && curFrame == matchList[i].qInfo.frame) { @@ -583,19 +382,16 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, i ++; } if (curFrameMatches.size() > 1) { - // cout << "1" << endl; remainConsecutiveMatches(curFrameMatches, matchPaths, currentSpecies); } } + speciesMatchRange[currentSpecies] = make_pair(start, i); // Combine MatchPaths - // so that it can best cover the query, and score the combination if (!matchPaths.empty()) { - // Initialize species2matchPaths - species2matchPaths[currentSpecies].emplace_back(0, 0, 0, 0); - // cout << "2" << endl; // cout << currentSpecies << endl; - float score = combineMatchPaths(matchPaths, species2matchPaths[currentSpecies], readLength1 + readLength2, matchList); + float score = combineMatchPaths(matchPaths, species2matchPaths[currentSpecies], readLength1 + readLength2); // cout << endl; + if (score > 1.0f) {score = 1.0f;} species2score[currentSpecies] = score; if (score > bestSpScore) { bestSpScore = score; @@ -609,15 +405,14 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, bestScore.score = 0; return bestScore; } - // cout << "4" << endl; + vector maxSpecies; for (auto & spScore : species2score) { - // cout << spScore.first << " " << spScore.second << endl; - if (spScore.second == bestSpScore) { + if (spScore.second > bestSpScore * 0.99) { maxSpecies.push_back(spScore.first); } } - // cout << "5" << endl; + // More than one species --> LCA if (maxSpecies.size() > 1) { bestScore.LCA = true; @@ -635,26 +430,21 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, float coveredLength = 0.f; int hammingDist = 0; for (auto & matchPath : species2matchPaths[maxSpecies[0]]) { - // cout << "here" << endl; coveredLength += matchPath.end - matchPath.start + 1; hammingDist += matchPath.hammingDist; - for (auto match : matchPath.matches) { - // cout << match->targetId << endl; - // match->printMatch(); - speciesMatches.push_back(*match); - // speciesMatches.back().printMatch(); + for (size_t i = speciesMatchRange[bestScore.taxId].first; i < 
speciesMatchRange[bestScore.taxId].second; i++) { + speciesMatches.push_back(matchList[i]); } } bestScore.coverage = coveredLength / (readLength1 + readLength2); bestScore.hammingDist = hammingDist; -// cout << "6" << endl; return bestScore; } float Taxonomer::combineMatchPaths(vector & matchPaths, vector & combinedMatchPaths, - int readLength, const Match * matchList) { + int readLength) { combinedMatchPaths.clear(); // Sort matchPaths by the their score sort(matchPaths.begin(), matchPaths.end(), @@ -665,35 +455,24 @@ float Taxonomer::combineMatchPaths(vector & matchPaths, // 2. Add the matchPath with the highest score that is not overlapped with the matchPath in combinedMatchPaths // 3. Repeat 2 until no matchPath can be added for (size_t i = 0; i < matchPaths.size(); i++) { - // cout << matchPaths[i].start << " " << matchPaths[i].end << " " << matchPaths[i].score << " " << matchPaths[i].matches.back()->targetId << " " << matchPaths[i].matches.back()->qInfo.frame <printMatch(); - // } } else { bool isOverlapped = false; for (size_t j = 0; j < combinedMatchPaths.size(); j++) { if (isMatchPathOverlapped(matchPaths[i], combinedMatchPaths[j])) { // overlap! if (isMatchPathLinked(matchPaths[i], combinedMatchPaths[j])) { // merge two linked matchPaths by editing the combinedMatchPaths[j] - mergeMatchPaths(matchPaths[i], combinedMatchPaths[j]); - break; + trimMatchPath(matchPaths[i], combinedMatchPaths[j]); + continue; } else { + isOverlapped = true; break; } - isOverlapped = true; } } if (!isOverlapped) { combinedMatchPaths.push_back(matchPaths[i]); - // combinedMatchPaths.back().matches = matchPaths[i].matches; - // cout << matchPaths[i].start << " " << matchPaths[i].end << " " << matchPaths[i].score << endl; - // for (auto & match : matchPaths[i].matches) { - // match->printMatch(); - // } } } } @@ -707,7 +486,7 @@ float Taxonomer::combineMatchPaths(vector & matchPaths, bool Taxonomer::isMatchPathLinked(const MatchPath & matchPath1, const MatchPath & matchPath2) { int overlappedLength = min(matchPath1.end, matchPath2.end) - max(matchPath1.start, matchPath2.start) + 1; - if (!(20 < overlappedLength && overlappedLength < 24)) { + if (20 >= overlappedLength || overlappedLength >= 24) { return false; } const Match * last; @@ -722,6 +501,7 @@ bool Taxonomer::isMatchPathLinked(const MatchPath & matchPath1, const MatchPath if (overlappedLength == 21) { return isConsecutive(last, first); } else { + return isConsecutive_diffFrame(last, first); } return false; @@ -758,114 +538,32 @@ void Taxonomer::mergeMatchPaths(const MatchPath & source, MatchPath & target) { } } - -// if (matchPath1.start > matchPath2.start) { -// return isMatchPathOverlapped(matchPath2, matchPath1, readLength); -// } -// if (matchPath1.end < matchPath2.start) { -// return false; -// } -// if (matchPath1.endPos >= matchPath2.startPos) { -// if (matchPath1.endPos <= matchPath2.endPos) { -// return true; -// } else { -// if (matchPath1.startPos + readLength - 1 >= matchPath2.startPos) { -// return true; -// } else { -// return false; -// } -// } -// } -// return false; -// } - - -// TaxonScore Taxonomer::getBestGenusMatches(vector &genusMatches, const Match *matchList, size_t end, -// size_t offset, int readLength1, int readLength2) { -// TaxID currentGenus; -// TaxID currentSpecies; - -// vector filteredMatches; -// vector> matchesForEachGenus; -// vector genusScores; -// TaxonScore bestScore; -// size_t i = offset; -// uint8_t curFrame; -// vector curFrameMatches; -// while (i < end + 1) { -// // currentGenus = 
taxId2genusId[matchList[i].targetId]; -// currentGenus = matchList[i].genusId; -// // For current genus -// while ((i < end + 1) && currentGenus == matchList[i].genusId) { -// // currentSpecies = taxId2speciesId[matchList[i].targetId]; -// currentSpecies = matchList[i].speciesId; -// // if (par.printLog) { -// // cout << currentGenus << " " << currentSpecies << endl; -// // } -// // For current species -// while ((i < end + 1) && currentSpecies == matchList[i].speciesId) { -// curFrame = matchList[i].qInfo.frame; -// curFrameMatches.clear(); - -// // For current frame -// while ((i < end + 1) && currentSpecies == matchList[i].speciesId -// && curFrame == matchList[i].qInfo.frame) { -// curFrameMatches.push_back(&matchList[i]); -// i ++; -// } -// if (curFrameMatches.size() > 1) { -// remainConsecutiveMatches(curFrameMatches, filteredMatches, currentGenus); -// } -// } -// } - -// // Construct a match combination using filtered matches of current genus -// // so that it can best cover the query, and score the combination -// if (!filteredMatches.empty()) { -// matchesForEachGenus.push_back(filteredMatches); -// genusScores.push_back(scoreTaxon(filteredMatches, currentGenus, readLength1, readLength2)); -// } -// filteredMatches.clear(); -// } - -// // If there are no meaningful genus -// if (genusScores.empty()) { -// bestScore.score = 0; -// return bestScore; -// } - -// TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), -// [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); - -// vector maxIdx; -// for (size_t g = 0; g < genusScores.size(); g++) { -// if (genusScores[g].score > maxScore.score * 0.95f) { -// maxIdx.push_back(g); -// } -// } -// bestScore = maxScore; - -// for (unsigned long g : maxIdx) { -// for (const Match * m : matchesForEachGenus[g]) { -// genusMatches.push_back(*m); -// } -// } - - - -// // More than one genus -// if (maxIdx.size() > 1) { -// bestScore.taxId = 0; -// return bestScore; -// } - -// return bestScore; - -// //Three cases -// //1. one genus -// //2. more than one genus -// //4. 
no genus -// } +void Taxonomer::trimMatchPath(MatchPath & path1, const MatchPath & path2) { + int margin = min(path1.end, path2.end) - max(path1.start, path2.start) + 1 - 21; + if (path1.start < path2.start) { + path1.end = path2.start - 1; + uint8_t lastEndHamming = GET_2_BITS(path1.matches.back()->rightEndHamming); + path1.hammingDist = path1.hammingDist - (path1.matches.back()->hamming - lastEndHamming); + path1.score = path1.score - path1.matches.back()->getScore() - margin; + if (lastEndHamming == 0) { + path1.score += 3.0f; + } else { + path1.score += 2.0f - 0.5f * lastEndHamming; + } + path1.matches.pop_back(); + } else { + path1.start = path2.end + 1; + uint8_t lastEndHamming = GET_2_BITS(path1.matches.front()->rightEndHamming >> 14); + path1.hammingDist = path1.hammingDist - (path1.matches.front()->hamming - lastEndHamming); + path1.score = path1.score - path1.matches.front()->getScore() - margin; + if (lastEndHamming == 0) { + path1.score += 3.0f; + } else { + path1.score += 2.0f - 0.5f * lastEndHamming; + } + path1.matches.erase(path1.matches.begin()); + } +} void Taxonomer::remainConsecutiveMatches(const vector & curFrameMatches, vector & matchPaths, @@ -983,39 +681,6 @@ void Taxonomer::remainConsecutiveMatches(const vector & curFrameM // } } -// size_t Taxonomer::DFS(size_t curMatchIdx, const map> & linkedMatches, -// vector& filteredMatches, size_t depth, size_t MIN_DEPTH, unordered_set& used, -// unordered_map & idx2depth) { -// depth++; -// size_t maxDepth = 0; -// size_t returnDepth = 0; -// if (linkedMatches.find(curMatchIdx) == linkedMatches.end()) { -// // reached a leaf node -// idx2depth[curMatchIdx] = depth; -// if (depth > MIN_DEPTH) { -// filteredMatches.push_back(curMatchIdx); -// } -// return depth; -// } else { // not a leaf node -// for (auto &nextMatchIdx: linkedMatches.at(curMatchIdx)) { -// used.insert(nextMatchIdx); -// if (idx2depth.find(nextMatchIdx) != idx2depth.end()) { -// returnDepth = idx2depth[nextMatchIdx]; -// maxDepth = max(maxDepth, returnDepth); -// continue; -// } -// returnDepth = DFS(nextMatchIdx, linkedMatches, filteredMatches, depth, MIN_DEPTH, used, idx2depth); -// maxDepth = max(maxDepth, returnDepth); -// } -// if (maxDepth > MIN_DEPTH) { -// filteredMatches.push_back(curMatchIdx); -// idx2depth[curMatchIdx] = maxDepth; -// } -// } -// return maxDepth; -// } - -// return: end depthScore Taxonomer::DFS(const vector &matches, size_t curMatchIdx, const map> &linkedMatches, @@ -1069,304 +734,6 @@ depthScore Taxonomer::DFS(const vector &matches, return bestDepthScore; } -// TaxonScore Taxonomer::getBestGenusMatches_spaced(vector &genusMatches, const Match *matchList, size_t end, -// size_t offset, int readLength1, int readLength2) { -// TaxID currentGenus; -// TaxID currentSpecies; - -// vector tempMatchContainer; -// vector filteredMatches; -// vector> matchesForEachGenus; -// vector conservedWithinGenus; -// vector genusScores; -// TaxonScore bestScore; -// size_t i = offset; -// bool lastIn; -// while (i + 1 < end + 1) { -// currentGenus = matchList[i].genusId; -// // For current genus -// while ((i + 1 < end + 1) && currentGenus == matchList[i].genusId) { -// // currentSpecies = taxId2speciesId[matchList[i].targetId]; -// currentSpecies = matchList[i].speciesId; -// // For current species -// // Filter un-consecutive matches (probably random matches) -// lastIn = false; -// int distance = 0; -// int diffPosCntOfCurrRange = 1; -// int dnaDist = 0; - -// // For the same species -// while ((i + 1 < end + 1) && currentSpecies == matchList[i 
+ 1].speciesId) { -// distance = matchList[i+1].qInfo.pos / 3 - matchList[i].qInfo.pos / 3; -// dnaDist = matchList[i+1].qInfo.pos - matchList[i].qInfo.pos; -// if (distance == 0) { // At the same position -// tempMatchContainer.push_back(matchList + i); -// } else if (dnaDist < (8 + spaceNum + maxGap) * 3) { // Overlapping -// lastIn = true; -// tempMatchContainer.push_back(matchList + i); -// diffPosCntOfCurrRange ++; -// } else { // Not consecutive --> End range -// if (lastIn){ -// tempMatchContainer.push_back(matchList + i); -// if (diffPosCntOfCurrRange >= minCoveredPos) { -// filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), -// tempMatchContainer.end()); -// } -// } -// lastIn = false; -// // Initialize range info -// tempMatchContainer.clear(); -// diffPosCntOfCurrRange = 1; -// } -// i++; -// } - -// // Met next species -// if (lastIn) { -// tempMatchContainer.push_back(matchList + i); -// if (diffPosCntOfCurrRange >= minCoveredPos) { -// filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), -// tempMatchContainer.end()); -// } -// } -// tempMatchContainer.clear(); -// i++; -// } - -// // Construct a match combination using filtered matches of current genus -// // so that it can best cover the query, and score the combination -// if (!filteredMatches.empty()) { -// genusScores.push_back(scoreTaxon(filteredMatches, readLength1, readLength2)); -// } -// filteredMatches.clear(); -// } - -// // If there are no meaningful genus -// if (genusScores.empty()) { -// bestScore.score = 0; -// return bestScore; -// } - -// TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), -// [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); - -// vector maxIdx; -// for (size_t g = 0; g < genusScores.size(); g++) { -// if (genusScores[g].score > maxScore.score * 0.95f) { -// maxIdx.push_back(g); -// } -// } -// bestScore = maxScore; - -// for (unsigned long g : maxIdx) { -// for (const Match * m : matchesForEachGenus[g]) { -// genusMatches.push_back(*m); -// } -// } - -// // More than one genus -// if (maxIdx.size() > 1) { -// bestScore.taxId = 0; -// return bestScore; -// } -// return bestScore; - -// //Three cases -// //1. one genus -// //2. more than one genus -// //4. 
no genus -// } - -// TaxonScore Taxonomer::getBestGenusMatches(vector &genusMatches, const Match *matchList, size_t end, -// size_t offset, int queryLength) { -// TaxID currentGenus; -// TaxID currentSpecies; - -// vector filteredMatches; -// vector> matchesForEachGenus; -// vector genusScores; -// TaxonScore bestScore; -// size_t i = offset; -// uint8_t curFrame; -// vector curFrameMatches; -// while (i < end + 1) { -// currentGenus = matchList[i].genusId; -// // For current genus -// while ((i < end + 1) && currentGenus == matchList[i].genusId) { -// currentSpecies = matchList[i].speciesId; - -// // For current species -// while ((i < end + 1) && currentSpecies == matchList[i].speciesId) { -// curFrame = matchList[i].qInfo.frame; -// curFrameMatches.clear(); - -// // For current frame -// while ((i < end + 1) && currentSpecies == matchList[i].speciesId -// && curFrame == matchList[i].qInfo.frame) { -// curFrameMatches.push_back(&matchList[i]); -// i ++; -// } -// if (curFrameMatches.size() > 1) { -// remainConsecutiveMatches(curFrameMatches, filteredMatches, currentGenus); -// } -// } -// } - -// // Construct a match combination using filtered matches of current genus -// // so that it can best cover the query, and score the combination - -// if (!filteredMatches.empty()) { -// matchesForEachGenus.push_back(filteredMatches); -// genusScores.push_back(scoreTaxon(filteredMatches, currentGenus, queryLength)); -// } -// filteredMatches.clear(); -// } - -// // If there are no meaningful genus -// if (genusScores.empty()) { -// bestScore.score = 0; -// return bestScore; -// } - -// TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), -// [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); - -// vector maxIdx; -// for (size_t g = 0; g < genusScores.size(); g++) { -// if (genusScores[g].score > maxScore.score * 0.95f) { -// maxIdx.push_back(g); -// } -// } - -// bestScore = maxScore; - -// for (unsigned long g : maxIdx) { -// for (const Match * m : matchesForEachGenus[g]) { -// genusMatches.push_back(*m); -// } -// } - -// // More than one genus -// if (maxIdx.size() > 1) { -// bestScore.taxId = 0; -// return bestScore; -// } -// return bestScore; - -// //Three cases -// //1. one genus -// //2. more than one genus -// //4. 
no genus -// } - -// TaxonScore Taxonomer::getBestGenusMatches_spaced(vector &genusMatches, const Match *matchList, size_t end, -// size_t offset, int readLength) { -// TaxID currentGenus; -// TaxID currentSpecies; - -// vector tempMatchContainer; -// vector filteredMatches; -// vector> matchesForEachGenus; -// vector conservedWithinGenus; -// vector genusScores; -// TaxonScore bestScore; -// size_t i = offset; -// bool lastIn; -// size_t speciesMatchCnt; -// while (i + 1 < end + 1) { -// currentGenus = matchList[i].genusId; -// // For current genus -// while ((i + 1 < end + 1) && currentGenus == matchList[i].genusId) { -// currentSpecies = matchList[i].speciesId; -// // For current species -// // Filter un-consecutive matches (probably random matches) -// lastIn = false; -// int distance = 0; -// int diffPosCntOfCurrRange = 1; -// int dnaDist = 0; - -// // For the same species -// while ((i + 1 < end + 1) && currentSpecies == matchList[i + 1].speciesId) { -// distance = matchList[i + 1].qInfo.pos / 3 - matchList[i].qInfo.pos / 3; -// dnaDist = matchList[i + 1].qInfo.pos - matchList[i].qInfo.pos; -// if (distance == 0) { // At the same position -// tempMatchContainer.push_back(matchList + i); -// } else if (dnaDist < (8 + spaceNum + maxGap) * 3) { // Overlapping -// lastIn = true; -// tempMatchContainer.push_back(matchList + i); -// diffPosCntOfCurrRange++; -// } else { // Not consecutive --> End range -// if (lastIn) { -// tempMatchContainer.push_back(matchList + i); -// if (diffPosCntOfCurrRange >= minCoveredPos) { -// filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), -// tempMatchContainer.end()); -// } -// } -// lastIn = false; -// // Initialize range info -// tempMatchContainer.clear(); -// diffPosCntOfCurrRange = 1; -// } -// i++; -// } - -// // Met next species -// if (lastIn) { -// tempMatchContainer.push_back(matchList + i); -// if (diffPosCntOfCurrRange >= minCoveredPos) { -// filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), -// tempMatchContainer.end()); -// } -// } -// tempMatchContainer.clear(); -// i++; -// } - -// // Construct a match combination using filtered matches of current genus -// // so that it can best cover the query, and score the combination -// if (!filteredMatches.empty()) { -// genusScores.push_back(scoreTaxon(filteredMatches, readLength)); -// } -// filteredMatches.clear(); -// } - -// // If there are no meaningful genus -// if (genusScores.empty()) { -// bestScore.score = 0; -// return bestScore; -// } - -// TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), -// [](const TaxonScore &a, const TaxonScore &b) { return a.score < b.score; }); - -// vector maxIdx; -// for (size_t g = 0; g < genusScores.size(); g++) { -// if (genusScores[g].score > maxScore.score * 0.95f) { -// maxIdx.push_back(g); -// } -// } -// bestScore = maxScore; - -// for (unsigned long g: maxIdx) { -// genusMatches.insert(genusMatches.end(), -// matchesForEachGenus[g].begin(), -// matchesForEachGenus[g].end()); -// } - -// // More than one genus -// if (maxIdx.size() > 1) { -// bestScore.taxId = 0; -// return bestScore; -// } -// return bestScore; - -// //Three cases -// //1. one genus -// //2. more than one genus -// //4. 
no genus -// } - TaxonScore Taxonomer::scoreTaxon(vector &filteredMatches, TaxID taxId, int queryLength) { @@ -1730,6 +1097,8 @@ bool Taxonomer::isConsecutive(const Match * match1, const Match * match2) { bool Taxonomer::isConsecutive_diffFrame(const Match * match1, const Match * match2) { // int hamming1 = match1->hamming - GET_2_BITS(match1->rightEndHamming); // int hamming2 = match2->hamming - GET_2_BITS(match2->rightEndHamming >> 14); + // cout << match1->rightEndHamming << " " << match2->rightEndHamming << endl; + // cout << hamming1 << " " << hamming2 << endl; // match1 87654321 -> 08765432 // match2 98765432 -> 08765432 return (match1->hamming - GET_2_BITS(match1->rightEndHamming)) == (match2->hamming - GET_2_BITS(match2->rightEndHamming >> 14)); diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h index 722ced8b..ee131df1 100644 --- a/src/commons/Taxonomer.h +++ b/src/commons/Taxonomer.h @@ -85,13 +85,6 @@ class Taxonomer { const Match *matchList, vector & queryList, const LocalParameters &par); - - void chooseBestTaxon2(uint32_t currentQuery, - size_t offset, - size_t end, - const Match *matchList, - vector & queryList, - const LocalParameters &par); void remainConsecutiveMatches(const vector & curFrameMatches, vector & matchPaths, @@ -99,7 +92,7 @@ class Taxonomer { float combineMatchPaths(vector & matchPaths, vector & combinedMatchPaths, - int readLength, const Match * matchList); + int readLength); bool isMatchPathOverlapped(const MatchPath & matchPath1, const MatchPath & matchPath2); @@ -107,6 +100,8 @@ class Taxonomer { void mergeMatchPaths(const MatchPath & source, MatchPath & target); + void trimMatchPath(MatchPath & path1, const MatchPath & path2); + depthScore DFS(const vector &matches, size_t curMatchIdx, const map> &linkedMatches, size_t depth, size_t MIN_DEPTH, unordered_set &used, From ba7375ea3e856398fb8e62616353ef781f8389ed Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Thu, 9 Nov 2023 17:10:46 +0900 Subject: [PATCH 59/65] undo changes in getExtendedORFs function --- src/commons/SeqIterator.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/commons/SeqIterator.cpp b/src/commons/SeqIterator.cpp index 46cf14a8..6d4cdd24 100644 --- a/src/commons/SeqIterator.cpp +++ b/src/commons/SeqIterator.cpp @@ -539,11 +539,11 @@ void SeqIterator::getExtendedORFs(struct _gene *genes, struct _node *nodes, vect frame = (genes[0].begin - 1) % 3; leftEnd = 0; while (leftEnd % 3 != frame) leftEnd++; - blocks.emplace_back(leftEnd, genes[1].begin - 2, 1); + blocks.emplace_back(leftEnd, genes[1].begin - 1 + 22, 1); blockIdx++; } else { frame = (genes[0].end - 1) % 3; - rightEnd = genes[1].begin - 2; + rightEnd = genes[1].begin - 1 + 22; while (rightEnd % 3 != frame) rightEnd--; blocks.emplace_back(0, rightEnd, -1); blockIdx++; @@ -583,12 +583,12 @@ void SeqIterator::getExtendedORFs(struct _gene *genes, struct _node *nodes, vect } else { if (!isReverse) { //forward frame = (genes[geneIdx].begin - 1) % 3; - leftEnd = genes[geneIdx - 1].end; + leftEnd = genes[geneIdx - 1].end -1 -22; while (leftEnd % 3 != frame) leftEnd++; blocks.emplace_back(leftEnd, genes[geneIdx].end - 1, 1); blockIdx++; } else { // reverse - blocks.emplace_back(genes[geneIdx - 1].end, genes[geneIdx].end - 1, -1); + blocks.emplace_back(genes[geneIdx - 1].end - 22 - 1, genes[geneIdx].end - 1, -1); blockIdx++; } } @@ -597,24 +597,24 @@ void SeqIterator::getExtendedORFs(struct _gene *genes, struct _node *nodes, vect if (hasBeenExtendedToLeft) { if (!isReverse) { 
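            // (Recurring idiom in getExtendedORFs, visible below and throughout this
            //  hunk: before a block is emitted, its boundary is nudged one base at a
            //  time, leftEnd++ or rightEnd--, until pos % 3 == frame, i.e. the block
            //  is snapped onto the gene's reading frame.)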
//forward frame = (genes[geneIdx].begin - 1) % 3; - leftEnd = genes[geneIdx - 1].end; + leftEnd = genes[geneIdx - 1].end - 1 - 22; while (leftEnd % 3 != frame) leftEnd++; - blocks.emplace_back(leftEnd, genes[geneIdx + 1].begin - 2, 1); + blocks.emplace_back(leftEnd, genes[geneIdx + 1].begin - 1 + 22, 1); blockIdx++; } else { frame = (genes[geneIdx].end - 1) % 3; - rightEnd = genes[geneIdx + 1].begin - 2; + rightEnd = genes[geneIdx + 1].begin - 1 + 22; while (rightEnd % 3 != frame) rightEnd--; - blocks.emplace_back(genes[geneIdx - 1].end, rightEnd, -1); + blocks.emplace_back(genes[geneIdx - 1].end - 1 - 22, rightEnd, -1); blockIdx++; } } else { if (!isReverse) { //forward - blocks.emplace_back(genes[geneIdx].begin - 1, genes[geneIdx + 1].begin - 2, 1); + blocks.emplace_back(genes[geneIdx].begin - 1, genes[geneIdx + 1].begin - 1 + 22, 1); blockIdx++; } else { frame = (genes[geneIdx].end - 1) % 3; - rightEnd = genes[geneIdx + 1].begin - 2; + rightEnd = genes[geneIdx + 1].begin - 1 + 22; while (rightEnd % 3 != frame) rightEnd--; blocks.emplace_back(genes[geneIdx].begin - 1, rightEnd, -1); blockIdx++; @@ -639,7 +639,7 @@ void SeqIterator::getExtendedORFs(struct _gene *genes, struct _node *nodes, vect // If left region is not covered, cover it. leftEnd = genes[numOfGene - 1].begin - 1; if (hasBeenExtendedToLeft) { - leftEnd = genes[numOfGene - 2].end; + leftEnd = genes[numOfGene - 2].end - 1 - 22; if (!isReverse) { frame = (genes[numOfGene - 1].begin - 1) % 3; while (leftEnd % 3 != frame) leftEnd++; From cfb163522a7c9cef1a3138536ac0fd83741ccc00 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Sun, 12 Nov 2023 15:21:06 +0900 Subject: [PATCH 60/65] error in subspecies level classification --- src/commons/QueryIndexer.cpp | 10 +-- src/commons/Taxonomer.cpp | 147 ++++++++++++++++++++++------------- src/commons/Taxonomer.h | 11 ++- 3 files changed, 105 insertions(+), 63 deletions(-) diff --git a/src/commons/QueryIndexer.cpp b/src/commons/QueryIndexer.cpp index 66ec70a4..4fba5072 100644 --- a/src/commons/QueryIndexer.cpp +++ b/src/commons/QueryIndexer.cpp @@ -51,11 +51,11 @@ void QueryIndexer::indexQueryFile() { } querySplits.emplace_back(start, readNum_1, kmerCnt); // Print elements - for (auto & querySplit : querySplits) { - std::cout << "start: " << querySplit.start << "\t"; - std::cout << "end: " << querySplit.end << "\t"; - std::cout << "kmerCnt: " << querySplit.kmerCnt << "\n"; - } + // for (auto & querySplit : querySplits) { + // std::cout << "start: " << querySplit.start << "\t"; + // std::cout << "end: " << querySplit.end << "\t"; + // std::cout << "kmerCnt: " << querySplit.kmerCnt << "\n"; + // } delete kseq; } else { KSeqWrapper* kseq_1 = KSeqFactory(queryPath_1.c_str()); diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index c0760d8c..4b0f230f 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -91,7 +91,7 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, // } // } // Get the best species for current query - vector speciesMatches; + vector speciesMatches; speciesMatches.reserve(end - offset + 1); TaxonScore speciesScore(0, 0, 0, 0, 0); if (par.seqMode == 2) { @@ -132,6 +132,11 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, return; } + // Filter redundant matches + vector filteredMatches; + unordered_map taxCnt; + filterRedundantMatches(speciesMatches, filteredMatches, taxCnt); + // If score is not enough, classify to the parent of the selected species if (speciesScore.score < par.minSpScore) { queryList[currentQuery].isClassified = 
true; @@ -140,21 +145,18 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, queryList[currentQuery].score = speciesScore.score; queryList[currentQuery].coverage = speciesScore.coverage; queryList[currentQuery].hammingDist = speciesScore.hammingDist; - for (auto & spMatch : speciesMatches) { - queryList[currentQuery].taxCnt[spMatch.targetId]++; + for (auto spMatch : filteredMatches) { + queryList[currentQuery].taxCnt[spMatch->targetId]++; } return; } - // Sort matches by the coordinate of the query - sort(speciesMatches.begin(), speciesMatches.end(), - [](const Match & a, const Match & b) { return a.qInfo.pos < b.qInfo.pos; }); - - TaxID result = lowerRankClassification(speciesMatches, speciesScore.taxId); + // Lower rank classification + TaxID result = lowerRankClassification(taxCnt, speciesScore.taxId); // Record matches of selected species - for (auto & spMatch : speciesMatches) { - queryList[currentQuery].taxCnt[spMatch.targetId]++; + for (auto & spMatch : filteredMatches) { + queryList[currentQuery].taxCnt[spMatch->targetId]++; } // Store classification results @@ -177,31 +179,59 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, // } } -TaxID Taxonomer::lowerRankClassification(vector &matches, TaxID spTaxId) { - unordered_map taxCnt; - size_t matchNum = matches.size(); - +void Taxonomer::filterRedundantMatches(vector & speciesMatches, + vector & filteredMatches, + unordered_map & taxCnt) { + filteredMatches.reserve(speciesMatches.size()); + // Sort matches by the coordinate on the query + sort(speciesMatches.begin(), speciesMatches.end(), + [](const Match * a, const Match * b) { return a->qInfo.pos < b->qInfo.pos; }); + + // Remove redundant matches + size_t matchNum = speciesMatches.size(); for (size_t i = 0; i < matchNum; i++) { - // cout << matches[i].targetId << endl; - // taxCnt[matches[i].targetId] ++; - size_t currQuotient = matches[i].qInfo.pos / 3; - uint8_t minHamming = matches[i].hamming; - Match * minHammingMatch = & matches[i]; + size_t currQuotient = speciesMatches[i]->qInfo.pos / 3; + uint8_t minHamming = speciesMatches[i]->hamming; + const Match * minHammingMatch = speciesMatches[i]; TaxID minHammingTaxId = minHammingMatch->targetId; - while ((i < matchNum) && (currQuotient == matches[i].qInfo.pos / 3)) { - if (matches[i].hamming < minHamming) { - minHamming = matches[i].hamming; - minHammingMatch = & matches[i]; + while ((i < matchNum) && (currQuotient == speciesMatches[i]->qInfo.pos / 3)) { + if (speciesMatches[i]->hamming < minHamming) { + minHamming = speciesMatches[i]->hamming; + minHammingMatch = speciesMatches[i]; minHammingTaxId = minHammingMatch->targetId; - } else if (matches[i].hamming == minHamming) { - minHammingTaxId = taxonomy->LCA(minHammingTaxId, matches[i].targetId); - minHammingMatch->redundancy = true; - matches[i].redundancy = true; + } else if (speciesMatches[i]->hamming == minHamming) { + minHammingTaxId = taxonomy->LCA(minHammingTaxId, speciesMatches[i]->targetId); } i++; } - taxCnt[minHammingTaxId]++; + filteredMatches.push_back(&*minHammingMatch); + taxCnt[minHammingTaxId]++; } +} + +TaxID Taxonomer::lowerRankClassification(const unordered_map & taxCnt, TaxID spTaxId) { + // size_t matchNum = matches.size(); + // for (size_t i = 0; i < matchNum; i++) { + // // cout << matches[i].targetId << endl; + // // taxCnt[matches[i].targetId] ++; + // size_t currQuotient = matches[i].qInfo.pos / 3; + // uint8_t minHamming = matches[i].hamming; + // Match * minHammingMatch = & matches[i]; + // TaxID minHammingTaxId = 
minHammingMatch->targetId; + // while ((i < matchNum) && (currQuotient == matches[i].qInfo.pos / 3)) { + // if (matches[i].hamming < minHamming) { + // minHamming = matches[i].hamming; + // minHammingMatch = & matches[i]; + // minHammingTaxId = minHammingMatch->targetId; + // } else if (matches[i].hamming == minHamming) { + // minHammingTaxId = taxonomy->LCA(minHammingTaxId, matches[i].targetId); + // minHammingMatch->redundancy = true; + // matches[i].redundancy = true; + // } + // i++; + // } + // taxCnt[minHammingTaxId]++; + // } unordered_map cladeCnt; getSpeciesCladeCounts(taxCnt, cladeCnt, spTaxId); @@ -265,7 +295,7 @@ TaxID Taxonomer::BFS(const unordered_map & cladeCnt, TaxID r } } -TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, +TaxonScore Taxonomer::getBestSpeciesMatches(vector & speciesMatches, const Match *matchList, size_t end, size_t offset, @@ -319,7 +349,7 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, vector maxSpecies; for (auto & spScore : species2score) { - if (spScore.second > bestSpScore * 0.99) { + if (spScore.second > bestSpScore * 0.95) { maxSpecies.push_back(spScore.first); } } @@ -343,9 +373,12 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, for (auto & matchPath : species2matchPaths[maxSpecies[0]]) { coveredLength += matchPath.end - matchPath.start + 1; hammingDist += matchPath.hammingDist; - for (size_t i = speciesMatchRange[bestScore.taxId].first; i < speciesMatchRange[bestScore.taxId].second; i++) { - speciesMatches.push_back(matchList[i]); - } + } + speciesMatches.reserve(speciesMatchRange[bestScore.taxId].second + - speciesMatchRange[bestScore.taxId].first + 1); + + for (size_t j = speciesMatchRange[bestScore.taxId].first; j < speciesMatchRange[bestScore.taxId].second; j++) { + speciesMatches.push_back(& matchList[j]); } bestScore.coverage = coveredLength / queryLength; bestScore.hammingDist = hammingDist; @@ -353,7 +386,7 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, return bestScore; } -TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, +TaxonScore Taxonomer::getBestSpeciesMatches(vector & speciesMatches, const Match *matchList, size_t end, size_t offset, @@ -408,7 +441,7 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, vector maxSpecies; for (auto & spScore : species2score) { - if (spScore.second > bestSpScore * 0.99) { + if (spScore.second > bestSpScore * 0.95) { maxSpecies.push_back(spScore.first); } } @@ -432,9 +465,12 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, for (auto & matchPath : species2matchPaths[maxSpecies[0]]) { coveredLength += matchPath.end - matchPath.start + 1; hammingDist += matchPath.hammingDist; - for (size_t i = speciesMatchRange[bestScore.taxId].first; i < speciesMatchRange[bestScore.taxId].second; i++) { - speciesMatches.push_back(matchList[i]); - } + } + speciesMatches.reserve(speciesMatchRange[bestScore.taxId].second + - speciesMatchRange[bestScore.taxId].first + 1); + + for (size_t i = speciesMatchRange[bestScore.taxId].first; i < speciesMatchRange[bestScore.taxId].second; i++) { + speciesMatches.push_back(&matchList[i]); } bestScore.coverage = coveredLength / (readLength1 + readLength2); bestScore.hammingDist = hammingDist; @@ -550,7 +586,7 @@ void Taxonomer::trimMatchPath(MatchPath & path1, const MatchPath & path2) { } else { path1.score += 2.0f - 0.5f * lastEndHamming; } - path1.matches.pop_back(); + // path1.matches.pop_back(); } else { path1.start = path2.end + 1; 
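                // (In this branch of trimMatchPath, path1 loses the overlap and is
                //  re-anchored just after path2. As defined where trimMatchPath was
                //  introduced earlier in this series: the front match's full
                //  getScore() and the overlap margin are subtracted, and only its
                //  last codon is re-credited, +3.0f for hamming 0 and otherwise
                //  2.0f - 0.5f * h, with h read from the top two bits of
                //  rightEndHamming (>> 14); hammingDist likewise keeps only that
                //  codon's mismatches.)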
uint8_t lastEndHamming = GET_2_BITS(path1.matches.front()->rightEndHamming >> 14);
@@ -561,7 +597,7 @@ void Taxonomer::trimMatchPath(MatchPath & path1, const MatchPath & path2) {
         } else {
             path1.score += 2.0f - 0.5f * lastEndHamming;
         }
-        path1.matches.erase(path1.matches.begin());
+        // path1.matches.erase(path1.matches.begin());
     }
 }
 
@@ -691,6 +727,8 @@ depthScore Taxonomer::DFS(const vector &matches,
     depth++;
     depthScore bestDepthScore = depthScore(0, 0, 0);
     depthScore returnDepthScore;
+    depthScore curDepthScore;
+    float receivedScore = score;
     if (linkedMatches.find(curMatchIdx) == linkedMatches.end()) { // reached a leaf node
         uint8_t lastEndHamming = (matches[curMatchIdx]->rightEndHamming >> 14);
         if (lastEndHamming == 0) {
@@ -698,7 +736,7 @@ depthScore Taxonomer::DFS(const vector &matches,
         } else {
             score += 2.0f - 0.5f * lastEndHamming;
         }
-        idx2depthScore[curMatchIdx] = depthScore(depth, score, hammingDist + lastEndHamming);
+        idx2depthScore[curMatchIdx] = depthScore(1, score - receivedScore, lastEndHamming);
         return depthScore(depth, score, hammingDist + lastEndHamming);
     } else { // not a leaf node
         uint8_t lastEndHamming = (matches[curMatchIdx]->rightEndHamming >> 14);
@@ -709,26 +747,25 @@ depthScore Taxonomer::DFS(const vector &matches,
         }
         for (auto &nextMatchIdx: linkedMatches.at(curMatchIdx)) {
             used.insert(nextMatchIdx);
-            // Reuse the depth score of nextMatchIdx if it has been calculated
-            if (idx2depthScore.find(nextMatchIdx) != idx2depthScore.end()) {
+            if (idx2depthScore.find(nextMatchIdx) != idx2depthScore.end()){
                 returnDepthScore = idx2depthScore[nextMatchIdx];
-                if (returnDepthScore.score > bestDepthScore.score
-                    && returnDepthScore.depth > MIN_DEPTH) {
-                    bestDepthScore = returnDepthScore;
-                    edges[matches[curMatchIdx]] = matches[nextMatchIdx];
-                }
-                continue;
+                curDepthScore = depthScore(returnDepthScore.depth + depth,
+                                           returnDepthScore.score + score,
+                                           returnDepthScore.hammingDist + hammingDist + lastEndHamming);
+            } else {
+                curDepthScore = DFS(matches, nextMatchIdx, linkedMatches, depth, MIN_DEPTH, used, idx2depthScore, edges, score, hammingDist + lastEndHamming);
             }
-            returnDepthScore = DFS(matches, nextMatchIdx, linkedMatches, depth, MIN_DEPTH, used, idx2depthScore, edges, score, hammingDist + lastEndHamming);
-            if (returnDepthScore.score > bestDepthScore.score
-                && returnDepthScore.depth > MIN_DEPTH) {
-                bestDepthScore = returnDepthScore;
+            if (curDepthScore.score > bestDepthScore.score
+                && curDepthScore.depth > MIN_DEPTH) {
+                bestDepthScore = curDepthScore;
                 edges[matches[curMatchIdx]] = matches[nextMatchIdx];
-            }
+            }
         }
         if (bestDepthScore.depth > MIN_DEPTH) {
-            idx2depthScore[curMatchIdx] = bestDepthScore;
+            idx2depthScore[curMatchIdx] = depthScore(bestDepthScore.depth - depth + 1,
+                                                     bestDepthScore.score - receivedScore,
+                                                     bestDepthScore.hammingDist - hammingDist);
         }
     }
     return bestDepthScore;
diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h
index ee131df1..f5810e43 100644
--- a/src/commons/Taxonomer.h
+++ b/src/commons/Taxonomer.h
@@ -5,6 +5,7 @@
 #include "Match.h"
 #include "common.h"
 #include "BitManipulateMacros.h"
+#include
 #include
 
 using namespace std;
@@ -102,6 +103,10 @@ class Taxonomer {
 
     void trimMatchPath(MatchPath & path1, const MatchPath & path2);
 
+    void filterRedundantMatches(vector & matchPaths,
+                                vector & filteredMatches,
+                                unordered_map & taxCnt);
+
     depthScore DFS(const vector &matches, size_t curMatchIdx, const map> &linkedMatches,
                    size_t depth, size_t MIN_DEPTH, unordered_set &used,
@@ -124,10 +129,10 @@ class Taxonomer {
     TaxonScore
getBestGenusMatches(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, int readLength1, int readLength2); - TaxonScore getBestSpeciesMatches(vector &matchesForMajorityLCA, const Match *matchList, size_t end, + TaxonScore getBestSpeciesMatches(vector &speciesMatches, const Match *matchList, size_t end, size_t offset, int queryLength); - TaxonScore getBestSpeciesMatches(vector &matchesForMajorityLCA, const Match *matchList, size_t end, + TaxonScore getBestSpeciesMatches(vector &speciesMatches, const Match *matchList, size_t end, size_t offset, int readLength1, int readLength2); // TaxonScore getBestGenusMatches_spaced(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, @@ -171,7 +176,7 @@ class Taxonomer { int queryLength, int queryLength2); - TaxID lowerRankClassification(vector &matches, TaxID speciesID); + TaxID lowerRankClassification(const unordered_map & matches, TaxID speciesID); void getSpeciesCladeCounts(const unordered_map & taxCnt, unordered_map & cladeCnt, From a0a2f31ca2613cc90e349ad5c2ac5d9e480f309d Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Sun, 12 Nov 2023 16:51:31 +0900 Subject: [PATCH 61/65] removed a bug in filterRedundantMatches --- src/commons/Taxonomer.cpp | 31 +++++++++++-------------------- src/commons/Taxonomer.h | 9 ++++----- 2 files changed, 15 insertions(+), 25 deletions(-) diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index 4b0f230f..bac23385 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -133,9 +133,9 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, } // Filter redundant matches - vector filteredMatches; - unordered_map taxCnt; - filterRedundantMatches(speciesMatches, filteredMatches, taxCnt); + // vector filteredMatches; + // cout << "# " << currentQuery << " " << queryList[currentQuery].name << " filtered" << endl; + filterRedundantMatches(speciesMatches, queryList[currentQuery].taxCnt); // If score is not enough, classify to the parent of the selected species if (speciesScore.score < par.minSpScore) { @@ -145,19 +145,11 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, queryList[currentQuery].score = speciesScore.score; queryList[currentQuery].coverage = speciesScore.coverage; queryList[currentQuery].hammingDist = speciesScore.hammingDist; - for (auto spMatch : filteredMatches) { - queryList[currentQuery].taxCnt[spMatch->targetId]++; - } return; } // Lower rank classification - TaxID result = lowerRankClassification(taxCnt, speciesScore.taxId); - - // Record matches of selected species - for (auto & spMatch : filteredMatches) { - queryList[currentQuery].taxCnt[spMatch->targetId]++; - } + TaxID result = lowerRankClassification(queryList[currentQuery].taxCnt, speciesScore.taxId); // Store classification results queryList[currentQuery].isClassified = true; @@ -180,16 +172,14 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, } void Taxonomer::filterRedundantMatches(vector & speciesMatches, - vector & filteredMatches, - unordered_map & taxCnt) { - filteredMatches.reserve(speciesMatches.size()); + map & taxCnt) { // Sort matches by the coordinate on the query sort(speciesMatches.begin(), speciesMatches.end(), [](const Match * a, const Match * b) { return a->qInfo.pos < b->qInfo.pos; }); // Remove redundant matches size_t matchNum = speciesMatches.size(); - for (size_t i = 0; i < matchNum; i++) { + for (size_t i = 0; i < matchNum;) { size_t currQuotient = speciesMatches[i]->qInfo.pos / 3; uint8_t minHamming = 
speciesMatches[i]->hamming; const Match * minHammingMatch = speciesMatches[i]; @@ -204,12 +194,13 @@ void Taxonomer::filterRedundantMatches(vector & speciesMatches, } i++; } - filteredMatches.push_back(&*minHammingMatch); + // cout << minHammingMatch->targetId << " " << minHammingMatch->qInfo.frame << " " << minHammingMatch->qInfo.pos << " " << int(minHammingMatch->hamming) << " " << int(minHammingMatch->redundancy) << endl; + taxCnt[minHammingTaxId]++; } } -TaxID Taxonomer::lowerRankClassification(const unordered_map & taxCnt, TaxID spTaxId) { +TaxID Taxonomer::lowerRankClassification(const map & taxCnt, TaxID spTaxId) { // size_t matchNum = matches.size(); // for (size_t i = 0; i < matchNum; i++) { // // cout << matches[i].targetId << endl; @@ -252,7 +243,7 @@ TaxID Taxonomer::lowerRankClassification(const unordered_map &taxCnt, +void Taxonomer::getSpeciesCladeCounts(const map &taxCnt, unordered_map & cladeCount, TaxID speciesTaxID) { for (auto it = taxCnt.begin(); it != taxCnt.end(); ++it) { @@ -586,7 +577,7 @@ void Taxonomer::trimMatchPath(MatchPath & path1, const MatchPath & path2) { } else { path1.score += 2.0f - 0.5f * lastEndHamming; } - // path1.matches.pop_back(); + path1.matches.pop_back(); } else { path1.start = path2.end + 1; uint8_t lastEndHamming = GET_2_BITS(path1.matches.front()->rightEndHamming >> 14); diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h index f5810e43..23f23b3b 100644 --- a/src/commons/Taxonomer.h +++ b/src/commons/Taxonomer.h @@ -103,9 +103,8 @@ class Taxonomer { void trimMatchPath(MatchPath & path1, const MatchPath & path2); - void filterRedundantMatches(vector & matchPaths, - vector & filteredMatches, - unordered_map & taxCnt); + void filterRedundantMatches(vector & speciesMatches, + map & taxCnt); depthScore DFS(const vector &matches, size_t curMatchIdx, const map> &linkedMatches, @@ -176,9 +175,9 @@ class Taxonomer { int queryLength, int queryLength2); - TaxID lowerRankClassification(const unordered_map & matches, TaxID speciesID); + TaxID lowerRankClassification(const map & matches, TaxID speciesID); - void getSpeciesCladeCounts(const unordered_map & taxCnt, + void getSpeciesCladeCounts(const map & taxCnt, unordered_map & cladeCnt, TaxID spciesID); From eae409a9ec8e1314cd791d934dce25d7fc4445ac Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Mon, 13 Nov 2023 14:46:57 +0900 Subject: [PATCH 62/65] an error found in combineMatchPaths() --- src/commons/KmerMatcher.cpp | 2 +- src/commons/LocalParameters.cpp | 2 +- src/commons/Taxonomer.cpp | 56 +++++++++++++++++++-------------- src/commons/Taxonomer.h | 10 ++---- src/workflow/classify.cpp | 1 - 5 files changed, 36 insertions(+), 35 deletions(-) diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp index 084b919b..248539fd 100644 --- a/src/commons/KmerMatcher.cpp +++ b/src/commons/KmerMatcher.cpp @@ -513,7 +513,7 @@ void KmerMatcher::compareDna(uint64_t query, // Select target k-mers that passed hamming criteria for (size_t h = 0; h < size; h++) { - if (hammingSums[h] <= min(minHammingSum * 2, 6)) { + if (hammingSums[h] <= min(minHammingSum * 2, 8)) { selectedMatches.push_back(h); selectedHammingSum.push_back(hammingSums[h]); if (frame < 3) { diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index dc9ffe04..d4f8c2e9 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -307,7 +307,7 @@ LocalParameters::LocalParameters() : classify.push_back(&RAM_USAGE); classify.push_back(&MATCH_PER_KMER); 
classify.push_back(&ACCESSION_LEVEL); - classify.push_back(&MIN_SS_MATCH); + // classify.push_back(&MIN_SS_MATCH); // filter filter.push_back(&PARAM_THREADS); diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index bac23385..ac2ad621 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -84,12 +84,12 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, vector & queryList, const LocalParameters &par) { -// if (true) { -// cout << "# " << currentQuery << " " << queryList[currentQuery].name << endl; -// for (size_t i = offset; i < end + 1; i++) { -// cout << matchList[i].targetId << " " << matchList[i].qInfo.frame << " " << matchList[i].qInfo.pos << " " << int(matchList[i].hamming) << " " << int(matchList[i].redundancy) << endl; -// } -// } + if (true) { + cout << "# " << currentQuery << " " << queryList[currentQuery].name << endl; + for (size_t i = offset; i < end + 1; i++) { + cout << matchList[i].targetId << " " << matchList[i].qInfo.frame << " " << matchList[i].qInfo.pos << " " << int(matchList[i].hamming) << " " << int(matchList[i].redundancy) << endl; + } + } // Get the best species for current query vector speciesMatches; speciesMatches.reserve(end - offset + 1); @@ -149,7 +149,9 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, } // Lower rank classification - TaxID result = lowerRankClassification(queryList[currentQuery].taxCnt, speciesScore.taxId); + TaxID result = lowerRankClassification(queryList[currentQuery].taxCnt, + speciesScore.taxId, + queryList[currentQuery].queryLength + queryList[currentQuery].queryLength2); // Store classification results queryList[currentQuery].isClassified = true; @@ -200,7 +202,7 @@ void Taxonomer::filterRedundantMatches(vector & speciesMatches, } } -TaxID Taxonomer::lowerRankClassification(const map & taxCnt, TaxID spTaxId) { +TaxID Taxonomer::lowerRankClassification(const map & taxCnt, TaxID spTaxId, int queryLength) { // size_t matchNum = matches.size(); // for (size_t i = 0; i < matchNum; i++) { // // cout << matches[i].targetId << endl; @@ -223,7 +225,7 @@ TaxID Taxonomer::lowerRankClassification(const map & taxCnt, TaxID s // } // taxCnt[minHammingTaxId]++; // } - + unsigned int maxCnt = (queryLength - 1)/100 + 1; unordered_map cladeCnt; getSpeciesCladeCounts(taxCnt, cladeCnt, spTaxId); if (accessionLevel == 2) { // Don't do accession-level classification @@ -237,9 +239,9 @@ TaxID Taxonomer::lowerRankClassification(const map & taxCnt, TaxID s it->first)); } } - return BFS(cladeCnt, spTaxId); + return BFS(cladeCnt, spTaxId, maxCnt); } else { - return BFS(cladeCnt, spTaxId); + return BFS(cladeCnt, spTaxId, maxCnt); } } @@ -262,11 +264,11 @@ void Taxonomer::getSpeciesCladeCounts(const map &taxCnt, } } -TaxID Taxonomer::BFS(const unordered_map & cladeCnt, TaxID root) { +TaxID Taxonomer::BFS(const unordered_map & cladeCnt, TaxID root, unsigned int maxCnt) { + unsigned int maxCnt2 = maxCnt; if (cladeCnt.at(root).children.empty()) { // root is a leaf return root; } - unsigned int maxCnt = minSSMatch; unsigned int currentCnt; vector bestChildren; for (auto it = cladeCnt.at(root).children.begin(); it != cladeCnt.at(root).children.end(); it++) { @@ -280,7 +282,7 @@ TaxID Taxonomer::BFS(const unordered_map & cladeCnt, TaxID r } } if (bestChildren.size() == 1) { - return BFS(cladeCnt, bestChildren[0]); + return BFS(cladeCnt, bestChildren[0], maxCnt2); } else { return root; } @@ -346,20 +348,27 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector & speciesMatch } // More than one species --> LCA + float 
coveredLength = 0.f; if (maxSpecies.size() > 1) { bestScore.LCA = true; bestScore.taxId = taxonomy->LCA(maxSpecies)->taxId; for (auto & sp : maxSpecies) { bestScore.score += species2score[sp]; + coveredLength = 0; + for (auto & matchPath : species2matchPaths[maxSpecies[0]]) { + coveredLength += matchPath.end - matchPath.start + 1; + } + bestScore.coverage += coveredLength / queryLength; } bestScore.score /= maxSpecies.size(); + bestScore.coverage /= maxSpecies.size(); return bestScore; } + // One species bestScore.taxId = maxSpecies[0]; bestScore.score = species2score[maxSpecies[0]]; - float coveredLength = 0.f; int hammingDist = 0; for (auto & matchPath : species2matchPaths[maxSpecies[0]]) { coveredLength += matchPath.end - matchPath.start + 1; @@ -438,20 +447,27 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector & speciesMatch } // More than one species --> LCA + float coveredLength = 0.f; if (maxSpecies.size() > 1) { bestScore.LCA = true; bestScore.taxId = taxonomy->LCA(maxSpecies)->taxId; for (auto & sp : maxSpecies) { bestScore.score += species2score[sp]; + coveredLength = 0; + for (auto & matchPath : species2matchPaths[maxSpecies[0]]) { + coveredLength += matchPath.end - matchPath.start + 1; + } + bestScore.coverage += coveredLength / (readLength1 + readLength2); } bestScore.score /= maxSpecies.size(); + bestScore.coverage /= maxSpecies.size(); return bestScore; } // One species bestScore.taxId = maxSpecies[0]; bestScore.score = species2score[maxSpecies[0]]; - float coveredLength = 0.f; + int hammingDist = 0; for (auto & matchPath : species2matchPaths[maxSpecies[0]]) { coveredLength += matchPath.end - matchPath.start + 1; @@ -698,14 +714,6 @@ void Taxonomer::remainConsecutiveMatches(const vector & curFrameM } } } - -// if (par.printLog) { -// cout << "filteredMatchIdx: "; -// for (auto &idx: filteredMatchIdx) { -// cout << idx << " "; -// } -// cout << endl; -// } } depthScore Taxonomer::DFS(const vector &matches, diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h index 23f23b3b..0d00e77d 100644 --- a/src/commons/Taxonomer.h +++ b/src/commons/Taxonomer.h @@ -111,12 +111,6 @@ class Taxonomer { size_t depth, size_t MIN_DEPTH, unordered_set &used, unordered_map &idx2depthScore, unordered_map & edges, float score, int hammingDist); - // depthScore DFS(const vector & curFrameMatches, - // size_t curMatchIdx, - // const map>& linkedMatches, - // size_t depth, size_t MIN_DEPTH, unordered_set& used, - // unordered_map & idx2depth, - // size_t startPos, vector & matchPaths); static bool isConsecutive(const Match * match1, const Match * match2); @@ -175,13 +169,13 @@ class Taxonomer { int queryLength, int queryLength2); - TaxID lowerRankClassification(const map & matches, TaxID speciesID); + TaxID lowerRankClassification(const map & matches, TaxID speciesID, int queryLength); void getSpeciesCladeCounts(const map & taxCnt, unordered_map & cladeCnt, TaxID spciesID); - TaxID BFS(const unordered_map & cladeCnt, TaxID root); + TaxID BFS(const unordered_map & cladeCnt, TaxID root, unsigned int maxCnt); // Getters unordered_map & getTaxCounts() { return taxCounts; } diff --git a/src/workflow/classify.cpp b/src/workflow/classify.cpp index 0ce445ff..460bab34 100644 --- a/src/workflow/classify.cpp +++ b/src/workflow/classify.cpp @@ -25,7 +25,6 @@ void setClassifyDefaults(LocalParameters & par){ par.maskProb = 0.9; par.matchPerKmer = 4; par.accessionLevel = 0; - par.minSSMatch = 3; } int classify(int argc, const char **argv, const Command& command) From 
1e50b1e603f2518854a3fe10af3503beed0d1741 Mon Sep 17 00:00:00 2001
From: Jaebeom Kim
Date: Mon, 13 Nov 2023 17:22:59 +0900
Subject: [PATCH 63/65] 1. Improved combineMatchPaths() 2. Undo changes in getExtendedORFs

---
 src/commons/Match.h         |  36 ++++++++--
 src/commons/SeqIterator.cpp | 130 ++++++++++++++++++------------------
 src/commons/Taxonomer.cpp   |  57 +++++++++-------
 src/commons/Taxonomer.h     |   2 +-
 4 files changed, 132 insertions(+), 93 deletions(-)

diff --git a/src/commons/Match.h b/src/commons/Match.h
index 9bc1fb44..d2a02846 100644
--- a/src/commons/Match.h
+++ b/src/commons/Match.h
@@ -29,8 +29,8 @@ struct Match { // 20 byte
               << targetId << " " << speciesId << " " << rightEndHamming << " " << (int)hamming << " " << getScore() << std::endl;
     }
 
-    float getScore(float score = 0.0f, int cnt = 0) const {
-        int currentHamming = GET_2_BITS(rightEndHamming >> cnt * 2);
+    float getScore(float score = 0.0f, int cnt = 0) const {
+        int currentHamming = GET_2_BITS(rightEndHamming >> (cnt * 2));
         if (currentHamming == 0) {
             score += 3.0f;
         } else {
@@ -39,9 +39,37 @@ struct Match { // 20 byte
         if (cnt == 7) {
             return score;
         } else {
-            return getScore(score, cnt + 1);
+            return getScore(score, cnt + 1);
         }
     }
+
+    // From the fields 87654321, we want to know 678
+    float getRightPartScore(const int range, float score = 0.0f, int cnt = 0) const {
+        if (cnt == range) {
+            return score;
+        }
+        int currentHamming = GET_2_BITS(rightEndHamming >> (14 - cnt * 2));
+        if (currentHamming == 0) {
+            score += 3.0f;
+        } else {
+            score += 2.0f - 0.5f * currentHamming;
+        }
+        return getRightPartScore(range, score, cnt + 1);
+    }
+
+    // 87654321
+    float getLeftPartScore(const int range, float score = 0.0f, int cnt = 0) const {
+        if (cnt == range) {
+            return score;
+        }
+        int currentHamming = GET_2_BITS(rightEndHamming >> (cnt * 2));
+        if (currentHamming == 0) {
+            score += 3.0f;
+        } else {
+            score += 2.0f - 0.5f * currentHamming;
+        }
+        return getLeftPartScore(range, score, cnt + 1);
+    }
 };
 
-#endif //ADCLASSIFIER2_MATCH_H
+#endif //ADCLASSIFIER2_MATCH_H
\ No newline at end of file
diff --git a/src/commons/SeqIterator.cpp b/src/commons/SeqIterator.cpp
index 6d4cdd24..19fa8f7d 100644
--- a/src/commons/SeqIterator.cpp
+++ b/src/commons/SeqIterator.cpp
@@ -628,76 +628,78 @@ void SeqIterator::getExtendedORFs(struct _gene *genes, struct _node *nodes, vect
         }
     }
 
-    // For the last gene
-    // Extend to the end of the genome
-    isReverse = !(nodes[genes[numOfGene - 1].start_ndx].strand == 1);
-    rightEnd = length - 1;
-    if (isReverse) {
-        frame = (genes[numOfGene - 1].end - 1) % 3;
-        while (rightEnd % 3 != frame) rightEnd--;
-    }
-    // If left region is not covered, cover it.
-    leftEnd = genes[numOfGene - 1].begin - 1;
-    if (hasBeenExtendedToLeft) {
-        leftEnd = genes[numOfGene - 2].end - 1 - 22;
-        if (!isReverse) {
+    // // For the last gene
+    // // Extend to the end of the genome
+    // isReverse = !(nodes[genes[numOfGene - 1].start_ndx].strand == 1);
+    // rightEnd = length - 1;
+    // if (isReverse) {
+    //     frame = (genes[numOfGene - 1].end - 1) % 3;
+    //     while (rightEnd % 3 != frame) rightEnd--;
+    // }
+    // // If left region is not covered, cover it.
+    // leftEnd = genes[numOfGene - 1].begin - 1;
+    // if (hasBeenExtendedToLeft) {
+    //     leftEnd = genes[numOfGene - 2].end - 1 - 22;
+    //     if (!isReverse) {
+    //         frame = (genes[numOfGene - 1].begin - 1) % 3;
+    //         while (leftEnd % 3 != frame) leftEnd++;
+    //     }
+    // }
+    // blocks.emplace_back(leftEnd, rightEnd, isReverse ?
+    // if (find(intergenicKmerList.begin(), intergenicKmerList.end(), rightKmerHash) == intergenicKmerList.end()) {
+    //     intergenicKmerList.push_back(rightKmerHash);
+    // }
+
+    //For the last gene
+    if (find(intergenicKmerList.begin(), intergenicKmerList.end(), leftKmerHash) !=
+        intergenicKmerList.end()) { //extension to left
+        if (!isReverse) { //forward
+            frame = (genes[numOfGene - 1].begin - 1) % 3;
+            leftEnd = genes[numOfGene - 2].end - 1 - 22;
+            while (leftEnd % 3 != frame) leftEnd++;
+            blocks.emplace_back(leftEnd, length - 1, 1);
+            blockIdx++;
+        } else { // reverse
+            frame = (genes[numOfGene - 1].end - 1) % 3;
+            rightEnd = length - 1;
+            while (rightEnd % 3 != frame) rightEnd--;
+            blocks.emplace_back(genes[numOfGene - 2].end - 22 - 1, rightEnd, -1);
+            blockIdx++;
+        }
+    } else { //extension to right
+        if (hasBeenExtendedToLeft) {
+            if (!isReverse) { //forward
+                frame = (genes[numOfGene - 1].begin - 1) % 3;
+                leftEnd = genes[numOfGene - 2].end - 1 - 22;
+                while (leftEnd % 3 != frame) leftEnd++;
+                blocks.emplace_back(leftEnd, length - 1, 1);
+                blockIdx++;
+            } else {
+                frame = (genes[numOfGene - 1].end - 1) % 3;
+                rightEnd = length - 1;
+                while (rightEnd % 3 != frame) rightEnd--;
+                blocks.emplace_back(genes[numOfGene - 2].end - 22 - 1, rightEnd, -1);
+                blockIdx++;
+            }
+        } else {
+            if (!isReverse) {
+                blocks.emplace_back(genes[numOfGene - 1].begin, length - 1, 1);
+                blockIdx++;
+            } else {
+                frame = (genes[numOfGene - 1].end - 1) % 3;
+                rightEnd = length - 1;
+                while (rightEnd % 3 != frame) rightEnd--;
+                blocks.emplace_back(genes[numOfGene - 1].begin - 1, rightEnd, -1);
+                blockIdx++;
+            }
+        }
+
+        //If the current intergenic sequence is new, update intergenicKmerList.
+        if (find(intergenicKmerList.begin(), intergenicKmerList.end(), rightKmerHash) == intergenicKmerList.end()) {
+            intergenicKmerList.push_back(rightKmerHash);
+        }
+    }
 
-    // if (find(intergenicKmerList.begin(), intergenicKmerList.end(), leftKmerHash) !=
-    //     intergenicKmerList.end()) { //extension to left
-    //     if (!isReverse) { //forward
-    //         frame = (genes[numOfGene - 1].begin - 1) % 3;
-    //         leftEnd = genes[numOfGene - 2].end - 1 - 22;
-    //         while (leftEnd % 3 != frame) leftEnd++;
-    //         blocks.emplace_back(leftEnd, length - 1, 1);
-    //         blockIdx++;
-    //     } else { // reverse
-    //         frame = (genes[numOfGene - 1].end - 1) % 3;
-    //         rightEnd = length - 1;
-    //         while (rightEnd % 3 != frame) rightEnd--;
-    //         blocks.emplace_back(genes[numOfGene - 2].end - 22 - 1, rightEnd, -1);
-    //         blockIdx++;
-    //     }
-    // } else { //extension to right
-    //     if (hasBeenExtendedToLeft) {
-    //         if (!isReverse) { //forward
-    //             frame = (genes[numOfGene - 1].begin - 1) % 3;
-    //             leftEnd = genes[numOfGene - 2].end - 1 - 22;
-    //             while (leftEnd % 3 != frame) leftEnd++;
-    //             blocks.emplace_back(leftEnd, length - 1, 1);
-    //             blockIdx++;
-    //         } else {
-    //             frame = (genes[numOfGene - 1].end - 1) % 3;
-    //             rightEnd = length - 1;
-    //             while (rightEnd % 3 != frame) rightEnd--;
-    //             blocks.emplace_back(genes[numOfGene - 2].end - 22 - 1, rightEnd, -1);
-    //             blockIdx++;
-    //         }
-    //     } else {
-    //         if (!isReverse) {
-    //             blocks.emplace_back(genes[numOfGene - 1].begin, length - 1, 1);
-    //             blockIdx++;
-    //         } else {
-    //             frame = (genes[numOfGene - 1].end - 1) % 3;
-    //             rightEnd = length - 1;
-    //             while (rightEnd % 3 != frame) rightEnd--;
-    //             blocks.emplace_back(genes[numOfGene - 1].begin - 1, rightEnd, -1);
-    //             blockIdx++;
-    //         }
-    //     }
-
-    //     //If current intergenic sequences is new, update intergenicKmerList.
-    //     if (find(intergenicKmerList.begin(), intergenicKmerList.end(), rightKmerHash) == intergenicKmerList.end()) {
-    //         intergenicKmerList.push_back(rightKmerHash);
-    //     }
-    // }
 
     free(newIntergenicKmer);
     free(leftKmer);
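The restored getExtendedORFs() logic above repeatedly snaps an extension boundary onto the reading frame of a predicted gene via `while (pos % 3 != frame) pos++` (or `pos--`). A tiny illustration of that idiom (the helper names are ours, not the source's):

    // Move a boundary to the nearest position in the gene's reading frame.
    int snapDownToFrame(int pos, int frame) {   // used for right ends (reverse strand)
        while (pos % 3 != frame) pos--;
        return pos;
    }
    int snapUpToFrame(int pos, int frame) {     // used for left ends (forward strand)
        while (pos % 3 != frame) pos++;
        return pos;
    }
    // e.g. a 1235 nt contig whose last gene ends at 1000:
    // frame = (1000 - 1) % 3 = 0; rightEnd = 1234 is snapped down to 1233.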
diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp
index ac2ad621..5ced0f7e 100644
--- a/src/commons/Taxonomer.cpp
+++ b/src/commons/Taxonomer.cpp
@@ -84,12 +84,12 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery,
                                 vector & queryList,
                                 const LocalParameters &par) {
-    if (true) {
-        cout << "# " << currentQuery << " " << queryList[currentQuery].name << endl;
-        for (size_t i = offset; i < end + 1; i++) {
-            cout << matchList[i].targetId << " " << matchList[i].qInfo.frame << " " << matchList[i].qInfo.pos << " " << int(matchList[i].hamming) << " " << int(matchList[i].redundancy) << endl;
-        }
-    }
+//    if (true) {
+//        cout << "# " << currentQuery << " " << queryList[currentQuery].name << endl;
+//        for (size_t i = offset; i < end + 1; i++) {
+//            cout << matchList[i].targetId << " " << matchList[i].qInfo.frame << " " << matchList[i].qInfo.pos << " " << int(matchList[i].hamming) << " " << int(matchList[i].redundancy) << endl;
+//        }
+//    }
 
     // Get the best species for current query
     vector speciesMatches;
     speciesMatches.reserve(end - offset + 1);
@@ -504,14 +504,23 @@ float Taxonomer::combineMatchPaths(vector & matchPaths,
         bool isOverlapped = false;
         for (size_t j = 0; j < combinedMatchPaths.size(); j++) {
             if (isMatchPathOverlapped(matchPaths[i], combinedMatchPaths[j])) { // overlap!
-                if (isMatchPathLinked(matchPaths[i], combinedMatchPaths[j])) {
-                    // merge two linked matchPaths by editing the combinedMatchPaths[j]
-                    trimMatchPath(matchPaths[i], combinedMatchPaths[j]);
+                int overlappedLength = min(matchPaths[i].end, combinedMatchPaths[j].end)
+                                       - max(matchPaths[i].start, combinedMatchPaths[j].start) + 1;
+                if (overlappedLength < 24) {
+                    trimMatchPath(matchPaths[i], combinedMatchPaths[j], overlappedLength);
                     continue;
                 } else {
                     isOverlapped = true;
                     break;
                 }
+                // if (isMatchPathLinked(matchPaths[i], combinedMatchPaths[j])) {
+                //     // merge two linked matchPaths by editing the combinedMatchPaths[j]
+                //     trimMatchPath(matchPaths[i], combinedMatchPaths[j]);
+                //     continue;
+                // } else {
+                //     isOverlapped = true;
+                //     break;
+                // }
             }
         }
         if (!isOverlapped) {
@@ -581,29 +590,29 @@ void Taxonomer::mergeMatchPaths(const MatchPath & source, MatchPath & target) {
     }
 }
 
-void Taxonomer::trimMatchPath(MatchPath & path1, const MatchPath & path2) {
-    int margin = min(path1.end, path2.end) - max(path1.start, path2.start) + 1 - 21;
+void Taxonomer::trimMatchPath(MatchPath & path1, const MatchPath & path2, int overlapLength) {
+    // int margin = min(path1.end, path2.end) - max(path1.start, path2.start) + 1 - 21;
     if (path1.start < path2.start) {
         path1.end = path2.start - 1;
         uint8_t lastEndHamming = GET_2_BITS(path1.matches.back()->rightEndHamming);
         path1.hammingDist = path1.hammingDist - (path1.matches.back()->hamming - lastEndHamming);
-        path1.score = path1.score - path1.matches.back()->getScore() - margin;
+        path1.score = path1.score - path1.matches.back()->getRightPartScore(overlapLength/3) - (overlapLength % 3);
-        if (lastEndHamming == 0) {
-            path1.score += 3.0f;
-        } else {
-            path1.score += 2.0f - 0.5f * lastEndHamming;
-        }
-        path1.matches.pop_back();
+        // if (lastEndHamming == 0) {
+        //     path1.score += 3.0f;
+        // } else {
+        //     path1.score += 2.0f - 0.5f * lastEndHamming;
+        // }
+        // path1.matches.pop_back(); // unnecessary without checking isLinked
    } else {
         path1.start = path2.end + 1;
         uint8_t lastEndHamming = GET_2_BITS(path1.matches.front()->rightEndHamming >> 14);
         path1.hammingDist = path1.hammingDist - (path1.matches.front()->hamming - lastEndHamming);
-        path1.score = path1.score - path1.matches.front()->getScore() - margin;
+        path1.score = path1.score - path1.matches.front()->getLeftPartScore(overlapLength/3) - (overlapLength % 3);
-        if (lastEndHamming == 0) {
-            path1.score += 3.0f;
-        } else {
-            path1.score += 2.0f - 0.5f * lastEndHamming;
-        }
+        // if (lastEndHamming == 0) {
+        //     path1.score += 3.0f;
+        // } else {
+        //     path1.score += 2.0f - 0.5f * lastEndHamming;
+        // }
         // path1.matches.erase(path1.matches.begin());
     }
 }
diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h
index 0d00e77d..70211255 100644
--- a/src/commons/Taxonomer.h
+++ b/src/commons/Taxonomer.h
@@ -101,7 +101,7 @@ class Taxonomer {
 
     void mergeMatchPaths(const MatchPath & source, MatchPath & target);
 
-    void trimMatchPath(MatchPath & path1, const MatchPath & path2);
+    void trimMatchPath(MatchPath & path1, const MatchPath & path2, int overlapLength);
 
     void filterRedundantMatches(vector & speciesMatches,
                                 map & taxCnt);
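With this change, trimming an overlapped path subtracts only the score of the overlapped part: `overlapLength/3` of the packed per-position codes are re-scored through `getRightPartScore`/`getLeftPartScore`, and the `overlapLength % 3` leftover nucleotides are charged one point each. An iterative sketch of the 2-bit decoding those helpers perform, assuming the eight per-position Hamming codes are packed into `rightEndHamming` the way `getScore()` implies:

    #include <cstdint>

    #define GET_2_BITS(x) ((x) & 0x3)   // assumed equivalent of the source macro

    // Mirrors getLeftPartScore: score `range` codes from the low end.
    float leftPartScore(uint16_t rightEndHamming, int range) {
        float score = 0.0f;
        for (int cnt = 0; cnt < range; ++cnt) {
            int d = GET_2_BITS(rightEndHamming >> (cnt * 2));
            score += (d == 0) ? 3.0f : 2.0f - 0.5f * d;   // 3 for exact, else 2 - 0.5 * dist
        }
        return score;
    }

    // Mirrors getRightPartScore: score `range` codes from the high end.
    float rightPartScore(uint16_t rightEndHamming, int range) {
        float score = 0.0f;
        for (int cnt = 0; cnt < range; ++cnt) {
            int d = GET_2_BITS(rightEndHamming >> (14 - cnt * 2));
            score += (d == 0) ? 3.0f : 2.0f - 0.5f * d;
        }
        return score;
    }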
From c81d4265474ad45b635eae819c8ad6f360a4c0c9 Mon Sep 17 00:00:00 2001
From: Jaebeom Kim
Date: Mon, 13 Nov 2023 19:24:15 +0900
Subject: [PATCH 64/65] Ignore k-mer matches with DNA sequence identity < 70%

---
 src/commons/KmerMatcher.cpp |  2 +-
 src/commons/Match.h         |  2 --
 src/commons/Taxonomer.cpp   | 32 +++++++-------------------------
 src/commons/Taxonomer.h     |  5 ++++-
 4 files changed, 12 insertions(+), 29 deletions(-)

diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp
index 248539fd..c7331405 100644
--- a/src/commons/KmerMatcher.cpp
+++ b/src/commons/KmerMatcher.cpp
@@ -513,7 +513,7 @@ void KmerMatcher::compareDna(uint64_t query,
 
     // Select target k-mers that passed hamming criteria
     for (size_t h = 0; h < size; h++) {
-        if (hammingSums[h] <= min(minHammingSum * 2, 8)) {
+        if (hammingSums[h] <= min(minHammingSum * 2, 7)) {
             selectedMatches.push_back(h);
             selectedHammingSum.push_back(hammingSums[h]);
             if (frame < 3) {
diff --git a/src/commons/Match.h b/src/commons/Match.h
index d2a02846..48950664 100644
--- a/src/commons/Match.h
+++ b/src/commons/Match.h
@@ -43,7 +43,6 @@ struct Match { // 20 byte
         }
     }
 
-    // From 87654321, we want to know 678
     float getRightPartScore(const int range, float score = 0.0f, int cnt = 0) const {
         if (cnt == range) {
             return score;
@@ -57,7 +56,6 @@ struct Match { // 20 byte
         return getRightPartScore(range, score, cnt + 1);
     }
 
-    // 87654321
     float getLeftPartScore(const int range, float score = 0.0f, int cnt = 0) const {
         if (cnt == range) {
             return score;
diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp
index 5ced0f7e..575663a9 100644
--- a/src/commons/Taxonomer.cpp
+++ b/src/commons/Taxonomer.cpp
@@ -27,6 +27,12 @@ Taxonomer::Taxonomer(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxon
     minConsCnt = par.minConsCnt;
     minConsCntEuk = par.minConsCntEuk;
     eukaryotaTaxId = par.eukaryotaTaxId;
+
+    if (par.seqMode == 1 || par.seqMode == 2) {
+        denominator = 100;
+    } else {
+        denominator = 1000;
+    }
 }
 
 Taxonomer::~Taxonomer() {
@@ -196,36 +202,12 @@ void Taxonomer::filterRedundantMatches(vector & speciesMatches,
             }
             i++;
         }
-        // cout << minHammingMatch->targetId << " " << minHammingMatch->qInfo.frame << " " << minHammingMatch->qInfo.pos << " " << int(minHammingMatch->hamming) << " " << int(minHammingMatch->redundancy) << endl;
         taxCnt[minHammingTaxId]++;
     }
 }
 
 TaxID Taxonomer::lowerRankClassification(const map & taxCnt, TaxID spTaxId, int queryLength) {
-    // size_t matchNum = matches.size();
-    // for (size_t i = 0; i < matchNum; i++) {
-    //     // cout << matches[i].targetId << endl;
-    //     // taxCnt[matches[i].targetId] ++;
-    //     size_t currQuotient = matches[i].qInfo.pos / 3;
-    //     uint8_t minHamming = matches[i].hamming;
-    //     Match * minHammingMatch = & matches[i];
-    //     TaxID minHammingTaxId = minHammingMatch->targetId;
-    //     while ((i < matchNum) && (currQuotient == matches[i].qInfo.pos / 3)) {
-    //         if (matches[i].hamming < minHamming) {
-    //             minHamming = matches[i].hamming;
-    //             minHammingMatch = & matches[i];
-    //             minHammingTaxId = minHammingMatch->targetId;
-    //         } else if (matches[i].hamming == minHamming) {
-    //             minHammingTaxId = taxonomy->LCA(minHammingTaxId, matches[i].targetId);
-    //             minHammingMatch->redundancy = true;
-    //             matches[i].redundancy = true;
-    //         }
-    //         i++;
-    //     }
-    //     taxCnt[minHammingTaxId]++;
-    // }
-    unsigned int maxCnt = (queryLength - 1)/100 + 1;
+    unsigned int maxCnt = (queryLength - 1)/denominator + 1;
     unordered_map cladeCnt;
     getSpeciesCladeCounts(taxCnt, cladeCnt, spTaxId);
     if (accessionLevel == 2) { // Don't do accession-level classification
diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h
index 70211255..6da92429 100644
--- a/src/commons/Taxonomer.h
+++ b/src/commons/Taxonomer.h
@@ -48,7 +48,7 @@ class Taxonomer {
     int unmaskedPos[9];
     int spaceNum;
 
-    // Parameters
+    // Parameters from user
     int maxGap;
     int minCoveredPos;
     int accessionLevel;
@@ -57,6 +57,9 @@ class Taxonomer {
     int minConsCntEuk;
     int eukaryotaTaxId;
 
+    // Internal
+    int denominator;
+
     struct MatchBlock {
         MatchBlock(size_t start, size_t end, int id) : start(start), end(end), id(id) {}
         MatchBlock() : start(0), end(0), id(0) {}
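The tightened cutoff in KmerMatcher.cpp is where the commit message's 70% figure comes from: a metamer spans 8 amino acids = 24 nt, so capping the summed Hamming distance at 7 keeps only matches with at least 17/24 ≈ 70.8% DNA identity, whereas the old cap of 8 still admitted 16/24 ≈ 66.7%. As a checked one-liner (treating the Hamming sum as the count of mismatched bases, which is how we read the source):

    // Identity floor implied by a Hamming-sum cap over a 24 nt metamer.
    constexpr double identityFloor(int cap) { return (24.0 - cap) / 24.0; }
    static_assert(identityFloor(7) > 0.70, "cap 7 keeps identity above 70%");
    static_assert(identityFloor(8) < 0.70, "cap 8 allowed matches below 70%");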
From 4f617158acf81cfc9944a2d93c580d7fb2a3a52f Mon Sep 17 00:00:00 2001
From: Jaebeom Kim
Date: Mon, 13 Nov 2023 23:37:15 +0900
Subject: [PATCH 65/65] solve merge conflict in mapping2taxon.cpp

---
 src/util/mapping2taxon.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/util/mapping2taxon.cpp b/src/util/mapping2taxon.cpp
index 543ad9fe..d7aedfdd 100644
--- a/src/util/mapping2taxon.cpp
+++ b/src/util/mapping2taxon.cpp
@@ -100,8 +100,4 @@ int mapping2taxon(int argc, const char **argv, const Command &command) {
     }
 
     return 0;
-<<<<<<< HEAD
 }
-=======
-}
->>>>>>> newScore
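For reference, the `denominator` introduced in patch 64 makes the `maxCnt` handed to `BFS()` scale with read length: roughly one count per 100 nt in the short-read modes (`--seq-mode` 1/2) and per 1000 nt otherwise. The integer form is a ceiling division; a sketch with illustrative values:

    // maxCnt = ceil(queryLength / denominator), in integer arithmetic.
    unsigned int maxCnt(int queryLength, int denominator) {
        return (queryLength - 1) / denominator + 1;
    }
    // e.g. a 150 nt short read:  (150 - 1) / 100 + 1  = 2
    //      a 5000 nt long read:  (5000 - 1) / 1000 + 1 = 5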