From 8a7783b662c34609717f425c0c5645915ded0e78 Mon Sep 17 00:00:00 2001 From: JaebeomKim0731 Date: Fri, 11 Aug 2023 15:20:11 +0900 Subject: [PATCH 01/65] clean codes for parameters --- src/commons/LocalParameters.cpp | 105 +++++++++++++++----------------- src/commons/LocalParameters.h | 2 - src/workflow/classify.cpp | 1 - 3 files changed, 48 insertions(+), 60 deletions(-) diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index 5a7c7d5c..47b2b689 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -9,28 +9,28 @@ LocalParameters::LocalParameters() : "NCBI: 10239 [Default]\nCUSTOM: Check names.dmp file ", typeid(int), (void *) &virusTaxId, - "[^[1-9]\\d*$]"), + "^[0-9]+$"), BACTERIA_TAX_ID(BACTERIA_TAX_ID_ID, "--bacteria-taxid", "Taxonomy ID of bacteria taxon", "NCBI: 2 [Default]\nCUSTOM: Check names.dmp file ", typeid(int), (void *) &bacteriaTaxId, - "[^[1-9]\\d*$]"), + "^[0-9]+$"), ARCHAEA_TAX_ID(ARCHAEA_TAX_ID_ID, "--archaea-taxid", "Taxonomy ID of archaea taxon", "NCBI: 2157 [Default]\nCUSTOM: Check names.dmp file ", typeid(int), (void *) &archaeaTaxId, - "[^[1-9]\\d*$]"), + "^[0-9]+$"), EUKARYOTA_TAX_ID(EUKARYOTA_TAX_ID_ID, "--eukaryota-taxid", "Taxonomy ID of eukaryota taxon", "NCBI: 2759 [Default]\nCUSTOM: Check names.dmp file ", typeid(int), (void *) &eukaryotaTaxId, - "[^[1-9]\\d*$]"), + "^[0-9]+$"), SEQ_MODE(SEQ_MODE_ID, "--seq-mode", "Sequencing type", @@ -38,13 +38,6 @@ LocalParameters::LocalParameters() : typeid(int), (void *) &seqMode, "[1-3]"), - MEMORY_MODE(MEMORY_MODE_ID, - "--memory-mode", - "Keeping k-mer matches in the RAM or writing into a file", - "Writing: 1 \nRAM: 2", - typeid(int), - (void *) &memoryMode, - "[1-2]"), REDUCED_AA(REDUCED_AA_ID, "--reduced-aa", "Using reduced 15 alphabets to encode amino acids. 
It increases sensitivity", @@ -73,14 +66,14 @@ LocalParameters::LocalParameters() : "A mask should contain at least eight '1's, and '0' means skip.", typeid(std::string), (void *) &spaceMask, - ""), + "^.*$"), MIN_COVERED_POS(MIN_COVERED_POS_ID, "--min-covered-pos", "Minimum number of covered positions of a range", "Minimum number of covered positions of a range", typeid(int), (void *) &minCoveredPos, - ""), + "^[0-9]+$"), HAMMING_MARGIN(HAMMING_MARGIN_ID, "--hamming-margin", "If a query k-mer has multiple matches, the matches with hamming distance lower than sum of \n" @@ -103,35 +96,63 @@ LocalParameters::LocalParameters() : "Path to prodigal training information files", typeid(std::string), (void *) &tinfoPath, - ""), + "^.*$"), RAM_USAGE(RAM_USAGE_ID, "--max-ram", "RAM usage in GiB", "RAM usage in GiB", typeid(int), (void *) &ramUsage, - "^[1-9]{1}[0-9]*$"), + "^[0-9]+$"), PRINT_LOG(PRINT_LOG_ID, "--print-log", "Print logs to debug", "Print logs to debug", typeid(int), (void *) &printLog, - ""), + "^[0-9]+$"), + MAX_GAP(MAX_GAP_ID, + "--max-gap", + "Maximum gap between two consecutive k-mers (used only with spaced k-mer)", + "Maximum gap between two consecutive k-mers (used only with spaced k-mer)", + typeid(int), + (void *) &maxGap, + "^[0-9]+$"), + MIN_CONS_CNT(MIN_CONS_CNT_ID, + "--min-cons-cnt", + "Minimum number of consecutive metamer matches to be used for prokaryote/virus classification", + "Minimum number of consecutive metamer matches to be used for prokaryote/virus classification", + typeid(int), + (void *) &minConsCnt, + "^[0-9]+$"), + MIN_CONS_CNT_EUK(MIN_CONS_CNT_EUK_ID, + "--min-cons-cnt-euk", + "Minimum number of consecutive metamer matches to be used for eukaryote classification", + "Minimum number of consecutive metamer matches to be used for eukaryote classification", + typeid(int), + (void *) &minConsCntEuk, + "^[0-9]+$"), + MATCH_PER_KMER(MATCH_PER_KMER_ID, + "--match-per-kmer", + "Number of matches per query k-mer", + "Number of matches per query k-mer. 
Larger values assign more memory for storing k-mer matches.", + typeid(int), + (void *) &matchPerKmer, + "^[0-9]+$"), LIBRARY_PATH(LIBRARY_PATH_ID, "--library-path", "Path to library where the FASTA files are stored", "Path to library where the FASTA files are stored", typeid(std::string), (void *) &libraryPath, - ""), + "^.*$"), TAXONOMY_PATH(TAXONOMY_PATH_ID, "--taxonomy-path", "Directory where the taxonomy dump files are stored", "Directory where the taxonomy dump files are stored", typeid(std::string), (void *) &taxonomyPath, - ""), + "^.*$"), IS_ASSEMBLY(IS_ASSEMBLY_ID, "--assembly", "Input is an assembly", @@ -139,97 +160,69 @@ LocalParameters::LocalParameters() : typeid(bool), (void *) &assembly, ""), - MAX_GAP(MAX_GAP_ID, - "--max-gap", - "Maximum gap between two consecutive k-mers (used only with spaced k-mer)", - "Maximum gap between two consecutive k-mers (used only with spaced k-mer)", - typeid(int), - (void *) &maxGap, - ""), - MIN_CONS_CNT(MIN_CONS_CNT_ID, - "--min-cons-cnt", - "Minimum number of consecutive metamer matches to be used for prokaryote/virus classification", - "Minimum number of consecutive metamer matches to be used for prokaryote/virus classification", - typeid(int), - (void *) &minConsCnt, - ""), - MIN_CONS_CNT_EUK(MIN_CONS_CNT_EUK_ID, - "--min-cons-cnt-euk", - "Minimum number of consecutive metamer matches to be used for eukaryote classification", - "Minimum number of consecutive metamer matches to be used for eukaryote classification", - typeid(int), - (void *) &minConsCntEuk, - ""), SPLIT_NUM(SPLIT_NUM_ID, "--split-num", "A database is divided to N splits (offsets). During classification, unnecessary splits are skipped", "A database is divided to N splits (offsets). During classification, unnecessary splits are skipped", typeid(int), (void *) &splitNum, - ""), + "^[0-9]+$"), BUFFER_SIZE(BUFFER_SIZE_ID, "--buffer-size", "Buffer size (the number of k-mers)", "Buffer size (the number of k-mers)", typeid(size_t), (void *) &bufferSize, - ""), + "^[0-9]+$"), TEST_RANK(TEST_RANK_ID, "--test-rank", ".", "csv of ranks to be tested", typeid(std::string), (void *) &testRank, - ""), + "^.*$"), TEST_TYPE(TEST_TYPE_ID, "--test-type", ".", "Test Type", typeid(std::string), (void *) &testType, - ""), + "^.*$"), READID_COL(READID_COL_ID, "--readid-col", "Column number of accession in classification result", "Column number of accession in classification result", typeid(int), (void *) &readIdCol, - ""), + "^[0-9]+$"), TAXID_COL(TAXID_COL_ID, "--taxid-col", "Column number of taxonomy ID in classification result", "Column number of taxonomy ID in classification result", typeid(int), (void *) &taxidCol, - ""), + "^[0-9]+$"), SCORE_COL(SCORE_COL_ID, "--score-col", "Column number of score in classification result", "Column number of score in classification result", typeid(int), (void *) &scoreCol, - ""), + "^[0-9]+$"), COVERAGE_COL(COVERAGE_COL_ID, "--coverage-col", "Column number of coverage in classification result", "Column number of coverage in classification result", typeid(int), (void *) &coverageCol, - ""), - MATCH_PER_KMER(MATCH_PER_KMER_ID, - "--match-per-kmer", - "Number of matches per query k-mer", - "Number of matches per query k-mer. 
Larger values assign more memory for storing k-mer matches.", - typeid(int), - (void *) &matchPerKmer, - ""), + "^[0-9]+$"), PRINT_COLUMNS(PRINT_COLUMNS_ID, "--print-columns", "CSV of column numbers to be printed", "CSV of column numbers to be printed", typeid(std::string), (void *) &printColumns, - "") + "^.*$") { //add_to_library @@ -247,12 +240,10 @@ LocalParameters::LocalParameters() : classify.push_back(&PARAM_THREADS); classify.push_back(&SEQ_MODE); classify.push_back(&VIRUS_TAX_ID); -// classify.push_back(&MEMORY_MODE); classify.push_back(&REDUCED_AA); classify.push_back(&MIN_SCORE); classify.push_back(&MIN_COVERAGE); classify.push_back(&SPACED); -// classify.push_back(&MIN_CONSECUTIVE); classify.push_back(&HAMMING_MARGIN); classify.push_back(&MIN_SP_SCORE); classify.push_back(&PARAM_V); diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index abde1c8f..4169ab25 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -40,7 +40,6 @@ class LocalParameters : public Parameters { // Classify PARAMETER(SEQ_MODE) - PARAMETER(MEMORY_MODE) PARAMETER(REDUCED_AA) PARAMETER(MIN_SCORE) PARAMETER(MIN_COVERAGE) @@ -81,7 +80,6 @@ class LocalParameters : public Parameters { // Classify int seqMode; - int memoryMode; int reducedAA; float minScore; std::string spaceMask; diff --git a/src/workflow/classify.cpp b/src/workflow/classify.cpp index 69786781..e88a4aa4 100644 --- a/src/workflow/classify.cpp +++ b/src/workflow/classify.cpp @@ -6,7 +6,6 @@ #include "FileUtil.h" void setClassifyDefaults(LocalParameters & par){ - par.virusTaxId = 10239;// Taxonomy ID of virus taxon in NCBI par.seqMode = 2; par.memoryMode = 1; par.reducedAA = 0; From ccb05116edaab67fef70d3d4224506fb4a163a91 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim <68528165+jaebeom-kim@users.noreply.github.com> Date: Fri, 1 Sep 2023 13:31:29 +0900 Subject: [PATCH 02/65] Update README.md --- README.md | 103 +++++++++++++++++++++++++++++------------------------- 1 file changed, 55 insertions(+), 48 deletions(-) diff --git a/README.md b/README.md index 65b00984..6765aa17 100644 --- a/README.md +++ b/README.md @@ -82,7 +82,7 @@ metabuli classify --seq-mode 1 read.fna dbdir outdir jobid --reduced-aa : 0. Use 20 alphabets or 1. Use 15 alphabets to encode amino acids. Give the same value used for DB creation. --spacing-mask : Binary patterend mask for spaced k-mer. The same mask must be used for DB creation and classification. A mask should contain at least eight '1's, and '0' means skip. - * --min-score and --min-sp-score for precision mode are optimized only for short reads. + * Values of --min-score and --min-sp-score for precision mode are optimized only for short reads. * We don't recommend using them for long reads. ``` @@ -135,79 +135,86 @@ We tested it with a MacBook Air (2020, M1, 8 GiB), where we classified about 1.5 ## Custom database To build a custom database, you need three things: -1. **FASTA files** : Each sequence of your FASTA files must be separated by '>accession.version' like '>CP001849.1' -2. **accession2taxid** : Mapping from acession to taxonomy identifier. Sequences whose accessions are not listed in this file will be skipped. -3. **NCBI-style taxonomy dump** : 'names.dmp' , 'nodes.dmp', and 'merged.dmp' are required. Sequences whose taxid are not included here will be skipped. +1. **FASTA files** : Each sequence of your FASTA files must be separated by '>accession.version' like '>CP001849.1'. +2. **accession2taxid** : Mapping from accession to taxonomy ID. 
The sequences whose accessions are not listed here will be skipped.
3. **NCBI-style taxonomy dump** : 'names.dmp', 'nodes.dmp', and 'merged.dmp' are required. The sequences whose taxonomy IDs are not included here will be skipped.

The steps for building a database with NCBI or GTDB taxonomy are described below.

### To build a database with NCBI taxonomy
#### 1. Prepare taxonomy and accession2taxid
* accession2taxid can be downloaded from https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/
* Taxonomy dump files can be downloaded from https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/

#### 2. Add to library
```
metabuli add-to-library <FASTA list> <accession2taxid> <DBDIR>
- FASTA list: A file containing absolute paths of each FASTA file.
- accession2taxid: A path to NCBI-style accession2taxid.
- DBDIR: Sequences will be stored in 'DBDIR/library'.
```
It groups your sequences into separate files according to their species.
Accessions that are not included in the `<accession2taxid>` will be skipped and listed in `unmapped.txt`.

#### 3. Build
```
metabuli build <DBDIR> <FASTA list> <accession2taxid> [options]
- DBDIR: The same DBDIR from the previous step.
- FASTA list: A file containing absolute paths of the FASTA files in DBDIR/library
- accession2taxid : A path to NCBI-style accession2taxid.

  * Options
   --threads : The number of CPU-cores used (all by default)
-  --tinfo-path : Path to prodigal training information files. (DBDIR/prodigal by default)
   --taxonomy-path: Directory where the taxonomy dump files are stored. (DBDIR/taxonomy by default)
   --reduced-aa : 0. Use 20 alphabets or 1. Use 15 alphabets to encode amino acids.
   --spacing-mask : Binary patterned mask for spaced k-mer. The same mask must be used for DB creation and classification. A mask should contain at least eight '1's, and '0' means skip.
```
This will generate **diffIdx**, **info**, **split**, and **taxID_list** and some other files. You can delete '\*\_diffIdx' and '\*\_info' if generated.

### To build a database with GTDB taxonomy
#### 1. Prepare GTDB taxonomy and accession2taxid
*Requirements*: You need assembly FASTA files whose file name (or path) includes the assembly accession.
If you downloaded assemblies using `ncbi-genome-download`, you probably don't need to worry about it.
The regular expression of assembly accessions is `(GC[AF]_[0-9].[0-9])`.

```
# 1.
In the 'util' directory
./prepare_gtdb_taxonomy.sh <DBDIR>
- DBDIR : Result files are stored in 'DBDIR/taxonomy'.
```
This will generate taxonomy dump files and `assacc_to_taxid.tsv` with other files.

```
# 2.
metabuli add-to-library <FASTA list> <accession2taxid> <DBDIR> --assembly true
- FASTA list : A file containing absolute paths of each assembly file.
  Each path must include a corresponding assembly accession.
- accession2taxid : 'assacc_to_taxid.tsv' from the previous step
- DBDIR : The same DBDIR from the previous step.
```
This will add your FASTA files to DBDIR/library according to their species taxonomy ID and generate 'my.accession2taxid'.

#### 2. Build
```
metabuli build <DBDIR> <FASTA list> <accession2taxid> [options]
- DBDIR: The same DBDIR from the previous step.
- FASTA list: A file containing absolute paths of the FASTA files in DBDIR/library
- accession2taxid : A path to NCBI-style accession2taxid.

  * Options
   --threads : The number of CPU-cores used (all by default)
   --taxonomy-path: Directory where the taxonomy dump files are stored. (DBDIR/taxonomy by default)
   --reduced-aa : 0. Use 20 alphabets or 1. Use 15 alphabets to encode amino acids.
   --spacing-mask : Binary mask for spaced metamer. The same mask must be used for DB creation and classification. A mask should contain at least eight '1's, and '0' means skip.
```
This will generate **diffIdx**, **info**, **split**, and **taxID_list** and some other files. You can delete '\*\_diffIdx' and '\*\_info' if generated.

## Example
```
Classifying RNA-seq reads from a COVID-19 patient to identify the culprit variant.
```
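Commit 06 below shows how the `--spacing-mask` option is consumed: the old `Classifier` constructor walks `par.spaceMask`, subtracts '0' (48) from each character, and records the positions of the '1's as unmasked k-mer positions. The following standalone sketch illustrates that convention only; the function name and the demo mask are hypothetical, not Metabuli code.

```
#include <iostream>
#include <string>
#include <vector>

// Collect the indices of '1' positions from a binary mask string.
// '1' keeps a position; '0' means skip, as described in the README above.
std::vector<int> unmaskedPositions(const std::string &spaceMask) {
    std::vector<int> positions;
    for (size_t i = 0; i < spaceMask.size(); i++) {
        if (spaceMask[i] == '1') {
            positions.push_back((int) i);
        }
    }
    return positions;
}

int main() {
    const std::string mask = "11101101011"; // contains eight '1's, the README's minimum
    for (int p : unmaskedPositions(mask)) {
        std::cout << p << ' ';
    }
    std::cout << '\n'; // prints: 0 1 2 4 5 7 9 10
    return 0;
}
```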
From d77f6b18ca39b5abefd7206a668a5d019c86bebc Mon Sep 17 00:00:00 2001 From: JaebeomKim0731 Date: Fri, 11 Aug 2023 15:53:37 +0900 Subject: [PATCH 03/65] support fna.gz files for add-to-library module --- src/workflow/add_to_library.cpp | 114 +++++++++++--------------------- 1 file changed, 37 insertions(+), 77 deletions(-) diff --git a/src/workflow/add_to_library.cpp b/src/workflow/add_to_library.cpp index bf4d6fa9..0d533eec 100644 --- a/src/workflow/add_to_library.cpp +++ b/src/workflow/add_to_library.cpp @@ -5,7 +5,6 @@ #include "KSeqWrapper.h" #include #include "IndexCreator.h" -#include #include "FileUtil.h" using namespace std; @@ -28,9 +27,8 @@ int addToLibrary(int argc, const char **argv, const Command &command){ if (par.taxonomyPath == "DBDIR/taxonomy/") par.taxonomyPath = dbDir + "/taxonomy/"; if (par.libraryPath == "DBDIR/library/") par.libraryPath = dbDir + "/library/"; -// string libraryPath = dbDir + "/library"; // If the library directory does not exist, create it - if (FileUtil::directoryExists(par.libraryPath.c_str()) == false) { + if (!FileUtil::directoryExists(par.libraryPath.c_str())) { FileUtil::makeDir(par.libraryPath.c_str()); } @@ -40,7 +38,6 @@ int addToLibrary(int argc, const char **argv, const Command &command){ string merged = par.taxonomyPath + "/merged.dmp"; NcbiTaxonomy ncbiTaxonomy(names, nodes, merged); - // Load file names ifstream fileListFile; fileListFile.open(fileList); @@ -72,38 +69,23 @@ int addToLibrary(int argc, const char **argv, const Command &command){ } cout << "done" << endl; - vector sequences; - vector unmapped; - // Process each file - size_t numberOfFiles = fileNames.size(); - for (size_t i = 0; i < numberOfFiles; ++i) { - sequences.clear(); - string fileName = fileNames[i]; - - // Getting start and end position of each sequence - IndexCreator::getSeqSegmentsWithHead(sequences, fileName.c_str()); - - // Mmap the file - struct MmapedData seqFile = mmapData(fileName.c_str()); - kseq_buffer_t buffer; - kseq_t *seq; - for (size_t j = 0; j < sequences.size(); ++j) { - buffer = {const_cast(&seqFile.data[sequences[j].start]), - static_cast(sequences[j].length)}; - seq = kseq_init(&buffer); - kseq_read(seq); + // Process each file + vector unmapped; + for (size_t i = 0; i < fileNames.size(); ++i) { + KSeqWrapper* kseq = KSeqFactory(fileNames[i].c_str()); + while (kseq->ReadEntry()) { + const KSeqWrapper::KSeqEntry & e = kseq->entry; // Extract accession and Remove the version number - string accession = string(seq->name.s); + string accession = string(e.name.s); size_t pos = accession.find('.'); if (pos != string::npos) { accession = accession.substr(0, pos); } // Skip if accession is not in the mapping file if (acc2taxid.find(accession) == acc2taxid.end()) { - cout << "During processing " << fileName << ", accession " << accession << + cout << "During processing " << fileNames[i] << ", accession " << accession << " is not found in the mapping file. It is skipped." << endl; - kseq_destroy(seq); unmapped.push_back(accession); continue; } @@ -111,27 +93,26 @@ int addToLibrary(int argc, const char **argv, const Command &command){ // Get species taxID int speciesTaxID = ncbiTaxonomy.getTaxIdAtRank(acc2taxid[accession], "species"); + // Skip if species taxID is not found if (speciesTaxID == 0) { - cout << "During processing " << fileName << ", accession " << accession << + cout << "During processing " << fileNames[i] << ", accession " << accession << " is not matched to any species. It is skipped." 
<< endl; - kseq_destroy(seq); continue; } - // Write to file + // Write each sequence to file with species taxID as file name FILE *file = fopen((dbDir + "/library/" + to_string(speciesTaxID) + ".fna").c_str(), "a"); - fprintf(file, ">%s %s\n", seq->name.s, seq->comment.s); - fprintf(file, "%s\n", seq->seq.s); + fprintf(file, ">%s %s\n", e.name.s, e.comment.s); + fprintf(file, "%s\n", e.sequence.s); fclose(file); - - kseq_destroy(seq); } - munmap(seqFile.data, seqFile.fileSize + 1); + delete kseq; } + // Write unmapped accession to file FILE *file = fopen((dbDir + "/unmapped.txt").c_str(), "w"); - for (size_t i = 0; i < unmapped.size(); ++i) { - fprintf(file, "%s\n", unmapped[i].c_str()); + for (const auto & i : unmapped) { + fprintf(file, "%s\n", i.c_str()); } fclose(file); } @@ -156,33 +137,20 @@ int addToLibrary(int argc, const char **argv, const Command &command){ cerr << "Cannot open the mapping from assembly accession to tax ID" << endl; } - vector sequences; + // Process each file vector unmapped; regex regex1("(GC[AF]_[0-9]*\\.[0-9]*)"); - // Process each file - size_t numberOfFiles = fileNames.size(); - for (size_t i = 0; i < numberOfFiles; ++i) { - sequences.clear(); - string fileName = fileNames[i]; - - // Getting start and end position of each sequence - IndexCreator::getSeqSegmentsWithHead(sequences, fileName.c_str()); - - // Mmap the file - struct MmapedData seqFile = mmapData(fileName.c_str()); - kseq_buffer_t buffer; - kseq_t *seq; - + for (size_t i = 0; i < fileNames.size(); ++i) { // Get assembly accession from file name using regex and remove the version number smatch match; - regex_search(fileName, match, regex1); + regex_search(fileNames[i], match, regex1); string assemblyID = match[0]; size_t pos = assemblyID.find('.'); if (pos != string::npos) { assemblyID = assemblyID.substr(0, pos); } // Skip if current assembly accession is not in the mapping file if (assembly2taxid.find(assemblyID) == assembly2taxid.end()) { - cout << "During processing " << fileName << ", accession " << assemblyID << + cout << "During processing " << fileNames[i] << ", accession " << assemblyID << " is not found in the mapping file. It is skipped." << endl; unmapped.push_back(assemblyID); continue; @@ -191,35 +159,27 @@ int addToLibrary(int argc, const char **argv, const Command &command){ // Get species taxID int speciesTaxID = ncbiTaxonomy.getTaxIdAtRank(assembly2taxid[assemblyID], "species"); if (speciesTaxID == 0) { - cout << "During processing " << fileName << ", accession " << assemblyID << + cout << "During processing " << fileNames[i] << ", accession " << assemblyID << " is not matched to any species. It is skipped." 
<< endl; continue; } - for (size_t j = 0; j < sequences.size(); ++j) { - buffer = {const_cast(&seqFile.data[sequences[j].start]), - static_cast(sequences[j].length)}; - seq = kseq_init(&buffer); - kseq_read(seq); - - // Extract accession - string accession = string(seq->name.s); - acc2taxid[accession] = assembly2taxid[assemblyID]; - + KSeqWrapper* kseq = KSeqFactory(fileNames[i].c_str()); + while (kseq->ReadEntry()){ + const KSeqWrapper::KSeqEntry & e = kseq->entry; // Write to file -// FILE *file = fopen((dbDir + "/library/" + to_string(speciesTaxID) + ".fna").c_str(), "a"); -// fprintf(file, ">%s %s\n", seq->name.s, seq->comment.s); -// fprintf(file, "%s\n", seq->seq.s); -// fclose(file); - - kseq_destroy(seq); + FILE *file = fopen((dbDir + "/library/" + to_string(speciesTaxID) + ".fna").c_str(), "a"); + fprintf(file, ">%s %s\n", e.name.s, e.comment.s); + fprintf(file, "%s\n", e.sequence.s); + fclose(file); } - munmap(seqFile.data, seqFile.fileSize + 1); + delete kseq; } + // Write unmapped accession to file FILE *file = fopen((dbDir + "/unmapped.txt").c_str(), "w"); - for (size_t i = 0; i < unmapped.size(); ++i) { - fprintf(file, "%s\n", unmapped[i].c_str()); + for (const auto & i : unmapped) { + fprintf(file, "%s\n", i.c_str()); } fclose(file); @@ -227,12 +187,12 @@ int addToLibrary(int argc, const char **argv, const Command &command){ cout << "Write mapping from accession to taxonomy ID" << endl; file = fopen((dbDir + "/my.accession2taxid").c_str(), "w"); fprintf(file, "accession\taccession.version\ttaxid\tgi"); - for (auto it = acc2taxid.begin(); it != acc2taxid.end(); ++it) { + for (auto & it : acc2taxid) { // Get accession without a version number - string accession = it->first; + string accession = it.first; size_t pos = accession.find('.'); if (pos != string::npos) { accession = accession.substr(0, pos);} - fprintf(file, "\n%s\t%s\t%d\t0", accession.c_str(), it->first.c_str(), it->second); + fprintf(file, "\n%s\t%s\t%d\t0", accession.c_str(), it.first.c_str(), it.second); } fclose(file); } From 34194627bcf18e2a9064d0b4cd517a7e14bf57bf Mon Sep 17 00:00:00 2001 From: JaebeomKim0731 Date: Fri, 11 Aug 2023 15:58:05 +0900 Subject: [PATCH 04/65] some code clean --- src/commons/IndexCreator.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp index 37a9a29e..be7c0f77 100644 --- a/src/commons/IndexCreator.cpp +++ b/src/commons/IndexCreator.cpp @@ -63,12 +63,6 @@ void IndexCreator::createIndex(const LocalParameters &par) { makeBlocksForParallelProcessing(); cout << "Made blocks for each thread" << endl; - // Train Prodigal for each species -// time_t prodigalStart = time(nullptr); -// trainProdigal(); -// time_t prodigalEnd = time(nullptr); -// cout << "Prodigal training time: " << prodigalEnd - prodigalStart << " seconds" << endl; - // Write taxonomy id list string taxidListFileName = dbDir + "/taxID_list"; FILE * taxidListFile = fopen(taxidListFileName.c_str(), "w"); From 4aef8ae79916c43ac05095417b454ab794efc187 Mon Sep 17 00:00:00 2001 From: JaebeomKim0731 Date: Tue, 15 Aug 2023 16:37:09 +0900 Subject: [PATCH 05/65] Assign a proper mode to each command --- src/metabuli.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/metabuli.cpp b/src/metabuli.cpp index e972c35f..f41040d1 100644 --- a/src/metabuli.cpp +++ b/src/metabuli.cpp @@ -48,7 +48,7 @@ std::vector commands = { " ", CITATION_SPACEPHARER, {{"Directory where the DB will be generated", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, 
&DbValidator::directory}}}, - {"updateDB", build, &localPar.build, COMMAND_MAIN, + {"updateDB", build, &localPar.build, COMMAND_DATABASE_CREATION, "Update database based on the list of FASTA files.", NULL, "Jaebeom Kim ", @@ -57,7 +57,7 @@ std::vector commands = { {{"DB directory to be updated", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::empty}, {"A list of FASTA files", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}, {"Mapping file (accession to tax ID)", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}, - {"classify", classify, &localPar.classify, COMMAND_MAIN, + {"classify", classify, &localPar.classify, COMMAND_TAXONOMY, "Assigning taxonomy label to query reads", NULL, "Jaebeom Kim ", @@ -85,7 +85,7 @@ std::vector commands = { CITATION_SPACEPHARER, {{"read-classification", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}, {"Mapping file (accession to tax ID)", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}, - {"add-to-library", addToLibrary, &localPar.addToLibrary, COMMAND_MAIN, + {"add-to-library", addToLibrary, &localPar.addToLibrary, COMMAND_DATABASE_CREATION, "It bins sequences into distinct files according to their species referring their accession number.\n " "It requires a mapping file (accession to tax ID) and NCBI style tax dump files in a taxonomy directory.", NULL, From 4bb97449474dbd1db28d6fffb541b5f1305c0e7b Mon Sep 17 00:00:00 2001 From: JaebeomKim0731 Date: Fri, 18 Aug 2023 11:21:36 +0900 Subject: [PATCH 06/65] Classifier --> QueryIndexer, KmerExtractor, KmerMatcher, Taxonomer, Reporter --- src/LocalCommandDeclarations.h | 7 +- src/commons/CMakeLists.txt | 17 +- src/commons/Classifier.cpp | 2132 +---------------- src/commons/Classifier.h | 356 +-- src/commons/IndexCreator.h | 8 +- src/commons/KmerExtractor.cpp | 216 ++ src/commons/KmerExtractor.h | 56 + src/commons/KmerMatcher.cpp | 466 ++++ src/commons/KmerMatcher.h | 196 ++ src/commons/LocalUtil.cpp | 40 + src/commons/LocalUtil.h | 23 + src/commons/Mmap.h | 4 - src/commons/QueryFilter.cpp | 52 + src/commons/QueryFilter.h | 23 + src/commons/QueryIndexer.cpp | 102 + src/commons/QueryIndexer.h | 67 + src/commons/ReducedClassifier.cpp | 11 - ...ducedClassifier.h => ReducedKmerMatcher.h} | 25 +- src/commons/Reporter.cpp | 94 + src/commons/Reporter.h | 42 + src/commons/Taxonomer.cpp | 1164 +++++++++ src/commons/Taxonomer.h | 133 + src/commons/common.h | 26 + src/metabuli.cpp | 37 +- src/workflow/CMakeLists.txt | 1 + src/workflow/classify.cpp | 11 +- src/workflow/filter.cpp | 57 + 27 files changed, 2891 insertions(+), 2475 deletions(-) create mode 100644 src/commons/KmerExtractor.cpp create mode 100644 src/commons/KmerExtractor.h create mode 100644 src/commons/KmerMatcher.cpp create mode 100644 src/commons/KmerMatcher.h create mode 100644 src/commons/LocalUtil.cpp create mode 100644 src/commons/LocalUtil.h create mode 100644 src/commons/QueryFilter.cpp create mode 100644 src/commons/QueryFilter.h create mode 100644 src/commons/QueryIndexer.cpp create mode 100644 src/commons/QueryIndexer.h delete mode 100644 src/commons/ReducedClassifier.cpp rename src/commons/{ReducedClassifier.h => ReducedKmerMatcher.h} (80%) create mode 100644 src/commons/Reporter.cpp create mode 100644 src/commons/Reporter.h create mode 100644 src/commons/Taxonomer.cpp create mode 100644 src/commons/Taxonomer.h create mode 100644 src/workflow/filter.cpp diff --git a/src/LocalCommandDeclarations.h b/src/LocalCommandDeclarations.h index 
332b5ecc..000aa648 100644 --- a/src/LocalCommandDeclarations.h +++ b/src/LocalCommandDeclarations.h @@ -1,15 +1,11 @@ -// -// Created by KJB on 25/09/2020. -// - #ifndef ADCLASSIFIER2_LOCALCOMMANDDECLARATIONS_H #define ADCLASSIFIER2_LOCALCOMMANDDECLARATIONS_H #include "Command.h" -//extern int download_databases(int argc, const char **argv, const Command& command); extern int build(int argc, const char **argv, const Command& command); extern int updataDB(int argc, const char **argv, const Command& command); extern int classify(int argc, const char **argv, const Command& command); +extern int filter(int argc, const char **argv, const Command& command); extern int grade(int argc, const char **argv, const Command& command); extern int seqHeader2TaxId(int argc, const char **argv, const Command& command); extern int addToLibrary(int argc, const char **argv, const Command& command); @@ -17,4 +13,5 @@ extern int applyThreshold(int argc, const char **argv, const Command& command); extern int binning2report(int argc, const char **argv, const Command& command); extern int filterByGenus(int argc, const char **argv, const Command& command); extern int databaseReport(int argc, const char **argv, const Command& command); + #endif //ADCLASSIFIER2_LOCALCOMMANDDECLARATIONS_H diff --git a/src/commons/CMakeLists.txt b/src/commons/CMakeLists.txt index 995f8a2b..39d5498c 100644 --- a/src/commons/CMakeLists.txt +++ b/src/commons/CMakeLists.txt @@ -18,8 +18,21 @@ set(commons_source_files commons/LocalParameters.h commons/ProdigalWrapper.h commons/ProdigalWrapper.cpp - commons/ReducedClassifier.cpp - commons/ReducedClassifier.h commons/Match.h commons/common.cpp + commons/QueryFilter.h + commons/QueryFilter.cpp + commons/LocalUtil.h + commons/LocalUtil.cpp + commons/QueryIndexer.h + commons/QueryIndexer.cpp + commons/KmerMatcher.h + commons/KmerMatcher.cpp + commons/ReducedKmerMatcher.h + commons/KmerExtractor.h + commons/KmerExtractor.cpp + commons/Taxonomer.h + commons/Taxonomer.cpp + commons/Reporter.h + commons/Reporter.cpp PARENT_SCOPE) diff --git a/src/commons/Classifier.cpp b/src/commons/Classifier.cpp index 810aa7e6..eebbc63f 100644 --- a/src/commons/Classifier.cpp +++ b/src/commons/Classifier.cpp @@ -1,199 +1,46 @@ #include "Classifier.h" #include "LocalParameters.h" -//#include "krona_prelude.html.h" #include "taxonomyreport.cpp" -#include -Classifier::Classifier(LocalParameters & par) : maskMode(par.maskMode), maskProb(par.maskProb) { +Classifier::Classifier(LocalParameters & par) { // Load parameters - if (par.seqMode == 2){ - queryPath_1 = par.filenames[0]; - queryPath_2 = par.filenames[1]; - dbDir = par.filenames[2]; - outDir = par.filenames[3]; - jobId = par.filenames[4]; - cout << "Query file 1: " << queryPath_1 << endl; - cout << "Query file 2: " << queryPath_2 << endl; - cout << "Database directory: " << dbDir << endl; - cout << "Output directory: " << outDir << endl; - cout << "Job ID: " << jobId << endl; - } else { - queryPath_1 = par.filenames[0]; - dbDir = par.filenames[1]; - outDir = par.filenames[2]; - jobId = par.filenames[3]; - cout << "Query file: " << queryPath_1 << endl; - cout << "Database directory: " << dbDir << endl; - cout << "Output directory: " << outDir << endl; - cout << "Job ID: " << jobId << endl; - } - if (par.taxonomyPath == "DBDIR/taxonomy/") par.taxonomyPath = dbDir + "/taxonomy/"; - - MARKER = 16777215; - MARKER = ~ MARKER; - bitsForCodon = 3; - numOfSplit = 0; - minCoveredPos = par.minCoveredPos; - minSpScore = par.minSpScore; - verbosity = par.verbosity; - maxGap = 
par.maxGap; - - // Mask for spaced k-mer - size_t maskLen = par.spaceMask.length(); - mask = new uint32_t[maskLen]; - spaceNum = 0; - spaceNum_int = 0; - for(size_t i = 0, j = 0; i < maskLen; i++){ - mask[i] = par.spaceMask[i] - 48; - spaceNum += (mask[i] == 0); - spaceNum_int += (mask[i] == 0); - if(mask[i]==1){ - unmaskedPos[j] = (int) i; - j++; - } - } - - // Hamming Dist. margin - hammingMargin = (uint8_t) par.hammingMargin; + dbDir = par.filenames[1 + (par.seqMode == 2)]; + matchPerKmer = par.matchPerKmer; // Taxonomy - const string names = par.taxonomyPath + "/names.dmp"; - const string nodes = par.taxonomyPath + "/nodes.dmp"; - const string merged = par.taxonomyPath + "/merged.dmp"; - taxonomy = new NcbiTaxonomy(names, nodes, merged); - - // Taxonomy ID list - // Load the taxonomical ID list - FILE * taxIdFile; - if((taxIdFile = fopen((dbDir + "/taxID_list").c_str(),"r")) == NULL){ - cout<<"Cannot open the taxID list file."<taxonNode(taxId); - if (taxId == taxon->taxId) { - TaxID speciesTaxID = taxonomy->getTaxIdAtRank(taxId, "species"); - TaxID genusTaxID = taxonomy->getTaxIdAtRank(taxId, "genus"); - while (taxon->taxId != speciesTaxID) { - taxId2speciesId[taxon->taxId] = speciesTaxID; - taxId2genusId[taxon->taxId] = genusTaxID; - taxon = taxonomy->taxonNode(taxon->parentTaxId); - } - taxId2speciesId[speciesTaxID] = speciesTaxID; - taxId2genusId[speciesTaxID] = genusTaxID; - } else { - TaxID speciesTaxID = taxonomy->getTaxIdAtRank(taxId, "species"); - TaxID genusTaxID = taxonomy->getTaxIdAtRank(taxId, "genus"); - while (taxon->taxId != speciesTaxID) { - taxId2speciesId[taxon->taxId] = speciesTaxID; - taxId2genusId[taxon->taxId] = genusTaxID; - taxon = taxonomy->taxonNode(taxon->parentTaxId); - } - taxId2speciesId[speciesTaxID] = speciesTaxID; - taxId2genusId[speciesTaxID] = genusTaxID; - taxId2speciesId[taxId] = speciesTaxID; - taxId2genusId[taxId] = genusTaxID; - } + if (par.taxonomyPath == "DBDIR/taxonomy/") par.taxonomyPath = dbDir + "/taxonomy/"; + taxonomy = new NcbiTaxonomy(par.taxonomyPath + "/names.dmp", + par.taxonomyPath + "/nodes.dmp", + par.taxonomyPath + "/merged.dmp"); + + // Agents + queryIndexer = new QueryIndexer(par); + kmerExtractor = new KmerExtractor(par); + if (par.reducedAA) { + kmerMatcher = new ReducedKmerMatcher(par, taxonomy); + } else { + kmerMatcher = new KmerMatcher(par, taxonomy); } - fclose(taxIdFile); - - subMat = new NucleotideMatrix(par.scoringMatrixFile.values.nucleotide().c_str(), 1.0, 0.0); - probMatrix = new ProbabilityMatrix(*(subMat)); -// localIndexBufferSize = 16 * 1024 * 1024; -// localMatchBufferSize = 2 * 1024 * 1024; + taxonomer = new Taxonomer(par, taxonomy); + reporter = new Reporter(par, taxonomy); } Classifier::~Classifier() { - delete[] mask; - delete subMat; - delete probMatrix; delete taxonomy; -} - -static inline bool compareForLinearSearch(const QueryKmer &a, const QueryKmer &b) { - if (a.ADkmer < b.ADkmer) { - return true; - } else if (a.ADkmer == b.ADkmer) { - return (a.info.sequenceID < b.info.sequenceID); - } - return false; + delete queryIndexer; + delete kmerExtractor; + delete kmerMatcher; + delete taxonomer; + delete reporter; } void Classifier::startClassify(const LocalParameters &par) { - // Calculate maximum number of k-mers for each iteration. 
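// Illustrative arithmetic for the RAM budget computed below (the numbers are
// hypothetical, not defaults): each query k-mer costs
// sizeof(QueryKmer) + matchPerKmer * sizeof(Match) bytes, and every thread
// sets aside 134217728 bytes (128 MiB) of working memory. With --max-ram 32
// and 16 threads, the usable pool is 32 GiB - 16 * 128 MiB = 30 GiB, and
// reads are packed into one split until c * kmerCnt + 200 * seqCnt (about
// 200 bytes of bookkeeping per read) would exceed that pool.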
- size_t matchPerKmer = par.matchPerKmer; - size_t c = sizeof(QueryKmer) + matchPerKmer * sizeof(Match); - size_t ram_threads = ((size_t) par.ramUsage * (size_t) 1024 * 1024 * 1024) - - ((size_t) 134217728 * (size_t) par.threads); - - - // Load query file cout << "Indexing query file ..."; - vector sequences_read1; - vector sequences_read2; - size_t numOfSeq = 0; - size_t start = 0; - size_t kmerCnt = 0; - size_t currentKmerCnt = 0; - size_t seqCnt = 0; - vector splitKmerCnt; - vector> queryReadSplit; - size_t totalReadLength = 0; - if (par.seqMode == 1 || par.seqMode == 3) { - splitQueryFile(sequences_read1, queryPath_1); - - // Make query read splits - numOfSeq = sequences_read1.size(); - for (size_t i = 0; i < numOfSeq; i++) { - currentKmerCnt = getQueryKmerNumber(sequences_read1[i].seqLength); - kmerCnt += currentKmerCnt; - seqCnt++; - if (c * kmerCnt + ((size_t) 200 * seqCnt) > ram_threads) { - splitKmerCnt.push_back(kmerCnt - currentKmerCnt); - queryReadSplit.emplace_back(start, i); - kmerCnt = currentKmerCnt; - start = i; - seqCnt = 1; - } - totalReadLength += sequences_read1[i].seqLength; - } - queryReadSplit.emplace_back(start, numOfSeq); - splitKmerCnt.push_back(kmerCnt); - } else { - splitQueryFile(sequences_read1, queryPath_1); - splitQueryFile(sequences_read2, queryPath_2); - - // Check if the number of reads in the two files are equal - if (sequences_read1.size() != sequences_read2.size()) { - Debug(Debug::ERROR) << "The number of reads in the two files are not equal." << "\n"; - EXIT(EXIT_FAILURE); - } - - // Make query read splits - numOfSeq = sequences_read1.size(); - for (size_t i = 0; i < numOfSeq; i++) { - totalReadLength += sequences_read1[i].seqLength + sequences_read2[i].seqLength; - currentKmerCnt = getQueryKmerNumber(sequences_read1[i].seqLength) + - getQueryKmerNumber(sequences_read2[i].seqLength); - kmerCnt += currentKmerCnt; - seqCnt ++; - if (c * kmerCnt + ((size_t) 200 * seqCnt) > ram_threads) { - splitKmerCnt.push_back(kmerCnt - currentKmerCnt); - queryReadSplit.emplace_back(start, i); - kmerCnt = currentKmerCnt; - start = i; - seqCnt = 1; - } - } - queryReadSplit.emplace_back(start, numOfSeq); - splitKmerCnt.push_back(kmerCnt); - } - + queryIndexer->indexQueryFile(); + size_t numOfSeq = queryIndexer->getReadNum_1(); + size_t totalReadLength = queryIndexer->getTotalReadLength(); + const vector & queryReadSplit = queryIndexer->getQuerySplits(); cout << "Done" << endl; cout << "Total number of sequences: " << numOfSeq << endl; cout << "Total read length: " << totalReadLength << "nt" << endl; @@ -206,63 +53,50 @@ void Classifier::startClassify(const LocalParameters &par) { size_t totalMatchCnt = 0; size_t processedSeqCnt = 0; - ofstream readClassificationFile; - readClassificationFile.open(outDir + "/" + jobId + "_classifications.tsv"); + reporter->openReadClassificationFile(); #ifdef OPENMP omp_set_num_threads(par.threads); #endif // Extract k-mers from query sequences and compare them to target k-mer DB - double vm, rss; KSeqWrapper* kseq1 = KSeqFactory(par.filenames[0].c_str()); KSeqWrapper* kseq2 = nullptr; if (par.seqMode == 2) { kseq2 = KSeqFactory(par.filenames[1].c_str()); } +// while (true) { +// bool success = false; +// while (!success) { +// +// } +// if (complete) { +// break; +// } +// } for (size_t splitIdx = 0; splitIdx < queryReadSplit.size(); splitIdx++) { // Allocate memory for query list queryList.clear(); - queryList.resize(queryReadSplit[splitIdx].second - queryReadSplit[splitIdx].first); + queryList.resize(queryReadSplit[splitIdx].end - 
queryReadSplit[splitIdx].start); // Allocate memory for query k-mer list and match list - kmerBuffer.reallocateMemory(splitKmerCnt[splitIdx]); - if (splitKmerCnt.size() == 1) { - size_t remain = ram_threads - splitKmerCnt[splitIdx] * sizeof(QueryKmer) - numOfSeq * 200; - matchPerKmer = remain / (sizeof(Match) * splitKmerCnt[splitIdx]); - matchBuffer.reallocateMemory(splitKmerCnt[splitIdx] * matchPerKmer); + kmerBuffer.reallocateMemory(queryReadSplit[splitIdx].kmerCnt); + if (queryReadSplit.size() == 1) { + size_t remain = queryIndexer->getAvailableRam() - queryReadSplit[splitIdx].kmerCnt * sizeof(QueryKmer) - numOfSeq * 200; + matchBuffer.reallocateMemory(remain / sizeof(Match)); } else { - matchBuffer.reallocateMemory(splitKmerCnt[splitIdx] * matchPerKmer); + matchBuffer.reallocateMemory(queryReadSplit[splitIdx].kmerCnt * matchPerKmer); } - // Initialize query k-mer buffer and match buffer kmerBuffer.startIndexOfReserve = 0; matchBuffer.startIndexOfReserve = 0; // Extract query k-mer - time_t beforeKmerExtraction = time(nullptr); - cout << "Extracting query metamers ... " << endl; - if (par.seqMode == 1 || par.seqMode == 3) { // Single-end short-read sequence or long-read sequence - fillQueryKmerBufferParallel(kseq1, - kmerBuffer, - queryList, - queryReadSplit[splitIdx], - par); - } else if (par.seqMode == 2) { - fillQueryKmerBufferParallel_paired(kseq1, - kseq2, - kmerBuffer, - queryList, - queryReadSplit[splitIdx], - par); - } - + kmerExtractor->extractQueryKmers(kmerBuffer, + queryList, + queryReadSplit[splitIdx], + par, + kseq1, + kseq2); numOfTatalQueryKmerCnt += kmerBuffer.startIndexOfReserve; - cout << "Time spent for metamer extraction: " << double(time(nullptr) - beforeKmerExtraction) << endl; - - // Sort query k-mer - time_t beforeQueryKmerSort = time(nullptr); - cout << "Sorting query metamer list ..." << endl; - SORT_PARALLEL(kmerBuffer.buffer, kmerBuffer.buffer + kmerBuffer.startIndexOfReserve, compareForLinearSearch); - cout << "Time spent for sorting query metamer list: " << double(time(nullptr) - beforeQueryKmerSort) << endl; //#ifdef OPENMP // if (par.printLog == 1) { @@ -271,26 +105,9 @@ void Classifier::startClassify(const LocalParameters &par) { // omp_set_num_threads(par.threads); // } //#endif - // Search matches between query and target k-mers - linearSearchParallel(kmerBuffer.buffer, kmerBuffer.startIndexOfReserve, matchBuffer, par); - -#ifdef OPENMP - omp_set_num_threads(par.threads); -#endif - // Sort matches - time_t beforeSortMatches = time(nullptr); - totalMatchCnt += matchBuffer.startIndexOfReserve; - cout << "Sorting matches ..." 
<< endl; - SORT_PARALLEL(matchBuffer.buffer, matchBuffer.buffer + matchBuffer.startIndexOfReserve, - sortMatch()); - cout << "Time spent for sorting matches: " << double(time(nullptr) - beforeSortMatches) << endl; -// for (size_t i = 0; i < matchBuffer.startIndexOfReserve; i++) { -// cout << matchBuffer.buffer[i].queryId << " " << matchBuffer.buffer[i].splitIdx << " " << -// matchBuffer.buffer[i].targetSplitIdx << " " << matchBuffer.buffer[i].targetId << " " << -// genusTaxIdList[matchBuffer.buffer[i].targetId] << " " << speciesTaxIdList[matchBuffer.buffer[i].targetId] << " " -// << matchBuffer.buffer[i].position << " " << (int) matchBuffer.buffer[i].hamming << " " << taxIdList[matchBuffer.buffer[i].targetId] << endl; -// } + // Search matches between query and target k-mers + kmerMatcher->matchKmers(&kmerBuffer, &matchBuffer); //#ifdef OPENMP @@ -302,25 +119,27 @@ void Classifier::startClassify(const LocalParameters &par) { //#endif // Classify queries based on the matches - time_t beforeAnalyze = time(nullptr); - cout << "Analyzing matches ..." << endl; - fromMatchToClassification(matchBuffer.buffer, matchBuffer.startIndexOfReserve, queryList, par); - cout << "Time spent for analyzing: " << double(time(nullptr) - beforeAnalyze) << endl; - processedSeqCnt += queryReadSplit[splitIdx].second - queryReadSplit[splitIdx].first; + taxonomer->assignTaxonomy(matchBuffer.buffer, matchBuffer.startIndexOfReserve, queryList, par); + processedSeqCnt += queryReadSplit[splitIdx].end - queryReadSplit[splitIdx].start; cout << "The number of processed sequences: " << processedSeqCnt << " (" << (double) processedSeqCnt / (double) numOfSeq << ")" << endl; +// for (size_t i = 0; i < matchBuffer.startIndexOfReserve; i++) { +// cout << matchBuffer.buffer[i].queryId << " " << matchBuffer.buffer[i].splitIdx << " " << +// matchBuffer.buffer[i].targetSplitIdx << " " << matchBuffer.buffer[i].targetId << " " << +// genusTaxIdList[matchBuffer.buffer[i].targetId] << " " << speciesTaxIdList[matchBuffer.buffer[i].targetId] << " " +// << matchBuffer.buffer[i].position << " " << (int) matchBuffer.buffer[i].hamming << " " << taxIdList[matchBuffer.buffer[i].targetId] << endl; +// } + // Write classification results - writeReadClassification(queryList, - (int) (queryReadSplit[splitIdx].second - queryReadSplit[splitIdx].first), - readClassificationFile); + reporter->writeReadClassification(queryList); } cout << "Number of query k-mers: " << numOfTatalQueryKmerCnt << endl; cout << "The number of matches: " << totalMatchCnt << endl; - readClassificationFile.close(); + reporter->closeReadClassificationFile(); // Write report files - writeReportFile(outDir, numOfSeq, taxCounts); + reporter->writeReportFile(numOfSeq, taxonomer->getTaxCounts()); // Memory deallocation free(matchBuffer.buffer); @@ -328,1830 +147,3 @@ void Classifier::startClassify(const LocalParameters &par) { delete kseq2; } - -void Classifier::fillQueryKmerBufferParallel(KSeqWrapper* kseq1, - QueryKmerBuffer &kmerBuffer, - vector & queryList, - const pair & currentSplit, - const LocalParameters &par) { - size_t queryNum = currentSplit.second - currentSplit.first; - size_t processedQueryNum = 0; - - // Array to store reads of thread number - vector reads1(par.threads); - - while (processedQueryNum < queryNum) { - size_t currentQueryNum = min(queryNum - processedQueryNum, (size_t) par.threads); - size_t count = 0; - while (count < currentQueryNum) { - // Read query - kseq1->ReadEntry(); - const KSeqWrapper::KSeqEntry & e1 = kseq1->entry; - - // Get k-mer count - int 
kmerCnt = getQueryKmerNumber((int) e1.sequence.l); - - // Query Info - queryList[processedQueryNum].queryLength = getMaxCoveredLength((int) e1.sequence.l); - queryList[processedQueryNum].name = string(e1.name.s); - queryList[processedQueryNum].kmerCnt = (int) (kmerCnt); - - // Store reads - reads1[count] = string(kseq1->entry.sequence.s); - - processedQueryNum ++; - count ++; - } -#pragma omp parallel default(none), shared(par, kmerBuffer, cout, processedQueryNum, queryList, currentQueryNum, currentSplit, count, reads1) - { - SeqIterator seqIterator(par); - size_t posToWrite; -#pragma omp for schedule(dynamic, 1) - for (size_t i = 0; i < currentQueryNum; i ++) { - size_t queryIdx = processedQueryNum - currentQueryNum + i; - // Get k-mer count - auto kmerCnt = getQueryKmerNumber(reads1[i].length()); - - // Ignore short read - if (kmerCnt < 1) { continue; } - - // Get masked sequence - char *maskedSeq1 = nullptr; - if (maskMode) { - maskedSeq1 = new char[reads1[i].length() + 1]; - SeqIterator::maskLowComplexityRegions(reads1[i].c_str(),maskedSeq1, *probMatrix, maskProb, subMat); - } else { - maskedSeq1 = const_cast(reads1[i].c_str()); - } - - posToWrite = kmerBuffer.reserveMemory(kmerCnt); - - // Process Read 1 - seqIterator.sixFrameTranslation(maskedSeq1, (int) reads1[i].length()); - seqIterator.fillQueryKmerBuffer(maskedSeq1, (int) reads1[i].length(), kmerBuffer, posToWrite, - (uint32_t) queryIdx); - - if (maskMode) { - delete[] maskedSeq1; - } - } - } - } -} - - - -int Classifier::getMaxCoveredLength(int queryLength) { - if (queryLength % 3 == 2) { - return queryLength - 2; // 2 - } else if (queryLength % 3 == 1) { - return queryLength - 4; // 4 - } else { - return queryLength - 3; // 3 - } -} - -template -T Classifier::getQueryKmerNumber(T queryLength) { - return (getMaxCoveredLength(queryLength) / 3 - kmerLength - spaceNum_int + 1) * 6; -} - -void Classifier::fillQueryKmerBufferParallel_paired(KSeqWrapper* kseq1, - KSeqWrapper* kseq2, - QueryKmerBuffer &kmerBuffer, - vector & queryList, - const pair & currentSplit, - const LocalParameters &par) { - size_t queryNum = currentSplit.second - currentSplit.first; - size_t processedQueryNum = 0; - - // Array to store reads of thread number - vector reads1(par.threads); - vector reads2(par.threads); - - while (processedQueryNum < queryNum) { - size_t currentQueryNum = min(queryNum - processedQueryNum, (size_t) par.threads); - size_t count = 0; - - // Fill reads in sequential - while (count < currentQueryNum) { - // Read query - kseq1->ReadEntry(); - kseq2->ReadEntry(); - const KSeqWrapper::KSeqEntry & e1 = kseq1->entry; - const KSeqWrapper::KSeqEntry & e2 = kseq2->entry; - - // Get k-mer count - int kmerCnt = getQueryKmerNumber((int) e1.sequence.l); - int kmerCnt2 = getQueryKmerNumber((int) e2.sequence.l); - - // Query Info - queryList[processedQueryNum].queryLength = getMaxCoveredLength((int) e1.sequence.l); - queryList[processedQueryNum].queryLength2 = getMaxCoveredLength((int) e2.sequence.l); - queryList[processedQueryNum].name = string(e1.name.s); - queryList[processedQueryNum].kmerCnt = (int) (kmerCnt + kmerCnt2); - - // Store reads - reads1[count] = string(kseq1->entry.sequence.s); - reads2[count] = string(kseq2->entry.sequence.s); - - processedQueryNum ++; - count ++; - } - - // Process reads in parallel -#pragma omp parallel default(none), shared(par, kmerBuffer, cout, processedQueryNum, queryList, currentQueryNum, currentSplit, count, reads1, reads2) - { - SeqIterator seqIterator(par); - SeqIterator seqIterator2(par); - size_t 
posToWrite; -#pragma omp for schedule(dynamic, 1) - for (size_t i = 0; i < currentQueryNum; i ++) { - size_t queryIdx = processedQueryNum - currentQueryNum + i; - // Get k-mer count - auto kmerCnt = getQueryKmerNumber(reads1[i].length()); - auto kmerCnt2 = getQueryKmerNumber(reads2[i].length()); - - // Ignore short read - if (kmerCnt2 < 1 || kmerCnt < 1) { continue; } - - // Get masked sequence - char *maskedSeq1 = nullptr; - char *maskedSeq2 = nullptr; - if (maskMode) { - maskedSeq1 = new char[reads1[i].length() + 1]; - maskedSeq2 = new char[reads2[i].length() + 1]; - SeqIterator::maskLowComplexityRegions(reads1[i].c_str(),maskedSeq1, *probMatrix, maskProb, subMat); - SeqIterator::maskLowComplexityRegions(reads2[i].c_str(),maskedSeq2, *probMatrix, maskProb, subMat); - } else { - maskedSeq1 = const_cast(reads1[i].c_str()); - maskedSeq2 = const_cast(reads2[i].c_str()); - } - - posToWrite = kmerBuffer.reserveMemory(kmerCnt + kmerCnt2); - - // Process Read 1 - seqIterator.sixFrameTranslation(maskedSeq1, (int) reads1[i].length()); - seqIterator.fillQueryKmerBuffer(maskedSeq1, (int) reads1[i].length(), kmerBuffer, posToWrite, - (uint32_t) queryIdx); - - // Process Read 2 - seqIterator2.sixFrameTranslation(maskedSeq2, (int) reads2[i].length()); - seqIterator2.fillQueryKmerBuffer(maskedSeq2, (int) reads2[i].length(), kmerBuffer, posToWrite, - (uint32_t) queryIdx, queryList[queryIdx].queryLength); - - if (maskMode) { - delete[] maskedSeq1; - delete[] maskedSeq2; - } - } - } - } -} - -void Classifier::linearSearchParallel(QueryKmer *queryKmerList, size_t &queryKmerCnt, - Buffer &matchBuffer, const LocalParameters &par) { - int threadNum = par.threads; - string targetDiffIdxFileName = dbDir + "/diffIdx"; - string targetInfoFileName = dbDir + "/info"; - string diffIdxSplitFileName = dbDir + "/split";; - - struct stat diffIdxFileSt{}; - stat(targetDiffIdxFileName.c_str(), &diffIdxFileSt); - size_t numOfDiffIdx = diffIdxFileSt.st_size / sizeof(uint16_t); - - struct MmapedData diffIdxSplits = mmapData(diffIdxSplitFileName.c_str(), 3); - - cout << "Comparing query and reference metamers..." << endl; - - // Find the first index of garbage query k-mer (UINT64_MAX) and discard from there - for (size_t checkN = queryKmerCnt - 1; checkN > 0; checkN--) { - if (queryKmerList[checkN].ADkmer != UINT64_MAX) { - queryKmerCnt = checkN + 1; - break; - } - } - - // Filter out meaningless target splits - size_t numOfDiffIdxSplits = diffIdxSplits.fileSize / sizeof(DiffIdxSplit); - size_t numOfDiffIdxSplits_use = numOfDiffIdxSplits; - for (size_t i = 1; i < numOfDiffIdxSplits; i++) { - if (diffIdxSplits.data[i].ADkmer == 0 || diffIdxSplits.data[i].ADkmer == UINT64_MAX) { - diffIdxSplits.data[i] = {UINT64_MAX, UINT64_MAX, UINT64_MAX}; - numOfDiffIdxSplits_use--; - } - } - - // Divide query k-mer list into blocks for multi threading. 
- // Each split has start and end points of query list + proper offset point of target k-mer list - vector querySplits; - uint64_t queryAA; - vector targetSplitIdxs; - if (threadNum == 1) { //Single thread - querySplits.emplace_back(0, queryKmerCnt - 1, queryKmerCnt, diffIdxSplits.data[0]); - } else if (threadNum == 2) { //Two threads - size_t splitWidth = queryKmerCnt / 2; - querySplits.emplace_back(0, splitWidth - 1, splitWidth, diffIdxSplits.data[0]); - for (size_t tSplitCnt = 0; tSplitCnt < numOfDiffIdxSplits_use; tSplitCnt++) { - queryAA = AminoAcidPart(queryKmerList[splitWidth].ADkmer); - if (queryAA <= AminoAcidPart(diffIdxSplits.data[tSplitCnt].ADkmer)) { - tSplitCnt = tSplitCnt - (tSplitCnt != 0); - querySplits.emplace_back(splitWidth, queryKmerCnt - 1, queryKmerCnt - splitWidth, - diffIdxSplits.data[tSplitCnt]); - break; - } - } - } else { //More than two threads - // Devide query k-mers into blocks - size_t splitWidth = queryKmerCnt / (threadNum - 1); - querySplits.emplace_back(0, splitWidth - 1, splitWidth, diffIdxSplits.data[0]); - for (int i = 1; i < threadNum; i++) { - queryAA = AminoAcidPart(queryKmerList[splitWidth * i].ADkmer); - bool needLastTargetBlock = true; - for (size_t j = 0; j < numOfDiffIdxSplits_use; j++) { - if (queryAA <= AminoAcidPart(diffIdxSplits.data[j].ADkmer)) { - j = j - (j != 0); - if (i != threadNum - 1) { - querySplits.emplace_back(splitWidth * i, splitWidth * (i + 1) - 1, splitWidth, - diffIdxSplits.data[j]); - } else { - querySplits.emplace_back(splitWidth * i, queryKmerCnt - 1, queryKmerCnt - splitWidth * i, - diffIdxSplits.data[j]); - } - targetSplitIdxs.emplace_back(j); - needLastTargetBlock = false; - break; - } - } - if (needLastTargetBlock) { - if (i != threadNum - 1) { // If it is not the last split - querySplits.emplace_back(splitWidth * i, splitWidth * (i + 1) - 1, splitWidth, - diffIdxSplits.data[numOfDiffIdxSplits_use - 2]); - targetSplitIdxs.emplace_back(numOfDiffIdxSplits_use - 2); - } else { - querySplits.emplace_back(splitWidth * i, queryKmerCnt - 1, queryKmerCnt - splitWidth * i, - diffIdxSplits.data[numOfDiffIdxSplits_use - 2]); - targetSplitIdxs.emplace_back(numOfDiffIdxSplits_use - 2); - } - } - } - } - - bool *splitCheckList = (bool *) malloc(sizeof(bool) * threadNum); - fill_n(splitCheckList, threadNum, false); - int completedSplitCnt = 0; - - time_t beforeSearch = time(nullptr); - - while (completedSplitCnt < threadNum) { - bool hasOverflow = false; -#pragma omp parallel default(none), shared(completedSplitCnt, splitCheckList, hasOverflow, \ -querySplits, queryKmerList, matchBuffer, cout, par, targetDiffIdxFileName, numOfDiffIdx, targetInfoFileName, targetSplitIdxs) - { - // FILE - FILE * diffIdxFp = fopen(targetDiffIdxFileName.c_str(), "rb"); - FILE * kmerInfoFp = fopen(targetInfoFileName.c_str(), "rb"); - - // Target K-mer buffer - uint16_t * diffIdxBuffer = (uint16_t *) malloc(sizeof(uint16_t) * (BufferSize + 1)); // size = 32 Mb - TargetKmerInfo * kmerInfoBuffer = (TargetKmerInfo *) malloc(sizeof(TargetKmerInfo) * (BufferSize+1)); // 64 Mb - size_t kmerInfoBufferIdx = 0; - size_t diffIdxBufferIdx = 0; - - //query variables - uint64_t currentQuery = UINT64_MAX; - uint64_t currentQueryAA = UINT64_MAX; - QueryKmerInfo currentQueryInfo; - - //target variables - size_t diffIdxPos = 0; - vector candidateTargetKmers; //vector for candidate target k-mer, some of which are selected after based on hamming distance - vector candidateKmerInfos; - uint64_t currentTargetKmer; - - //Match buffer for each thread - int localBufferSize = 
2'000'000; // 32 Mb - auto *matches = new Match[localBufferSize]; // 16 * 2'000'000 = 32 Mb - int matchCnt = 0; - - // For debug - SeqIterator seqIterator(par); - - //vectors for selected target k-mers - vector selectedHammingSum; - vector selectedMatches; - vector selectedHammings; - size_t posToWrite; - - int currMatchNum; - size_t idx; -#pragma omp for schedule(dynamic, 1) - for (size_t i = 0; i < querySplits.size(); i++) { - if (hasOverflow || splitCheckList[i]) { - continue; - } - - currentTargetKmer = querySplits[i].diffIdxSplit.ADkmer; - diffIdxBufferIdx = querySplits[i].diffIdxSplit.diffIdxOffset; - kmerInfoBufferIdx = querySplits[i].diffIdxSplit.infoIdxOffset - - (querySplits[i].diffIdxSplit.ADkmer != 0); - diffIdxPos = querySplits[i].diffIdxSplit.diffIdxOffset; - - fseek(kmerInfoFp, 4 * (long)(kmerInfoBufferIdx), SEEK_SET); - loadBuffer(kmerInfoFp, kmerInfoBuffer, kmerInfoBufferIdx, BufferSize); - fseek(diffIdxFp, 2 * (long) (diffIdxBufferIdx), SEEK_SET); - loadBuffer(diffIdxFp, diffIdxBuffer, diffIdxBufferIdx, BufferSize); - - if (i == 0) { - currentTargetKmer = getNextTargetKmer(currentTargetKmer, diffIdxBuffer, - diffIdxBufferIdx, diffIdxPos); - } - currentQuery = UINT64_MAX; - currentQueryAA = UINT64_MAX; - - size_t lastMovedQueryIdx = 0; - for (size_t j = querySplits[i].start; j < querySplits[i].end + 1; j++) { - querySplits[i].start++; - - // Reuse the comparison data if queries are exactly identical - if (currentQuery == queryKmerList[j].ADkmer - && (currentQueryInfo.frame/3 == queryKmerList[j].info.frame/3)) { - currMatchNum = selectedMatches.size(); - // If local buffer is full, copy them to the shared buffer. - if (matchCnt + currMatchNum > localBufferSize) { - // Check if the shared buffer is full. - posToWrite = matchBuffer.reserveMemory(matchCnt); - if (posToWrite + matchCnt >= matchBuffer.bufferSize) { - hasOverflow = true; - querySplits[i].start = lastMovedQueryIdx + 1; - __sync_fetch_and_sub(& matchBuffer.startIndexOfReserve, matchCnt); - break; - } else { // not full -> copy matches to the shared buffer - moveMatches(matchBuffer.buffer + posToWrite, matches, matchCnt); - lastMovedQueryIdx = j; - } - } - for (int k = 0; k < currMatchNum; k++) { - idx = selectedMatches[k]; - matches[matchCnt] = {queryKmerList[j].info, - candidateKmerInfos[idx].sequenceID, - taxId2genusId[candidateKmerInfos[idx].sequenceID], - taxId2speciesId[candidateKmerInfos[idx].sequenceID], - selectedHammings[k], - selectedHammingSum[k], - (bool) candidateKmerInfos[idx].redundancy}; - matchCnt++; - } - continue; - } - selectedMatches.clear(); - selectedHammingSum.clear(); - selectedHammings.clear(); - - // Reuse the candidate target k-mers to compare in DNA level if queries are the same at amino acid level but not at DNA level - if (currentQueryAA == AminoAcidPart(queryKmerList[j].ADkmer)) { - compareDna(queryKmerList[j].ADkmer, candidateTargetKmers, selectedMatches, - selectedHammingSum, selectedHammings,queryKmerList[j].info.frame); - currMatchNum = selectedMatches.size(); - - // If local buffer is full, copy them to the shared buffer. - if (matchCnt + currMatchNum > localBufferSize) { - // Check if the shared buffer is full. 
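// reserveMemory() claims matchCnt slots by atomically advancing
// startIndexOfReserve. If the claimed region would run past bufferSize,
// this thread sets hasOverflow, rewinds its split to the last query whose
// matches were already copied out (lastMovedQueryIdx + 1), and returns the
// unused reservation via __sync_fetch_and_sub so the remaining queries can
// be retried in the next pass of the enclosing while loop.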
- posToWrite = matchBuffer.reserveMemory(matchCnt); - if (posToWrite + matchCnt >= matchBuffer.bufferSize) { - hasOverflow = true; - querySplits[i].start = lastMovedQueryIdx + 1; - __sync_fetch_and_sub(& matchBuffer.startIndexOfReserve, matchCnt); - break; - } else { // not full -> copy matches to the shared buffer - moveMatches(matchBuffer.buffer + posToWrite, matches, matchCnt); - lastMovedQueryIdx = j; - } - } - for (int k = 0; k < currMatchNum; k++) { - idx = selectedMatches[k]; - matches[matchCnt] = {queryKmerList[j].info, - candidateKmerInfos[idx].sequenceID, - taxId2genusId[candidateKmerInfos[idx].sequenceID], - taxId2speciesId[candidateKmerInfos[idx].sequenceID], - selectedHammings[k], - selectedHammingSum[k], - (bool) candidateKmerInfos[idx].redundancy}; - matchCnt++; - } - currentQuery = queryKmerList[j].ADkmer; - currentQueryAA = AminoAcidPart(currentQuery); - currentQueryInfo = queryKmerList[j].info; - continue; - } - candidateTargetKmers.clear(); - candidateKmerInfos.clear(); - - // Get next query, and start to find - currentQuery = queryKmerList[j].ADkmer; - currentQueryAA = AminoAcidPart(currentQuery); - currentQueryInfo = queryKmerList[j].info; - - // Skip target k-mers that are not matched in amino acid level - while (diffIdxPos != numOfDiffIdx - && (currentQueryAA > AminoAcidPart(currentTargetKmer))) { - if (unlikely(BufferSize < diffIdxBufferIdx + 7)){ - loadBuffer(diffIdxFp, diffIdxBuffer, diffIdxBufferIdx, BufferSize, ((int)(BufferSize - diffIdxBufferIdx)) * -1 ); - } - currentTargetKmer = getNextTargetKmer(currentTargetKmer, diffIdxBuffer, - diffIdxBufferIdx, diffIdxPos); - kmerInfoBufferIdx ++; - } - - if (currentQueryAA != AminoAcidPart(currentTargetKmer)) // Move to next query k-mer if there isn't any match. - continue; - - // Load target k-mers that are matched in amino acid level - while (diffIdxPos != numOfDiffIdx && - currentQueryAA == AminoAcidPart(currentTargetKmer)) { - candidateTargetKmers.push_back(currentTargetKmer); - candidateKmerInfos.push_back(getKmerInfo(BufferSize, kmerInfoFp, kmerInfoBuffer, kmerInfoBufferIdx)); - // Print the target k-mer -// if (par.printLog == 1) { -// cout << queryKmerList[j].info.sequenceID << "\t" << queryKmerList[j].info.pos << "\t" -// << (int) queryKmerList[j].info.frame << endl; -// cout << "Query k-mer: "; -// print_binary64(64, currentQuery); -// cout << "\t"; -// seqIterator.printKmerInDNAsequence(currentQuery); -// cout << endl; -// cout << "Target k-mer: "; -// print_binary64(64, currentTargetKmer); -// cout << "\t"; -// seqIterator.printKmerInDNAsequence(currentTargetKmer); -// cout << "\t" << kmerInfoBuffer[kmerInfoBufferIdx].sequenceID -// << "\t" << taxId2speciesId[kmerInfoBuffer[kmerInfoBufferIdx].sequenceID] << endl; -// cout << (int) getHammingDistanceSum(currentQuery, currentTargetKmer) << "\t"; -// print_binary16(16, getHammings(currentQuery, currentTargetKmer)); cout << endl; -// } - - if (unlikely(BufferSize < diffIdxBufferIdx + 7)){ - loadBuffer(diffIdxFp, diffIdxBuffer, diffIdxBufferIdx, - BufferSize, ((int)(BufferSize - diffIdxBufferIdx)) * -1 ); - } - - currentTargetKmer = getNextTargetKmer(currentTargetKmer, diffIdxBuffer, - diffIdxBufferIdx, diffIdxPos); - kmerInfoBufferIdx ++; - } - - // Compare the current query and the loaded target k-mers and select - compareDna(currentQuery, candidateTargetKmers, selectedMatches, selectedHammingSum, - selectedHammings, queryKmerList[j].info.frame); - - // If local buffer is full, copy them to the shared buffer. 
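-                // Matches are accumulated in the thread-local 'matches' array and
-                // flushed to the shared buffer in bulk (a single memcpy inside
-                // moveMatches), so threads contend on the shared cursor once per
-                // flush instead of once per match.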
-                currMatchNum = selectedMatches.size();
-                if (matchCnt + currMatchNum > localBufferSize) {
-                    // Check if the shared buffer is full.
-                    posToWrite = matchBuffer.reserveMemory(matchCnt);
-                    if (posToWrite + matchCnt >= matchBuffer.bufferSize) { // full -> write matches to file first
-                        hasOverflow = true;
-                        querySplits[i].start = lastMovedQueryIdx + 1;
-                        __sync_fetch_and_sub(&matchBuffer.startIndexOfReserve, matchCnt);
-                        break;
-                    } else { // not full -> copy matches to the shared buffer
-                        moveMatches(matchBuffer.buffer + posToWrite, matches, matchCnt);
-                        lastMovedQueryIdx = j;
-                    }
-                }
-
-                for (int k = 0; k < currMatchNum; k++) {
-                    idx = selectedMatches[k];
-                    matches[matchCnt] = {queryKmerList[j].info,
-                                         candidateKmerInfos[idx].sequenceID,
-                                         taxId2genusId[candidateKmerInfos[idx].sequenceID],
-                                         taxId2speciesId[candidateKmerInfos[idx].sequenceID],
-                                         selectedHammings[k],
-                                         selectedHammingSum[k],
-                                         (bool) candidateKmerInfos[idx].redundancy};
-                    matchCnt++;
-                }
-            } // End of one split
-
-            // Move matches in the local buffer to the shared buffer
-            posToWrite = matchBuffer.reserveMemory(matchCnt);
-            if (posToWrite + matchCnt >= matchBuffer.bufferSize) {
-                hasOverflow = true;
-                querySplits[i].start = lastMovedQueryIdx + 1;
-                __sync_fetch_and_sub(&matchBuffer.startIndexOfReserve, matchCnt);
-            } else {
-                moveMatches(matchBuffer.buffer + posToWrite, matches, matchCnt);
-            }
-
-            // Check whether the current split is completed or not
-            if (querySplits[i].start - 1 == querySplits[i].end) {
-                splitCheckList[i] = true;
-                __sync_fetch_and_add(&completedSplitCnt, 1);
-            }
-        } // End of omp for (Iterating for splits)
-        delete[] matches;
-        fclose(diffIdxFp);
-        fclose(kmerInfoFp);
-        free(diffIdxBuffer);
-        free(kmerInfoBuffer);
-    } // End of omp parallel
-    if (hasOverflow) {
-        cout << "overflow!!!" << endl;
-        break;
-    }
-    } // end of while (completedSplitCnt < threadNum)
-    cout << "Time spent for the comparison: " << double(time(nullptr) - beforeSearch) << endl;
-    munmap(diffIdxSplits.data, diffIdxSplits.fileSize + 1);
-    free(splitCheckList);
-    queryKmerCnt = 0;
-}
-
-void Classifier::moveMatches(Match *dest, Match *src, int &matchNum) {
-    memcpy(dest, src, sizeof(Match) * matchNum);
-    matchNum = 0;
-}
-
-// It compares query k-mers to target k-mers.
-// Among the matches of a query, those within 'hammingMargin' of the smallest Hamming distance are selected.
-void Classifier::compareDna(uint64_t query, vector<uint64_t> &targetKmersToCompare,
-                            vector<size_t> &selectedMatches, vector<uint8_t> &selectedHammingSum,
-                            vector<uint16_t> &selectedHammings, uint8_t frame) {
-
-    size_t size = targetKmersToCompare.size();
-    auto *hammingSums = new uint8_t[size + 1];
-    uint8_t currentHammingSum;
-    uint8_t minHammingSum = UINT8_MAX;
-
-    // Calculate the Hamming distance of each candidate
-    for (size_t i = 0; i < size; i++) {
-        currentHammingSum = getHammingDistanceSum(query, targetKmersToCompare[i]);
-        if (currentHammingSum < minHammingSum) {
-            minHammingSum = currentHammingSum;
-        }
-        hammingSums[i] = currentHammingSum;
-    }
-
-    // Select target k-mers that pass the Hamming criterion
-    for (size_t h = 0; h < size; h++) {
-        if (hammingSums[h] <= minHammingSum + hammingMargin) {
-            selectedMatches.push_back(h);
-            selectedHammingSum.push_back(hammingSums[h]);
-            if (frame < 3) {
-                selectedHammings.push_back(getHammings(query, targetKmersToCompare[h]));
-            } else {
-                selectedHammings.push_back(getHammings_reverse(query, targetKmersToCompare[h]));
-            }
-        }
-    }
-    delete[] hammingSums;
-}
-
-// It analyses the result of the linear search.
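-// The match list is sorted by query sequence ID (see sortMatch in Classifier.h),
-// so one linear pass can cut it into per-query blocks that are classified
-// independently. A minimal sketch of the grouping idea, with plain ints standing
-// in for the real types:
-//
-//     std::vector<std::pair<size_t, size_t>> blocks;  // [start, end) per query
-//     for (size_t s = 0, e = 0; s < n; s = e) {
-//         while (e < n && id[e] == id[s]) ++e;        // extend current block
-//         blocks.emplace_back(s, e);
-//     }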
-void Classifier::fromMatchToClassification(const Match *matchList,
-                                           size_t numOfMatches,
-                                           vector<Query> & queryList,
-                                           const LocalParameters &par) {
-
-    // Divide matches into blocks for multithreading
-    size_t seqNum = queryList.size();
-    MatchBlock *matchBlocks = new MatchBlock[seqNum];
-    size_t matchIdx = 0;
-    size_t blockIdx = 0;
-    uint32_t currentQuery;
-    while (matchIdx < numOfMatches) {
-        currentQuery = matchList[matchIdx].qInfo.sequenceID;
-        matchBlocks[blockIdx].id = currentQuery;
-        matchBlocks[blockIdx].start = matchIdx;
-        while ((matchIdx < numOfMatches) && (currentQuery == matchList[matchIdx].qInfo.sequenceID)) ++matchIdx;
-        matchBlocks[blockIdx].end = matchIdx - 1;
-        blockIdx++;
-    }
-
-    // Process each block
-#pragma omp parallel default(none), shared(cout, matchBlocks, matchList, seqNum, queryList, blockIdx, par)
-    {
-#pragma omp for schedule(dynamic, 1)
-        for (size_t i = 0; i < blockIdx; ++i) {
-            chooseBestTaxon(matchBlocks[i].id,
-                            matchBlocks[i].start,
-                            matchBlocks[i].end,
-                            matchList,
-                            queryList,
-                            par);
-        }
-    }
-
-    for (size_t i = 0; i < seqNum; i++) {
-        ++taxCounts[queryList[i].classification];
-    }
-    delete[] matchBlocks;
-}
-
-
-void Classifier::chooseBestTaxon(uint32_t currentQuery,
-                                 size_t offset,
-                                 size_t end,
-                                 const Match *matchList,
-                                 vector<Query> & queryList,
-                                 const LocalParameters &par) {
-    TaxID selectedTaxon;
-//    if (par.printLog) {
-//        cout << "# " << currentQuery << " " << queryList[currentQuery].name << endl;
-//        for (size_t i = offset; i < end + 1; i++) {
-//            cout << taxId2genusId[matchList[i].targetId] << " " << taxId2speciesId[matchList[i].targetId] <<
-//            " " << matchList[i].targetId << " " << matchList[i].qInfo.frame << " ";
-//            print_binary16(16, matchList[i].rightEndHamming);
-//            cout << " " << matchList[i].qInfo.pos << " " << int(matchList[i].hamming) << " " << int(matchList[i].redundancy) << endl;
-//        }
-//    }
-
-    // Get the best genus for the current query
-    vector<Match> genusMatches;
-    genusMatches.reserve(end - offset + 1);
-
-    TaxonScore genusScore(0, 0, 0, 0);
-    if (par.seqMode == 2) {
-        if (par.spaceMask != "11111111"){
-            genusScore = getBestGenusMatches_spaced(genusMatches, matchList, end, offset,
-                                                    queryList[currentQuery].queryLength,
-                                                    queryList[currentQuery].queryLength2);
-        } else {
-            genusScore = getBestGenusMatches(genusMatches, matchList, end, offset,
-                                             queryList[currentQuery].queryLength,
-                                             queryList[currentQuery].queryLength2, par);
-        }
-    } else {
-        if (par.spaceMask != "11111111") {
-            genusScore = getBestGenusMatches_spaced(genusMatches, matchList, end, offset,
-                                                    queryList[currentQuery].queryLength);
-        } else {
-            genusScore = getBestGenusMatches(genusMatches, matchList, end, offset,
-                                             queryList[currentQuery].queryLength, par);
-        }
-    }
-
-//    if (par.printLog) {
-//        cout << "# " << currentQuery << " " << queryList[currentQuery].name << " filtered\n";
-//        for (size_t i = 0; i < genusMatches.size(); i++) {
-//            cout << taxId2genusId[genusMatches[i].targetId] << " " << taxId2speciesId[genusMatches[i].targetId] <<
-//            " " << genusMatches[i].targetId << " " << genusMatches[i].qInfo.frame << " ";
-//            print_binary16(16, genusMatches[i].rightEndHamming);
-//            cout << " " << genusMatches[i].qInfo.pos << " " << int(genusMatches[i].hamming) << " " << int(genusMatches[i].redundancy) << endl;
-//        }
-//        cout << "Genus score: " << genusScore.score << "\n";
-//    }
-
-    // If there is no proper genus for the current query, it is unclassified.
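-    // The score tested below combines coverage and Hamming penalties: each
-    // covered amino-acid position adds 0 to hammingSum when it matches exactly
-    // and 1 + 0.5 * h for a Hamming distance h of 1..3 (see scoreGenus), giving
-    //
-    //     score = (coveredLength - hammingSum) / queryLength
-    //
-    // As an illustration with made-up numbers: a 150-bp read with 120 covered
-    // bases and hammingSum 4.5 scores (120 - 4.5) / 150 = 0.77.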
- if (genusScore.score == 0 || genusScore.coverage < par.minCoverage || genusScore.score < par.minScore) { - queryList[currentQuery].isClassified = false; - queryList[currentQuery].classification = 0; - queryList[currentQuery].score = genusScore.score; - queryList[currentQuery].coverage = genusScore.coverage; - queryList[currentQuery].hammingDist = genusScore.hammingDist; - queryList[currentQuery].newSpecies = false; - return; - } - - // If there are two or more good genus level candidates, find the LCA. - if (genusScore.taxId == 0) { - vector genusList; - genusList.reserve(genusMatches.size()); - for (auto & genusMatch : genusMatches) { - genusList.push_back(genusMatch.genusId); - } - selectedTaxon = taxonomy->LCA(genusList)->taxId; - queryList[currentQuery].isClassified = true; - queryList[currentQuery].classification = selectedTaxon; - queryList[currentQuery].score = genusScore.score; - queryList[currentQuery].coverage = genusScore.coverage; - queryList[currentQuery].hammingDist = genusScore.hammingDist; - for (auto & genusMatch : genusMatches) { - queryList[currentQuery].taxCnt[genusMatch.targetId]++; - } - return; - } - - // Choose the species with the highest coverage. - TaxID selectedSpecies; - TaxonScore speciesScore; - vector species; - unordered_map> speciesMatchRange; - if (par.seqMode == 2) { - speciesScore = chooseSpecies(genusMatches, - queryList[currentQuery].queryLength, - queryList[currentQuery].queryLength2, - species, - speciesMatchRange); - } else { - speciesScore = chooseSpecies(genusMatches, - queryList[currentQuery].queryLength, - species, - speciesMatchRange); - } - - - // Classify to LCA if more than one species are selected - if (species.size() > 1) { - queryList[currentQuery].isClassified = true; - queryList[currentQuery].classification = taxonomy->LCA(species)->taxId; - queryList[currentQuery].score = genusScore.score; - queryList[currentQuery].coverage = genusScore.coverage; - queryList[currentQuery].hammingDist = genusScore.hammingDist; - for (auto & genusMatch : genusMatches) { - queryList[currentQuery].taxCnt[genusMatch.targetId]++; - } - return; - } - - // If score is not enough, classify to the parent of the selected species - if (speciesScore.score < minSpScore) { - queryList[currentQuery].isClassified = true; - queryList[currentQuery].classification = taxonomy->taxonNode( - taxonomy->getTaxIdAtRank(species[0], "species"))->parentTaxId; - queryList[currentQuery].score = genusScore.score; - queryList[currentQuery].coverage = genusScore.coverage; - queryList[currentQuery].hammingDist = genusScore.hammingDist; - for (auto & genusMatch : genusMatches) { - if(genusMatch.speciesId == species[0]){ - queryList[currentQuery].taxCnt[genusMatch.targetId]++; - } - } - return; - } - - // Sort matches by the position of the query sequence - selectedSpecies = species[0]; -// sort(genusMatches.begin() + speciesMatchRange[selectedSpecies].first, -// genusMatches.begin() + speciesMatchRange[selectedSpecies].second, -// [](const Match & a, const Match & b) { -// if (a.qInfo.position / 3 == b.qInfo.position / 3) -// return a.hamming < b.hamming; -// else -// return a.qInfo.position / 3 < b.qInfo.position / 3; -// }); - - sort(genusMatches.begin() + speciesMatchRange[selectedSpecies].first, - genusMatches.begin() + speciesMatchRange[selectedSpecies].second, - [](const Match & a, const Match & b) { return a.qInfo.pos > b.qInfo.pos; }); - - - TaxID result = lowerRankClassification(genusMatches, speciesMatchRange[selectedSpecies], selectedSpecies); - - // Record matches of 
selected species
-    for (size_t i = speciesMatchRange[selectedSpecies].first; i < speciesMatchRange[selectedSpecies].second; i++) {
-        queryList[currentQuery].taxCnt[genusMatches[i].targetId]++;
-    }
-
-
-    // Store classification results
-    queryList[currentQuery].isClassified = true;
-    queryList[currentQuery].classification = result;
-    queryList[currentQuery].score = speciesScore.score;
-    queryList[currentQuery].coverage = speciesScore.coverage;
-    queryList[currentQuery].hammingDist = speciesScore.hammingDist;
-    queryList[currentQuery].newSpecies = false;
-//    if (par.printLog) {
-//        cout << "# " << currentQuery << endl;
-//        for (size_t i = 0; i < genusMatches.size(); i++) {
-//            cout << i << " " << genusMatches[i].qInfo.pos << " " <<
-//            genusMatches[i].targetId << " " << int(genusMatches[i].hamming) << endl;
-//        }
-//        cout << "Score: " << speciesScore.score << " " << selectedSpecies << " "
-//             << taxonomy->getString(taxonomy->taxonNode(selectedSpecies)->rankIdx)
-//
-//             << endl;
-//    }
-}
-
-TaxID Classifier::lowerRankClassification(vector<Match> &matches, pair<int, int> &matchRange, TaxID spTaxId) {
-    int i = matchRange.second - 1;
-    unordered_map<TaxID, unsigned int> taxCnt;
-
-    while ( i >= matchRange.first ) {
-        size_t currQuotient = matches[i].qInfo.pos / 3;
-        uint8_t minHamming = matches[i].hamming;
-        Match * minHammingMatch = & matches[i];
-        TaxID minHammingTaxId = minHammingMatch->targetId;
-        i--;
-        while ( (i >= matchRange.first) && (currQuotient == matches[i].qInfo.pos / 3) ) {
-            if (matches[i].hamming < minHamming) {
-                minHamming = matches[i].hamming;
-                minHammingMatch = & matches[i];
-                minHammingTaxId = minHammingMatch->targetId;
-            } else if (matches[i].hamming == minHamming) {
-                minHammingTaxId = taxonomy->LCA(minHammingTaxId, matches[i].targetId);
-                minHammingMatch->redundancy = true;
-                matches[i].redundancy = true;
-            }
-            i--;
-        }
-        taxCnt[minHammingTaxId]++;
-    }
-
-    unordered_map<TaxID, TaxonCounts> cladeCnt;
-    getSpeciesCladeCounts(taxCnt, cladeCnt, spTaxId);
-
-    return BFS(cladeCnt, spTaxId);
-}
-
-void Classifier::getSpeciesCladeCounts(const unordered_map<TaxID, unsigned int> &taxCnt,
-                                       unordered_map<TaxID, TaxonCounts> & cladeCount,
-                                       TaxID speciesTaxID) {
-    for (auto it = taxCnt.begin(); it != taxCnt.end(); ++it) {
-//        cladeCount[it->first].taxCount = it->second;
-//        cladeCount[it->first].cladeCount += it->second;
-        TaxonNode const * taxon = taxonomy->taxonNode(it->first);
-        cladeCount[taxon->taxId].taxCount = it->second;
-        cladeCount[taxon->taxId].cladeCount += it->second;
-        while (taxon->taxId != speciesTaxID) {
-            if (find(cladeCount[taxon->parentTaxId].children.begin(),
-                     cladeCount[taxon->parentTaxId].children.end(),
-                     taxon->taxId) == cladeCount[taxon->parentTaxId].children.end()) {
-                cladeCount[taxon->parentTaxId].children.push_back(taxon->taxId);
-            }
-            cladeCount[taxon->parentTaxId].cladeCount += it->second;
-            taxon = taxonomy->taxonNode(taxon->parentTaxId);
-        }
-    }
-}
-
-TaxID Classifier::BFS(const unordered_map<TaxID, TaxonCounts> & cladeCnt, TaxID root) {
-    if (cladeCnt.at(root).children.empty()) { // root is a leaf
-        return root;
-    }
-    unsigned int maxCnt = 3;
-    unsigned int currentCnt;
-    vector<TaxID> bestChildren;
-    for (auto it = cladeCnt.at(root).children.begin(); it != cladeCnt.at(root).children.end(); it++) {
-        currentCnt = cladeCnt.at(*it).cladeCount;
-        if (currentCnt > maxCnt) {
-            bestChildren.clear();
-            bestChildren.push_back(*it);
-            maxCnt = currentCnt;
-        } else if (currentCnt == maxCnt) {
-            bestChildren.push_back(*it);
-        }
-    }
-    if (bestChildren.size() == 1) {
-        return BFS(cladeCnt, bestChildren[0]);
-    } else {
-        return root;
-    }
-}
-
-TaxonScore 
Classifier::getBestGenusMatches(vector &genusMatches, const Match *matchList, size_t end, - size_t offset, int readLength1, int readLength2, const LocalParameters & par) { - TaxID currentGenus; - TaxID currentSpecies; - - vector filteredMatches; - vector> matchesForEachGenus; - vector genusScores; - TaxonScore bestScore; - size_t i = offset; - uint8_t curFrame; - vector curFrameMatches; - while (i < end + 1) { -// currentGenus = taxId2genusId[matchList[i].targetId]; - currentGenus = matchList[i].genusId; - // For current genus - while ((i < end + 1) && currentGenus == matchList[i].genusId) { -// currentSpecies = taxId2speciesId[matchList[i].targetId]; - currentSpecies = matchList[i].speciesId; -// if (par.printLog) { -// cout << currentGenus << " " << currentSpecies << endl; -// } - // For current species - while ((i < end + 1) && currentSpecies == matchList[i].speciesId) { - curFrame = matchList[i].qInfo.frame; - curFrameMatches.clear(); - - // For current frame - while ((i < end + 1) && currentSpecies == matchList[i].speciesId - && curFrame == matchList[i].qInfo.frame) { - curFrameMatches.push_back(&matchList[i]); - i ++; - } - if (curFrameMatches.size() > 1) { - remainConsecutiveMatches(curFrameMatches, filteredMatches, currentGenus, par); - } - } - } - - // Construct a match combination using filtered matches of current genus - // so that it can best cover the query, and score the combination - if (!filteredMatches.empty()) { - matchesForEachGenus.push_back(filteredMatches); - genusScores.push_back(scoreGenus(filteredMatches, readLength1, readLength2)); - } - filteredMatches.clear(); - } - - // If there are no meaningful genus - if (genusScores.empty()) { - bestScore.score = 0; - return bestScore; - } - - TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), - [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); - - vector maxIdx; - for (size_t g = 0; g < genusScores.size(); g++) { - if (genusScores[g].score > maxScore.score * 0.95f) { - maxIdx.push_back(g); - } - } - bestScore = maxScore; - - for (unsigned long g : maxIdx) { - for (const Match * m : matchesForEachGenus[g]) { - genusMatches.push_back(*m); - } - } - - - - // More than one genus - if (maxIdx.size() > 1) { - bestScore.taxId = 0; - return bestScore; - } - - return bestScore; - - //Three cases - //1. one genus - //2. more than one genus - //4. 
no genus -} - - - -void Classifier::remainConsecutiveMatches(vector & curFrameMatches, - vector & filteredMatches, - TaxID genusId, - const LocalParameters & par) { - size_t i = 0; - size_t end = curFrameMatches.size(); - vector> curPosMatches; // - vector> nextPosMatches; - map> linkedMatches; // - - size_t currPos = curFrameMatches[0]->qInfo.pos; - while ( i < end && curFrameMatches[i]->qInfo.pos == currPos) { - curPosMatches.emplace_back(curFrameMatches[i], i); - i++; - } - while (i < end) { - uint32_t nextPos = curFrameMatches[i]->qInfo.pos; - while (i < end && nextPos == curFrameMatches[i]->qInfo.pos) { - nextPosMatches.emplace_back(curFrameMatches[i], i); - ++ i; - } - // Check if current position and next position are consecutive - if (currPos + 3 == nextPos) { - // Compare curPosMatches and nextPosMatches - for (auto &curPosMatch: curPosMatches) { - for (auto &nextPosMatch: nextPosMatches) { - if (isConsecutive(curPosMatch.first, nextPosMatch.first)) { - linkedMatches[curPosMatch.second].push_back(nextPosMatch.second); - } - } - } - - } - // Update curPosMatches and nextPosMatches - curPosMatches = nextPosMatches; - nextPosMatches.clear(); - currPos = nextPos; - } - // Print linkedMatches -// if (par.printLog) { -// cout << "linkedMatches: " << endl; -// for (const auto &entry: linkedMatches) { -// cout << entry.first << ": "; -// for (auto &idx: entry.second) { -// cout << idx << " "; -// } -// cout << endl; -// } -// } - - // Iterate linkedMatches to get filteredMatches - int MIN_DEPTH = par.minConsCnt - 1; - if (taxonomy->IsAncestor(par.eukaryotaTaxId, genusId)) { - MIN_DEPTH = par.minConsCntEuk - 1; - } - unordered_set used; - vector filteredMatchIdx; - unordered_map idx2depth; - for (const auto& entry : linkedMatches) { - if (!used.count(entry.first)) { - used.insert(entry.first); - vector curMatches; - DFS(entry.first, linkedMatches, filteredMatchIdx, 0, MIN_DEPTH, used, idx2depth); - } - } - -// if (par.printLog) { -// cout << "filteredMatchIdx: "; -// for (auto &idx: filteredMatchIdx) { -// cout << idx << " "; -// } -// cout << endl; -// } - - for (auto &idx: filteredMatchIdx) { - filteredMatches.push_back(curFrameMatches[idx]); - } -} - - -size_t Classifier::DFS(size_t curMatchIdx, const map> & linkedMatches, - vector& filteredMatches, size_t depth, size_t MIN_DEPTH, unordered_set& used, - unordered_map & idx2depth) { - depth++; - size_t maxDepth = 0; - size_t returnDepth = 0; - if (linkedMatches.find(curMatchIdx) == linkedMatches.end()) { //|| linkedMatches.at(curMatchIdx).empty()) { - // reached a leaf node - idx2depth[curMatchIdx] = depth; - if (depth > MIN_DEPTH) { - filteredMatches.push_back(curMatchIdx); - } - return depth; - } else { // not a leaf node - for (auto &nextMatchIdx: linkedMatches.at(curMatchIdx)) { - used.insert(nextMatchIdx); - if (idx2depth.find(nextMatchIdx) != idx2depth.end()) { - returnDepth = idx2depth[nextMatchIdx]; - maxDepth = max(maxDepth, returnDepth); - continue; - } - returnDepth = DFS(nextMatchIdx, linkedMatches, filteredMatches, depth, MIN_DEPTH, used, idx2depth); - maxDepth = max(maxDepth, returnDepth); - } - if (maxDepth > MIN_DEPTH) { - filteredMatches.push_back(curMatchIdx); - idx2depth[curMatchIdx] = maxDepth; - } - } - return maxDepth; -} - -TaxonScore Classifier::getBestGenusMatches_spaced(vector &genusMatches, const Match *matchList, size_t end, - size_t offset, int readLength1, int readLength2) { - TaxID currentGenus; - TaxID currentSpecies; - - vector tempMatchContainer; - vector filteredMatches; - vector> matchesForEachGenus; - 
vector conservedWithinGenus; - vector genusScores; - TaxonScore bestScore; - size_t i = offset; - bool lastIn; - while (i + 1 < end + 1) { - currentGenus = matchList[i].genusId; - // For current genus - while ((i + 1 < end + 1) && currentGenus == matchList[i].genusId) { -// currentSpecies = taxId2speciesId[matchList[i].targetId]; - currentSpecies = matchList[i].speciesId; - // For current species - // Filter un-consecutive matches (probably random matches) - lastIn = false; - int distance = 0; - int diffPosCntOfCurrRange = 1; - int dnaDist = 0; - - // For the same species - while ((i + 1 < end + 1) && currentSpecies == matchList[i + 1].speciesId) { - distance = matchList[i+1].qInfo.pos / 3 - matchList[i].qInfo.pos / 3; - dnaDist = matchList[i+1].qInfo.pos - matchList[i].qInfo.pos; - if (distance == 0) { // At the same position - tempMatchContainer.push_back(matchList + i); - } else if (dnaDist < (8 + spaceNum_int + maxGap) * 3) { // Overlapping - lastIn = true; - tempMatchContainer.push_back(matchList + i); - diffPosCntOfCurrRange ++; - } else { // Not consecutive --> End range - if (lastIn){ - tempMatchContainer.push_back(matchList + i); - if (diffPosCntOfCurrRange >= minCoveredPos) { - filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), - tempMatchContainer.end()); - } - } - lastIn = false; - // Initialize range info - tempMatchContainer.clear(); - diffPosCntOfCurrRange = 1; - } - i++; - } - - // Met next species - if (lastIn) { - tempMatchContainer.push_back(matchList + i); - if (diffPosCntOfCurrRange >= minCoveredPos) { - filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), - tempMatchContainer.end()); - } - } - tempMatchContainer.clear(); - i++; - } - - // Construct a match combination using filtered matches of current genus - // so that it can best cover the query, and score the combination - if (!filteredMatches.empty()) { - genusScores.push_back(scoreGenus(filteredMatches, readLength1, readLength2)); - } - filteredMatches.clear(); - } - - // If there are no meaningful genus - if (genusScores.empty()) { - bestScore.score = 0; - return bestScore; - } - - TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), - [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); - - vector maxIdx; - for (size_t g = 0; g < genusScores.size(); g++) { - if (genusScores[g].score > maxScore.score * 0.95f) { - maxIdx.push_back(g); - } - } - bestScore = maxScore; - - for (unsigned long g : maxIdx) { - for (const Match * m : matchesForEachGenus[g]) { - genusMatches.push_back(*m); - } - } - - // More than one genus - if (maxIdx.size() > 1) { - bestScore.taxId = 0; - return bestScore; - } - return bestScore; - - //Three cases - //1. one genus - //2. more than one genus - //4. 
no genus -} - -TaxonScore Classifier::getBestGenusMatches(vector &genusMatches, const Match *matchList, size_t end, - size_t offset, int queryLength, const LocalParameters & par) { - TaxID currentGenus; - TaxID currentSpecies; - - vector filteredMatches; - vector> matchesForEachGenus; - vector genusScores; - TaxonScore bestScore; - size_t i = offset; - uint8_t curFrame; - vector curFrameMatches; - while (i < end + 1) { - currentGenus = matchList[i].genusId; - // For current genus - while ((i < end + 1) && currentGenus == matchList[i].genusId) { - currentSpecies = matchList[i].speciesId; - - // For current species - while ((i < end + 1) && currentSpecies == matchList[i].speciesId) { - curFrame = matchList[i].qInfo.frame; - curFrameMatches.clear(); - - // For current frame - while ((i < end + 1) && currentSpecies == matchList[i].speciesId - && curFrame == matchList[i].qInfo.frame) { - curFrameMatches.push_back(&matchList[i]); - i ++; - } - if (curFrameMatches.size() > 1) { - remainConsecutiveMatches(curFrameMatches, filteredMatches, currentGenus, par); - } - } - } - - // Construct a match combination using filtered matches of current genus - // so that it can best cover the query, and score the combination - - if (!filteredMatches.empty()) { - matchesForEachGenus.push_back(filteredMatches); - genusScores.push_back(scoreGenus(filteredMatches, queryLength)); - } - filteredMatches.clear(); - } - - // If there are no meaningful genus - if (genusScores.empty()) { - bestScore.score = 0; - return bestScore; - } - - TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), - [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); - - vector maxIdx; - for (size_t g = 0; g < genusScores.size(); g++) { - if (genusScores[g].score > maxScore.score * 0.95f) { - maxIdx.push_back(g); - } - } - - bestScore = maxScore; - - for (unsigned long g : maxIdx) { - for (const Match * m : matchesForEachGenus[g]) { - genusMatches.push_back(*m); - } - } - - // More than one genus - if (maxIdx.size() > 1) { - bestScore.taxId = 0; - return bestScore; - } - return bestScore; - - //Three cases - //1. one genus - //2. more than one genus - //4. 
no genus -} - -TaxonScore Classifier::getBestGenusMatches_spaced(vector &genusMatches, const Match *matchList, size_t end, - size_t offset, int readLength) { - TaxID currentGenus; - TaxID currentSpecies; - - vector tempMatchContainer; - vector filteredMatches; - vector> matchesForEachGenus; - vector conservedWithinGenus; - vector genusScores; - TaxonScore bestScore; - size_t i = offset; - bool lastIn; - size_t speciesMatchCnt; - while (i + 1 < end + 1) { - currentGenus = matchList[i].genusId; - // For current genus - while ((i + 1 < end + 1) && currentGenus == matchList[i].genusId) { - currentSpecies = matchList[i].speciesId; - // For current species - // Filter un-consecutive matches (probably random matches) - lastIn = false; - int distance = 0; - int diffPosCntOfCurrRange = 1; - int dnaDist = 0; - - // For the same species - while ((i + 1 < end + 1) && currentSpecies == matchList[i + 1].speciesId) { - distance = matchList[i + 1].qInfo.pos / 3 - matchList[i].qInfo.pos / 3; - dnaDist = matchList[i + 1].qInfo.pos - matchList[i].qInfo.pos; - if (distance == 0) { // At the same position - tempMatchContainer.push_back(matchList + i); - } else if (dnaDist < (8 + spaceNum_int + maxGap) * 3) { // Overlapping - lastIn = true; - tempMatchContainer.push_back(matchList + i); - diffPosCntOfCurrRange++; - } else { // Not consecutive --> End range - if (lastIn) { - tempMatchContainer.push_back(matchList + i); - if (diffPosCntOfCurrRange >= minCoveredPos) { - filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), - tempMatchContainer.end()); - } - } - lastIn = false; - // Initialize range info - tempMatchContainer.clear(); - diffPosCntOfCurrRange = 1; - } - i++; - } - - // Met next species - if (lastIn) { - tempMatchContainer.push_back(matchList + i); - if (diffPosCntOfCurrRange >= minCoveredPos) { - filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), - tempMatchContainer.end()); - } - } - tempMatchContainer.clear(); - i++; - } - - // Construct a match combination using filtered matches of current genus - // so that it can best cover the query, and score the combination - if (!filteredMatches.empty()) { - genusScores.push_back(scoreGenus(filteredMatches, readLength)); - } - filteredMatches.clear(); - } - - // If there are no meaningful genus - if (genusScores.empty()) { - bestScore.score = 0; - return bestScore; - } - - TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), - [](const TaxonScore &a, const TaxonScore &b) { return a.score < b.score; }); - - vector maxIdx; - for (size_t g = 0; g < genusScores.size(); g++) { - if (genusScores[g].score > maxScore.score * 0.95f) { - maxIdx.push_back(g); - } - } - bestScore = maxScore; - - for (unsigned long g: maxIdx) { - genusMatches.insert(genusMatches.end(), - matchesForEachGenus[g].begin(), - matchesForEachGenus[g].end()); - } - - // More than one genus - if (maxIdx.size() > 1) { - bestScore.taxId = 0; - return bestScore; - } - return bestScore; - - //Three cases - //1. one genus - //2. more than one genus - //4. 
no genus -} - -TaxonScore Classifier::scoreGenus(vector &filteredMatches, - int queryLength) { - // Calculate Hamming distance & covered length - int coveredPosCnt = 0; - uint16_t currHammings; - int aminoAcidNum = (int) queryLength / 3; - int currPos; - size_t matchNum = filteredMatches.size(); - size_t f = 0; - - // Get the largest hamming distance at each position of query - auto *hammingsAtEachPos = new signed char[aminoAcidNum + 1]; - memset(hammingsAtEachPos, -1, (aminoAcidNum + 1)); - while (f < matchNum) { - currPos = filteredMatches[f]->qInfo.pos / 3; - currHammings = filteredMatches[f]->rightEndHamming; - if (GET_2_BITS(currHammings) > hammingsAtEachPos[currPos + unmaskedPos[0]]) - hammingsAtEachPos[currPos + unmaskedPos[0]] = GET_2_BITS(currHammings); - if (GET_2_BITS(currHammings >> 2) > hammingsAtEachPos[currPos + unmaskedPos[1]]) - hammingsAtEachPos[currPos + unmaskedPos[1]] = GET_2_BITS(currHammings >> 2); - if (GET_2_BITS(currHammings >> 4) > hammingsAtEachPos[currPos + unmaskedPos[2]]) - hammingsAtEachPos[currPos + unmaskedPos[2]] = GET_2_BITS(currHammings >> 4); - if (GET_2_BITS(currHammings >> 6) > hammingsAtEachPos[currPos + unmaskedPos[3]]) - hammingsAtEachPos[currPos + unmaskedPos[3]] = GET_2_BITS(currHammings >> 6); - if (GET_2_BITS(currHammings >> 8) > hammingsAtEachPos[currPos + unmaskedPos[4]]) - hammingsAtEachPos[currPos + unmaskedPos[4]] = GET_2_BITS(currHammings >> 8); - if (GET_2_BITS(currHammings >> 10) > hammingsAtEachPos[currPos + unmaskedPos[5]]) - hammingsAtEachPos[currPos + unmaskedPos[5]] = GET_2_BITS(currHammings >> 10); - if (GET_2_BITS(currHammings >> 12) > hammingsAtEachPos[currPos + unmaskedPos[6]]) - hammingsAtEachPos[currPos + unmaskedPos[6]] = GET_2_BITS(currHammings >> 12); - if (GET_2_BITS(currHammings >> 14) > hammingsAtEachPos[currPos + unmaskedPos[7]]) - hammingsAtEachPos[currPos + unmaskedPos[7]] = GET_2_BITS(currHammings >> 14); - f++; - } - - // Sum up hamming distances and count the number of position covered by the matches. - float hammingSum = 0; - for (int h = 0; h < aminoAcidNum; h++) { - if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist. - coveredPosCnt++; - } else if (hammingsAtEachPos[h] != -1) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. 
respectively - hammingSum += 1.0f + (0.5f * hammingsAtEachPos[h]); - coveredPosCnt++; - } - } - delete[] hammingsAtEachPos; - - // Score current genus - int coveredLength = coveredPosCnt * 3; - if (coveredLength > queryLength) coveredLength = queryLength; - float score = ((float) coveredLength - hammingSum) / (float) queryLength; - float coverage = (float) (coveredLength) / (float) (queryLength); - - return {filteredMatches[0]->genusId, score, coverage, (int) hammingSum}; -} - -TaxonScore Classifier::scoreGenus(vector &filteredMatches, - int readLength1, - int readLength2) { - - // Calculate Hamming distance & covered length - uint16_t currHammings; - int aminoAcidNum_total = ((int) readLength1 / 3) + ((int) readLength2 / 3); - int aminoAcidNum_read1 = ((int) readLength1 / 3); - int currPos; - size_t matchNum = filteredMatches.size(); - size_t f = 0; - - // Get the largest hamming distance at each position of query - auto *hammingsAtEachPos = new signed char[aminoAcidNum_total + 3]; - memset(hammingsAtEachPos, -1, (aminoAcidNum_total + 3)); - while (f < matchNum) { - currPos = (int) filteredMatches[f]->qInfo.pos / 3; - currHammings = filteredMatches[f]->rightEndHamming; - if (GET_2_BITS(currHammings) > hammingsAtEachPos[currPos + unmaskedPos[0]]) - hammingsAtEachPos[currPos + unmaskedPos[0]] = GET_2_BITS(currHammings); - if (GET_2_BITS(currHammings >> 2) > hammingsAtEachPos[currPos + unmaskedPos[1]]) - hammingsAtEachPos[currPos + unmaskedPos[1]] = GET_2_BITS(currHammings >> 2); - if (GET_2_BITS(currHammings >> 4) > hammingsAtEachPos[currPos + unmaskedPos[2]]) - hammingsAtEachPos[currPos + unmaskedPos[2]] = GET_2_BITS(currHammings >> 4); - if (GET_2_BITS(currHammings >> 6) > hammingsAtEachPos[currPos + unmaskedPos[3]]) - hammingsAtEachPos[currPos + unmaskedPos[3]] = GET_2_BITS(currHammings >> 6); - if (GET_2_BITS(currHammings >> 8) > hammingsAtEachPos[currPos + unmaskedPos[4]]) - hammingsAtEachPos[currPos + unmaskedPos[4]] = GET_2_BITS(currHammings >> 8); - if (GET_2_BITS(currHammings >> 10) > hammingsAtEachPos[currPos + unmaskedPos[5]]) - hammingsAtEachPos[currPos + unmaskedPos[5]] = GET_2_BITS(currHammings >> 10); - if (GET_2_BITS(currHammings >> 12) > hammingsAtEachPos[currPos + unmaskedPos[6]]) - hammingsAtEachPos[currPos + unmaskedPos[6]] = GET_2_BITS(currHammings >> 12); - if (GET_2_BITS(currHammings >> 14) > hammingsAtEachPos[currPos + unmaskedPos[7]]) - hammingsAtEachPos[currPos + unmaskedPos[7]] = GET_2_BITS(currHammings >> 14); - f++; - } - - // Sum up hamming distances and count the number of position covered by the matches. - float hammingSum = 0; - int coveredPosCnt_read1 = 0; - int coveredPosCnt_read2 = 0; - for (int h = 0; h < aminoAcidNum_total; h++) { - // Read 1 - if (h < aminoAcidNum_read1) { - if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist. - coveredPosCnt_read1++; - } else if (hammingsAtEachPos[h] != -1) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively - hammingSum += 1.0f + (0.5f * (float) hammingsAtEachPos[h]); - coveredPosCnt_read1++; - } - } - // Read 2 - else { - if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist. - coveredPosCnt_read2++; - } else if (hammingsAtEachPos[h] != -1) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. 
respectively - hammingSum += 1.0f + (0.5f * (float) hammingsAtEachPos[h]); - coveredPosCnt_read2++; - } - } - } - delete[] hammingsAtEachPos; - - // Score current genus - int coveredLength_read1 = coveredPosCnt_read1 * 3; - int coveredLength_read2 = coveredPosCnt_read2 * 3; - if (coveredLength_read1 > readLength1) coveredLength_read1 = readLength1; - if (coveredLength_read2 > readLength2) coveredLength_read2 = readLength2; - float score = - ((float) (coveredLength_read1 + coveredLength_read2) - hammingSum) / (float) (readLength1 + readLength2); - float coverage = (float) (coveredLength_read1 + coveredLength_read2) / (float) (readLength1 + readLength2); - -// matchesForEachGenus.push_back(move(filteredMatches)); - return {filteredMatches[0]->genusId, score, coverage, (int) hammingSum}; -} - -TaxonScore Classifier::chooseSpecies(const vector &matches, - int queryLength, - vector &species, - unordered_map> & speciesMatchRange) { - // Score each species - std::unordered_map speciesScores; - size_t i = 0; - TaxID currentSpeices; - size_t numOfMatch = matches.size(); - size_t speciesBegin, speciesEnd; - while (i < numOfMatch) { - currentSpeices = matches[i].speciesId; - speciesBegin = i; - while ((i < numOfMatch) && currentSpeices == matches[i].speciesId) { - i++; - } - speciesEnd = i; - speciesScores[currentSpeices] = scoreSpecies(matches, speciesBegin, speciesEnd, queryLength); - speciesMatchRange[currentSpeices] = {(int) speciesBegin, (int) speciesEnd}; - speciesScores[currentSpeices].taxId = currentSpeices; - } - - // Get the best species - TaxonScore bestScore; - for (auto & sp : speciesScores) { - if (sp.second.score > bestScore.score) { - species.clear(); - species.push_back(sp.first); - bestScore = sp.second; - } else if (sp.second.coverage == bestScore.coverage) { - species.push_back(sp.first); - } - } - return bestScore; -} - -TaxonScore Classifier::chooseSpecies(const vector &matches, - int read1Length, - int read2Length, - vector &species, - unordered_map> & speciesMatchRange) { - // Score each species - std::unordered_map speciesScores; - - - size_t i = 0; - TaxID currentSpeices; - size_t numOfMatch = matches.size(); - size_t speciesBegin, speciesEnd; - while (i < numOfMatch) { - currentSpeices = matches[i].speciesId; - speciesBegin = i; - while ((i < numOfMatch) && currentSpeices == matches[i].speciesId) { - i++; - } - speciesEnd = i; - speciesScores[currentSpeices] = scoreSpecies(matches, speciesBegin, speciesEnd, read1Length, read2Length); - speciesMatchRange[currentSpeices] = {(int) speciesBegin, (int) speciesEnd}; - speciesScores[currentSpeices].taxId = currentSpeices; - } - - // Get the best species - TaxonScore bestScore; - for (auto & sp : speciesScores) { - if (sp.second.score > bestScore.score) { - species.clear(); - species.push_back(sp.first); - bestScore = sp.second; - } else if (sp.second.coverage == bestScore.coverage) { - species.push_back(sp.first); - } - } - return bestScore; -} - -TaxonScore Classifier::scoreSpecies(const vector &matches, - size_t begin, - size_t end, - int queryLength) { - - // Get the largest hamming distance at each position of query - int aminoAcidNum = queryLength / 3; - auto *hammingsAtEachPos = new signed char[aminoAcidNum + 1]; - memset(hammingsAtEachPos, -1, (aminoAcidNum + 1)); - int currPos; - size_t walker = begin; - uint16_t currHammings; - while (walker < end) { - currPos = matches[walker].qInfo.pos / 3; - currHammings = matches[walker].rightEndHamming; - if (GET_2_BITS(currHammings) > hammingsAtEachPos[currPos + unmaskedPos[0]]) - 
hammingsAtEachPos[currPos + unmaskedPos[0]] = GET_2_BITS(currHammings); - if (GET_2_BITS(currHammings >> 2) > hammingsAtEachPos[currPos + unmaskedPos[1]]) - hammingsAtEachPos[currPos + unmaskedPos[1]] = GET_2_BITS(currHammings >> 2); - if (GET_2_BITS(currHammings >> 4) > hammingsAtEachPos[currPos + unmaskedPos[2]]) - hammingsAtEachPos[currPos + unmaskedPos[2]] = GET_2_BITS(currHammings >> 4); - if (GET_2_BITS(currHammings >> 6) > hammingsAtEachPos[currPos + unmaskedPos[3]]) - hammingsAtEachPos[currPos + unmaskedPos[3]] = GET_2_BITS(currHammings >> 6); - if (GET_2_BITS(currHammings >> 8) > hammingsAtEachPos[currPos + unmaskedPos[4]]) - hammingsAtEachPos[currPos + unmaskedPos[4]] = GET_2_BITS(currHammings >> 8); - if (GET_2_BITS(currHammings >> 10) > hammingsAtEachPos[currPos + unmaskedPos[5]]) - hammingsAtEachPos[currPos + unmaskedPos[5]] = GET_2_BITS(currHammings >> 10); - if (GET_2_BITS(currHammings >> 12) > hammingsAtEachPos[currPos + unmaskedPos[6]]) - hammingsAtEachPos[currPos + unmaskedPos[6]] = GET_2_BITS(currHammings >> 12); - if (GET_2_BITS(currHammings >> 14) > hammingsAtEachPos[currPos + unmaskedPos[7]]) - hammingsAtEachPos[currPos + unmaskedPos[7]] = GET_2_BITS(currHammings >> 14); - walker++; - } - - // Sum up hamming distances and count the number of position covered by the matches. - float hammingSum = 0; - int hammingDist = 0; - int coveredPosCnt = 0; - for (int h = 0; h < aminoAcidNum; h++) { - if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist. - coveredPosCnt++; - } else if (hammingsAtEachPos[h] != -1) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively - hammingSum += 1.0f + (0.5f * (float) hammingsAtEachPos[h]); - hammingDist += hammingsAtEachPos[h]; - coveredPosCnt++; - } - } - delete[] hammingsAtEachPos; - // Score - int coveredLength = coveredPosCnt * 3; - if (coveredLength >= queryLength) coveredLength = queryLength; - - float score = ((float)coveredLength - hammingSum) / (float) queryLength; - float coverage = (float) coveredLength / (float) (queryLength); - - return {0, score, coverage, hammingDist}; -} - -TaxonScore Classifier::scoreSpecies(const vector &matches, - size_t begin, - size_t end, - int queryLength, - int queryLength2) { - - // Get the smallest hamming distance at each position of query - int aminoAcidNum_total = queryLength / 3 + queryLength2 / 3; - int aminoAcidNum_read1 = queryLength / 3; - auto *hammingsAtEachPos = new signed char[aminoAcidNum_total + 3]; - memset(hammingsAtEachPos, -1, (aminoAcidNum_total + 3)); - - int currPos; - size_t walker = begin; - uint16_t currHammings; - - while (walker < end) { - currPos = matches[walker].qInfo.pos / 3; - currHammings = matches[walker].rightEndHamming; - if (GET_2_BITS(currHammings) > hammingsAtEachPos[currPos + unmaskedPos[0]]) - hammingsAtEachPos[currPos + unmaskedPos[0]] = GET_2_BITS(currHammings); - if (GET_2_BITS(currHammings >> 2) > hammingsAtEachPos[currPos + unmaskedPos[1]]) - hammingsAtEachPos[currPos + unmaskedPos[1]] = GET_2_BITS(currHammings >> 2); - if (GET_2_BITS(currHammings >> 4) > hammingsAtEachPos[currPos + unmaskedPos[2]]) - hammingsAtEachPos[currPos + unmaskedPos[2]] = GET_2_BITS(currHammings >> 4); - if (GET_2_BITS(currHammings >> 6) > hammingsAtEachPos[currPos + unmaskedPos[3]]) - hammingsAtEachPos[currPos + unmaskedPos[3]] = GET_2_BITS(currHammings >> 6); - if (GET_2_BITS(currHammings >> 8) > hammingsAtEachPos[currPos + unmaskedPos[4]]) - hammingsAtEachPos[currPos + unmaskedPos[4]] = GET_2_BITS(currHammings >> 8); - if (GET_2_BITS(currHammings >> 10) > 
hammingsAtEachPos[currPos + unmaskedPos[5]])
-            hammingsAtEachPos[currPos + unmaskedPos[5]] = GET_2_BITS(currHammings >> 10);
-        if (GET_2_BITS(currHammings >> 12) > hammingsAtEachPos[currPos + unmaskedPos[6]])
-            hammingsAtEachPos[currPos + unmaskedPos[6]] = GET_2_BITS(currHammings >> 12);
-        if (GET_2_BITS(currHammings >> 14) > hammingsAtEachPos[currPos + unmaskedPos[7]])
-            hammingsAtEachPos[currPos + unmaskedPos[7]] = GET_2_BITS(currHammings >> 14);
-        walker++;
-    }
-
-    // Sum up Hamming distances and count the number of positions covered by the matches.
-    float hammingSum = 0;
-    int hammingDist = 0;
-    int coveredPosCnt_read1 = 0;
-    int coveredPosCnt_read2 = 0;
-    for (int h = 0; h < aminoAcidNum_total; h++) {
-        // Read 1
-        if (h < aminoAcidNum_read1) {
-            if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist.
-                coveredPosCnt_read1++;
-            } else if (hammingsAtEachPos[h] != -1) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively
-                hammingSum += 1.0f + (0.5f * (float) hammingsAtEachPos[h]);
-                hammingDist += hammingsAtEachPos[h];
-                coveredPosCnt_read1++;
-            }
-        }
-        // Read 2
-        else {
-            if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist.
-                coveredPosCnt_read2++;
-            } else if (hammingsAtEachPos[h] != -1) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively
-                hammingSum += 1.0f + (0.5f * (float) hammingsAtEachPos[h]);
-                hammingDist += hammingsAtEachPos[h];
-                coveredPosCnt_read2++;
-            }
-        }
-    }
-    delete[] hammingsAtEachPos;
-
-    // Score
-    int coveredLength_read1 = coveredPosCnt_read1 * 3;
-    int coveredLength_read2 = coveredPosCnt_read2 * 3;
-    if (coveredLength_read1 >= queryLength) coveredLength_read1 = queryLength;
-    if (coveredLength_read2 >= queryLength2) coveredLength_read2 = queryLength2;
-
-    float score = ((float) (coveredLength_read1 + coveredLength_read2) - hammingSum) / (float) (queryLength + queryLength2);
-    float coverage = (float) (coveredLength_read1 + coveredLength_read2) / (float) (queryLength + queryLength2);
-
-    return {0, score, coverage, hammingDist};
-}
-
-void Classifier::writeReadClassification(const vector<Query> & queryList, int queryNum, ofstream &readClassificationFile) {
-    for (int i = 0; i < queryNum; i++) {
-        readClassificationFile << queryList[i].isClassified << "\t" << queryList[i].name << "\t"
-                               << queryList[i].classification << "\t"
-                               << queryList[i].queryLength + queryList[i].queryLength2 << "\t"
-                               << queryList[i].score << "\t"
-                               << queryList[i].coverage << "\t"
-                               << queryList[i].hammingDist << "\t"
-                               << taxonomy->getString(taxonomy->taxonNode(queryList[i].classification)->rankIdx) << "\t";
-        for (auto it = queryList[i].taxCnt.begin(); it != queryList[i].taxCnt.end(); ++it) {
-            readClassificationFile << it->first << ":" << it->second << " ";
-        }
-        readClassificationFile << "\n";
-    }
-}
-
-void Classifier::writeReportFile(const string &outdir, int numOfQuery, unordered_map<TaxID, unsigned int> &taxCnt) {
-    unordered_map<TaxID, TaxonCounts> cladeCounts = taxonomy->getCladeCounts(taxCnt);
-    FILE *fp;
-    fp = fopen((outdir + "/" + jobId + "_report.tsv").c_str(), "w");
-    writeReport(fp, cladeCounts, numOfQuery);
-    fclose(fp);
-
-    // Write Krona chart
-    FILE *kronaFile = fopen((outDir + "/" + jobId + "_krona.html").c_str(), "w");
-    fwrite(krona_prelude_html, krona_prelude_html_len, sizeof(char), kronaFile);
-    fprintf(kronaFile, "%d", numOfQuery);
-    kronaReport(kronaFile, *taxonomy, cladeCounts, numOfQuery);
-    fprintf(kronaFile, "");
-    fclose(kronaFile);
-}
-
-void Classifier::writeReport(FILE *FP, const std::unordered_map<TaxID, TaxonCounts> &cladeCounts,
-                             unsigned long totalReads, TaxID taxID, int depth) {
-    
std::unordered_map::const_iterator it = cladeCounts.find(taxID); - unsigned int cladeCount = it == cladeCounts.end() ? 0 : it->second.cladeCount; - unsigned int taxCount = it == cladeCounts.end() ? 0 : it->second.taxCount; - if (taxID == 0) { - if (cladeCount > 0) { - fprintf(FP, "%.4f\t%i\t%i\tno rank\t0\tunclassified\n", - 100 * cladeCount / double(totalReads), - cladeCount, taxCount); - } - writeReport(FP, cladeCounts, totalReads, 1); - } else { - if (cladeCount == 0) { - return; - } - const TaxonNode *taxon = taxonomy->taxonNode(taxID); - fprintf(FP, "%.4f\t%i\t%i\t%s\t%i\t%s%s\n", - 100 * cladeCount / double(totalReads), cladeCount, taxCount, - taxonomy->getString(taxon->rankIdx), taxID, std::string(2 * depth, ' ').c_str(), taxonomy->getString(taxon->nameIdx)); - std::vector children = it->second.children; - SORT_SERIAL(children.begin(), children.end(), [&](int a, int b) { return cladeCountVal(cladeCounts, a) > cladeCountVal(cladeCounts, b); }); - for (size_t i = 0; i < children.size(); ++i) { - TaxID childTaxId = children[i]; - if (cladeCounts.count(childTaxId)) { - writeReport(FP, cladeCounts, totalReads, childTaxId, depth + 1); - } else { - break; - } - } - } -} - -unsigned int Classifier::cladeCountVal(const std::unordered_map &map, TaxID key) { - typename std::unordered_map::const_iterator it = map.find(key); - if (it == map.end()) { - return 0; - } else { - return it->second.cladeCount; - } -} - -void Classifier::splitQueryFile(vector & sequences, const std::string &queryPath) { - KSeqWrapper* kseq = nullptr; - kseq = KSeqFactory(queryPath.c_str()); - while (kseq->ReadEntry()) { - const KSeqWrapper::KSeqEntry & e = kseq->entry; - sequences.emplace_back(e.headerOffset - 1, - e.sequenceOffset + e.sequence.l, - e.sequenceOffset + e.sequence.l - e.headerOffset + 2, - e.sequence.l); - } - delete kseq; -} - -bool Classifier::isConsecutive(const Match * match1, const Match * match2) { - return (match1->rightEndHamming >> 2) == (match2->rightEndHamming & 0x3FFF); -} - -bool Classifier::isConsecutive(const Match & match1, const Match & match2, const LocalParameters & par) { - uint16_t hamming1 = match1.rightEndHamming; - uint16_t hamming2 = match2.rightEndHamming; -// if (par.printLog) { -// print_binary16(16, hamming1); cout << endl; -// print_binary16(16, hamming2); cout << endl; -// } - - // set most significant two bits to 0 - hamming2 &= 0x3FFF; // 07654321 - // move bits to right by 2 - hamming1 >>= 2; // 07654321 -// if (par.printLog) { -// print_binary16(16, hamming1); cout << endl; -// print_binary16(16, hamming2); cout << endl; -// } - - return hamming1 == hamming2; -} - diff --git a/src/commons/Classifier.h b/src/commons/Classifier.h index 06379189..5cf61ce9 100644 --- a/src/commons/Classifier.h +++ b/src/commons/Classifier.h @@ -24,286 +24,37 @@ #include #include "Match.h" #include - +#include "LocalUtil.h" +#include "QueryIndexer.h" +#include "ReducedKmerMatcher.h" +#include "KmerExtractor.h" +#include "Taxonomer.h" +#include "Reporter.h" #define BufferSize 16'777'216 //16 * 1024 * 1024 // 16 M using namespace std; -struct TaxonScore { - TaxID taxId; - float score; - float coverage; - int hammingDist; - TaxonScore(TaxID taxId, float score, float coverage, int hammingDist) : - taxId(taxId), score(score), coverage(coverage), hammingDist(hammingDist) {} - TaxonScore() : taxId(0), score(0.0f), coverage(0.0f), hammingDist(0) {} -}; + class Classifier { protected: // Parameters - int verbosity; - const int maskMode; - const float maskProb; - string queryPath_1; - string 
queryPath_2; string dbDir; - string outDir; - string jobId; - -// size_t localIndexBufferSize; -// size_t localMatchBufferSize; - - // For spaced k-mer - uint32_t * mask; - uint32_t spaceNum; - int spaceNum_int; - int unmaskedPos[9]; - - // For masking reads - ProbabilityMatrix * probMatrix; - BaseMatrix * subMat; - - uint8_t hammingMargin; - float minSpScore; - int minCoveredPos; - int maxGap; - + size_t matchPerKmer; + + // Agents + QueryIndexer * queryIndexer; + KmerExtractor * kmerExtractor; + KmerMatcher * kmerMatcher; + Taxonomer * taxonomer; + Reporter * reporter; NcbiTaxonomy * taxonomy; - unordered_map taxId2speciesId; - unordered_map taxId2genusId; - - - struct MatchBlock { - MatchBlock(size_t start, size_t end, int id) : start(start), end(end), id(id) {} - MatchBlock() : start(0), end(0), id(0) {} - size_t start; - size_t end; - uint32_t id; - }; - - struct QueryKmerSplit { - QueryKmerSplit(size_t start, size_t end, size_t length, const DiffIdxSplit& diffIdxSplit) - : start(start), end(end), length(length), diffIdxSplit(diffIdxSplit) {} - - size_t start; // start idx in query k-mer list - size_t end; // end idx in query k-mer list - size_t length; - DiffIdxSplit diffIdxSplit; // index in target k-mer list from where the search begins. - }; - - - template - struct Buffer { - T *buffer; - size_t startIndexOfReserve; - size_t bufferSize; - - explicit Buffer(size_t sizeOfBuffer=100) { - buffer = (T *) malloc(sizeof(T) * sizeOfBuffer); - bufferSize = sizeOfBuffer; - startIndexOfReserve = 0; - }; - - size_t reserveMemory(size_t numOfKmer) { - size_t offsetToWrite = __sync_fetch_and_add(&startIndexOfReserve, numOfKmer); - return offsetToWrite; - }; - - void reallocateMemory(size_t sizeOfBuffer) { - if (sizeOfBuffer > bufferSize) { - buffer = (T *) realloc(buffer, sizeof(T) * sizeOfBuffer); - bufferSize = sizeOfBuffer; - } - }; - }; - - int numOfSplit; - unordered_map taxCounts; - uint64_t MARKER; - int bitsForCodon; - uint8_t hammingLookup[8][8] = { - {0, 1, 1, 1, 2, 1, 3, 3}, - {1, 0, 1, 1, 2, 2, 3, 2}, - {1, 1, 0, 1, 2, 2, 2, 3}, - {1, 1, 1, 0, 1, 2, 3, 3}, - {2, 2, 2, 1, 0, 1, 4, 4}, - {1, 2, 2, 2, 1, 0, 4, 4}, - {3, 3, 2, 3, 4, 4, 0, 1}, - {3, 2, 3, 3, 4, 4, 1, 0}}; - - // Index reads in query file - static void splitQueryFile(vector & seqSegments, const string & queryPath); - - // Extract query k-mer - void fillQueryKmerBufferParallel(KSeqWrapper* kseq1, - QueryKmerBuffer &kmerBuffer, - vector & queryList, - const pair & currentSplit, - const LocalParameters &par); - - void fillQueryKmerBufferParallel_paired(KSeqWrapper* kseq1, - KSeqWrapper* kseq2, - QueryKmerBuffer &kmerBuffer, - vector &queryList, - const pair ¤tSplit, - const LocalParameters &par); - - static int getMaxCoveredLength(int queryLength); - - template - T getQueryKmerNumber(T queryLength); - - void linearSearchParallel( - QueryKmer *queryKmerList, - size_t &queryKmerCnt, - Buffer &matchBuffer, - const LocalParameters &par); - - void compareDna(uint64_t query, vector &targetKmersToCompare, vector &selectedMatches, - vector &selectedHammingSum, vector &rightEndHammings, uint8_t frame); - - virtual uint8_t getHammingDistanceSum(uint64_t kmer1, uint64_t kmer2); - - virtual uint16_t getHammings(uint64_t kmer1, uint64_t kmer2); - - virtual uint16_t getHammings_reverse(uint64_t kmer1, uint64_t kmer2); - - void moveMatches(Match *dest, Match *src, int& matchNum); - - // Analyzing k-mer matches - void fromMatchToClassification(const Match *matchList, - size_t numOfMatches, - vector & queryList, - const LocalParameters 
&par); - - void chooseBestTaxon(uint32_t currentQuery, - size_t offset, - size_t end, - const Match *matchList, - vector & queryList, - const LocalParameters &par); - - void remainConsecutiveMatches(vector & curFrameMatches, - vector & filteredMatches, - TaxID genusId, - const LocalParameters & par); - - size_t DFS(size_t curMatchIdx, const map>& linkedMatches, - vector& fiteredMatchIdx, size_t depth, size_t MIN_DEPTH, unordered_set& used, - unordered_map & idx2depth); - - static bool isConsecutive(const Match * match1, const Match * match2); - bool isConsecutive(const Match & match1, const Match & match2, const LocalParameters &par); - TaxonScore getBestGenusMatches(vector &matchesForMajorityLCA, const Match *matchList, size_t end, - size_t offset, int queryLength, const LocalParameters &par); - - TaxonScore getBestGenusMatches(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, - int readLength1, int readLength2, const LocalParameters &par); - - TaxonScore getBestGenusMatches_spaced(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, - int readLength1, int readLength2); - TaxonScore getBestGenusMatches_spaced(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, - int readLength1); - - TaxonScore scoreGenus(vector &filteredMatches, - int queryLength); - - TaxonScore scoreGenus(vector &filteredMatches, - int readLength1, - int readLength2); - - void scoreGenus_ExtensionScore(vector &filteredMatches, - vector> &matchesForEachGenus, - vector &scoreOfEachGenus, - int readLength1, int readLength2); - - TaxonScore chooseSpecies(const std::vector &matches, - int queryLength, - vector &species, - unordered_map> & speciesMatchRange); - - TaxonScore chooseSpecies(const std::vector &matches, - int read1Length, - int read2Length, - vector &species, - unordered_map> & speciesMatchRange); - - TaxonScore scoreSpecies(const vector &matches, - size_t begin, - size_t end, - int queryLength); - - TaxonScore scoreSpecies(const vector &matches, - size_t begin, - size_t end, - int queryLength, - int queryLength2); - - TaxID lowerRankClassification(vector &matches, pair &matchRange, TaxID speciesID); - - void getSpeciesCladeCounts(const unordered_map & taxCnt, - unordered_map & cladeCnt, - TaxID spciesID); - - TaxID BFS(const unordered_map & cladeCnt, TaxID root); - - template - static void loadBuffer(FILE * fp, T * buffer, size_t & bufferIdx, size_t size, int cnt){ - fseek(fp, cnt * sizeof(T), SEEK_CUR); - fread(buffer, sizeof(T), size, fp); - bufferIdx = 0; - } - - template - static void loadBuffer(FILE * fp, T * buffer, size_t & bufferIdx, size_t size){ - fread(buffer, sizeof(T), size, fp); - bufferIdx = 0; - } - - // Write report - void writeReadClassification(const vector & queryList, int queryNum, ofstream &readClassificationFile); - - void writeReportFile(const string &reportFileName, int numOfQuery, unordered_map &taxCnt); - - void writeReport(FILE *FP, const std::unordered_map &cladeCounts, - unsigned long totalReads, TaxID taxID = 0, int depth = 0); - - unsigned int cladeCountVal(const std::unordered_map &map, TaxID key); - - size_t AminoAcidPart(size_t kmer) const { - return (kmer) & MARKER; - } - - static size_t getCodonBits(size_t num) { - return num & 0X7U; - } - - void setMarker(uint64_t marker) { - MARKER = marker; - MARKER = ~MARKER; - } - - void setNumOfBitsForCodon(int num) { - bitsForCodon = num; - } public: - void startClassify(const LocalParameters &par); -// static uint64_t getNextTargetKmer(uint64_t 
lookingTarget, const uint16_t *targetDiffIdxList, size_t &diffIdxPos); - - static uint64_t getNextTargetKmer(uint64_t lookingTarget, - const uint16_t * diffIdxBuffer, - size_t & diffBufferIdx, - size_t & totalPos);// size_t bufferSize, FILE * diffIdxFp); - - static TargetKmerInfo getKmerInfo(size_t bufferSize, FILE * kmerInfoFp, TargetKmerInfo * infoBuffer, - size_t & infoBufferIdx); - explicit Classifier(LocalParameters & par); virtual ~Classifier(); @@ -311,59 +62,8 @@ class Classifier { }; -struct sortMatch { - bool operator() (const Match& a, const Match& b) const { - if (a.qInfo.sequenceID != b.qInfo.sequenceID) - return a.qInfo.sequenceID < b.qInfo.sequenceID; - - if (a.genusId != b.genusId) - return a.genusId < b.genusId; - - if (a.speciesId != b.speciesId) - return a.speciesId < b.speciesId; - - if (a.qInfo.frame != b.qInfo.frame) - return a.qInfo.frame < b.qInfo.frame; - - if (a.qInfo.pos != b.qInfo.pos) - return a.qInfo.pos < b.qInfo.pos; - - return a.hamming < b.hamming; - } -}; - -inline uint8_t Classifier::getHammingDistanceSum(uint64_t kmer1, uint64_t kmer2) {//12345678 - uint8_t hammingSum = 0; - hammingSum += hammingLookup[GET_3_BITS(kmer1)][GET_3_BITS(kmer2)]; - hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 3U)][GET_3_BITS(kmer2 >> 3U)]; - hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 6U)][GET_3_BITS(kmer2 >> 6U)]; - hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 9U)][GET_3_BITS(kmer2 >> 9U)]; - hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 12U)][GET_3_BITS(kmer2 >> 12U)]; - hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 15U)][GET_3_BITS(kmer2 >> 15U)]; - hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 18U)][GET_3_BITS(kmer2 >> 18U)]; - hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 21U)][GET_3_BITS(kmer2 >> 21U)]; - return hammingSum; -} -inline uint16_t Classifier::getHammings(uint64_t kmer1, uint64_t kmer2) { //hammings 87654321 - uint16_t hammings = 0; - for (int i = 0; i < 8; i++) { - hammings |= hammingLookup[GET_3_BITS(kmer1)][GET_3_BITS(kmer2)] << 2U * i; - kmer1 >>= bitsForCodon; - kmer2 >>= bitsForCodon; - } - return hammings; -} -inline uint16_t Classifier::getHammings_reverse(uint64_t kmer1, uint64_t kmer2) { //hammings 87654321 - uint16_t hammings = 0; - for (int i = 0; i < 8; i++) { - hammings |= hammingLookup[GET_3_BITS(kmer1)][GET_3_BITS(kmer2)] << 2U * (7-i); - kmer1 >>= bitsForCodon; - kmer2 >>= bitsForCodon; - } - return hammings; -} //inline uint64_t //Classifier::getNextTargetKmer(uint64_t lookingTarget, const uint16_t *targetDiffIdxList, size_t &diffIdxPos) { @@ -384,33 +84,7 @@ inline uint16_t Classifier::getHammings_reverse(uint64_t kmer1, uint64_t kmer2) // return diffIn64bit + lookingTarget; //} -inline uint64_t -Classifier::getNextTargetKmer(uint64_t lookingTarget, const uint16_t * diffIdxBuffer, size_t & diffBufferIdx, size_t & totalPos) { -// size_t bufferSize, FILE * diffIdxFp) { - uint16_t fragment; - uint16_t check = 32768; // 2^15 - uint64_t diffIn64bit = 0; - fragment = diffIdxBuffer[diffBufferIdx++]; - totalPos ++; - while (!(fragment & check)) { // 27 % - diffIn64bit |= fragment; - diffIn64bit <<= 15u; - fragment = diffIdxBuffer[diffBufferIdx++]; - totalPos ++; - } - fragment &= ~check; // not; 8.47 % - diffIn64bit |= fragment; // or : 23.6% - return diffIn64bit + lookingTarget; -} -inline -TargetKmerInfo Classifier::getKmerInfo(size_t bufferSize, FILE * kmerInfoFp, TargetKmerInfo * infoBuffer, - size_t & infoBufferIdx){ - if (unlikely(infoBufferIdx >= bufferSize)) { - loadBuffer(kmerInfoFp, infoBuffer, infoBufferIdx, 
bufferSize, (int) (infoBufferIdx - bufferSize)); - } - return infoBuffer[infoBufferIdx]; -} #endif //ADKMER4_SEARCHER_H diff --git a/src/commons/IndexCreator.h b/src/commons/IndexCreator.h index 44461d9d..087cc3b3 100644 --- a/src/commons/IndexCreator.h +++ b/src/commons/IndexCreator.h @@ -21,18 +21,14 @@ #include "NucleotideMatrix.h" #include "SubstitutionMatrix.h" #include "tantan.h" -//#include "DBReader.h" -//#include "DBWriter.h" -//#include "Debug.h" -//#include "Util.h" -//#include "FileUtil.h" + #ifdef OPENMP #include #endif -#define kmerLength 8 + struct TaxId2Fasta{ TaxID species; diff --git a/src/commons/KmerExtractor.cpp b/src/commons/KmerExtractor.cpp new file mode 100644 index 00000000..f8860ec9 --- /dev/null +++ b/src/commons/KmerExtractor.cpp @@ -0,0 +1,216 @@ +#include "KmerExtractor.h" + +KmerExtractor::KmerExtractor(const LocalParameters &par) { + spaceNum = par.spaceMask.length() - 8; + maskMode = par.maskMode; + maskProb = par.maskProb; +} + +KmerExtractor::~KmerExtractor() { + delete probMatrix; + delete subMat; +} + +void KmerExtractor::extractQueryKmers(QueryKmerBuffer &kmerBuffer, + vector & queryList, + const QuerySplit & currentSplit, + const LocalParameters &par, + KSeqWrapper* kseq1, + KSeqWrapper* kseq2) { + time_t beforeKmerExtraction = time(nullptr); + std::cout << "Extracting query metamers ... " << endl; + if (par.seqMode == 1 || par.seqMode == 3) { // Single-end short-read sequence or long-read sequence + fillQueryKmerBufferParallel(kseq1, + kmerBuffer, + queryList, + currentSplit, + par); + } else if (par.seqMode == 2) { + fillQueryKmerBufferParallel_paired(kseq1, + kseq2, + kmerBuffer, + queryList, + currentSplit, + par); + } + cout << "Time spent for metamer extraction: " << double(time(nullptr) - beforeKmerExtraction) << endl; + + // Sort query k-mer + time_t beforeQueryKmerSort = time(nullptr); + cout << "Sorting query metamer list ..." 
<< endl; + SORT_PARALLEL(kmerBuffer.buffer, kmerBuffer.buffer + kmerBuffer.startIndexOfReserve, compareForLinearSearch); + cout << "Time spent for sorting query metamer list: " << double(time(nullptr) - beforeQueryKmerSort) << endl; +} + +void KmerExtractor::fillQueryKmerBufferParallel(KSeqWrapper *kseq1, + QueryKmerBuffer &kmerBuffer, + vector &queryList, + const QuerySplit ¤tSplit, + const LocalParameters &par) { + size_t queryNum = currentSplit.end - currentSplit.start; + size_t processedQueryNum = 0; + + // Array to store reads of thread number + vector reads1(par.threads); + + while (processedQueryNum < queryNum) { + size_t currentQueryNum = min(queryNum - processedQueryNum, (size_t) par.threads); + size_t count = 0; + while (count < currentQueryNum) { + // Read query + kseq1->ReadEntry(); + const KSeqWrapper::KSeqEntry & e1 = kseq1->entry; + + // Get k-mer count + int kmerCnt = LocalUtil::getQueryKmerNumber((int) e1.sequence.l, spaceNum); + + // Query Info + queryList[processedQueryNum].queryLength = getMaxCoveredLength((int) e1.sequence.l); + queryList[processedQueryNum].name = string(e1.name.s); + queryList[processedQueryNum].kmerCnt = (int) (kmerCnt); + + // Store reads + reads1[count] = string(kseq1->entry.sequence.s); + + processedQueryNum ++; + count ++; + } +#pragma omp parallel default(none), shared(par, kmerBuffer, cout, processedQueryNum, queryList, currentQueryNum, currentSplit, count, reads1) + { + SeqIterator seqIterator(par); + size_t posToWrite; +#pragma omp for schedule(dynamic, 1) + for (size_t i = 0; i < currentQueryNum; i ++) { + size_t queryIdx = processedQueryNum - currentQueryNum + i; + // Get k-mer count + auto kmerCnt = LocalUtil::getQueryKmerNumber(reads1[i].length(), spaceNum); + + // Ignore short read + if (kmerCnt < 1) { continue; } + + // Get masked sequence + char *maskedSeq1 = nullptr; + if (maskMode) { + maskedSeq1 = new char[reads1[i].length() + 1]; + SeqIterator::maskLowComplexityRegions(reads1[i].c_str(),maskedSeq1, *probMatrix, maskProb, subMat); + } else { + maskedSeq1 = const_cast(reads1[i].c_str()); + } + + posToWrite = kmerBuffer.reserveMemory(kmerCnt); + + // Process Read 1 + seqIterator.sixFrameTranslation(maskedSeq1, (int) reads1[i].length()); + seqIterator.fillQueryKmerBuffer(maskedSeq1, (int) reads1[i].length(), kmerBuffer, posToWrite, + (uint32_t) queryIdx); + + if (maskMode) { + delete[] maskedSeq1; + } + } + } + } +} + +void KmerExtractor::fillQueryKmerBufferParallel_paired(KSeqWrapper *kseq1, + KSeqWrapper *kseq2, + QueryKmerBuffer &kmerBuffer, + vector &queryList, + const QuerySplit ¤tSplit, + const LocalParameters &par) { + size_t queryNum = currentSplit.end - currentSplit.start; + size_t processedQueryNum = 0; + + // Array to store reads of thread number + vector reads1(par.threads); + vector reads2(par.threads); + + while (processedQueryNum < queryNum) { + size_t currentQueryNum = min(queryNum - processedQueryNum, (size_t) par.threads); + size_t count = 0; + + // Fill reads in sequential + while (count < currentQueryNum) { + // Read query + kseq1->ReadEntry(); + kseq2->ReadEntry(); + const KSeqWrapper::KSeqEntry & e1 = kseq1->entry; + const KSeqWrapper::KSeqEntry & e2 = kseq2->entry; + + // Get k-mer count + int kmerCnt = LocalUtil::getQueryKmerNumber((int) e1.sequence.l, spaceNum); + int kmerCnt2 = LocalUtil::getQueryKmerNumber((int) e2.sequence.l, spaceNum); + + // Query Info + queryList[processedQueryNum].queryLength = getMaxCoveredLength((int) e1.sequence.l); + queryList[processedQueryNum].queryLength2 = 
getMaxCoveredLength((int) e2.sequence.l); + queryList[processedQueryNum].name = string(e1.name.s); + queryList[processedQueryNum].kmerCnt = (int) (kmerCnt + kmerCnt2); + + // Store reads + reads1[count] = string(kseq1->entry.sequence.s); + reads2[count] = string(kseq2->entry.sequence.s); + + processedQueryNum ++; + count ++; + } + + // Process reads in parallel +#pragma omp parallel default(none), shared(par, kmerBuffer, cout, processedQueryNum, queryList, currentQueryNum, currentSplit, count, reads1, reads2) + { + SeqIterator seqIterator(par); + SeqIterator seqIterator2(par); + size_t posToWrite; +#pragma omp for schedule(dynamic, 1) + for (size_t i = 0; i < currentQueryNum; i ++) { + size_t queryIdx = processedQueryNum - currentQueryNum + i; + // Get k-mer count + auto kmerCnt = LocalUtil::getQueryKmerNumber(reads1[i].length(), spaceNum); + auto kmerCnt2 = LocalUtil::getQueryKmerNumber(reads2[i].length(), spaceNum); + + // Ignore short read + if (kmerCnt2 < 1 || kmerCnt < 1) { continue; } + + // Get masked sequence + char *maskedSeq1 = nullptr; + char *maskedSeq2 = nullptr; + if (maskMode) { + maskedSeq1 = new char[reads1[i].length() + 1]; + maskedSeq2 = new char[reads2[i].length() + 1]; + SeqIterator::maskLowComplexityRegions(reads1[i].c_str(),maskedSeq1, *probMatrix, maskProb, subMat); + SeqIterator::maskLowComplexityRegions(reads2[i].c_str(),maskedSeq2, *probMatrix, maskProb, subMat); + } else { + maskedSeq1 = const_cast(reads1[i].c_str()); + maskedSeq2 = const_cast(reads2[i].c_str()); + } + + posToWrite = kmerBuffer.reserveMemory(kmerCnt + kmerCnt2); + + // Process Read 1 + seqIterator.sixFrameTranslation(maskedSeq1, (int) reads1[i].length()); + seqIterator.fillQueryKmerBuffer(maskedSeq1, (int) reads1[i].length(), kmerBuffer, posToWrite, + (uint32_t) queryIdx); + + // Process Read 2 + seqIterator2.sixFrameTranslation(maskedSeq2, (int) reads2[i].length()); + seqIterator2.fillQueryKmerBuffer(maskedSeq2, (int) reads2[i].length(), kmerBuffer, posToWrite, + (uint32_t) queryIdx, queryList[queryIdx].queryLength); + + if (maskMode) { + delete[] maskedSeq1; + delete[] maskedSeq2; + } + } + } + } +} + +int KmerExtractor::getMaxCoveredLength(int queryLength) { + if (queryLength % 3 == 2) { + return queryLength - 2; // 2 + } else if (queryLength % 3 == 1) { + return queryLength - 4; // 4 + } else { + return queryLength - 3; // 3 + } +} \ No newline at end of file diff --git a/src/commons/KmerExtractor.h b/src/commons/KmerExtractor.h new file mode 100644 index 00000000..260bc78c --- /dev/null +++ b/src/commons/KmerExtractor.h @@ -0,0 +1,56 @@ +#ifndef METABULI_KMEREXTRACTER_H +#define METABULI_KMEREXTRACTER_H +#include "SeqIterator.h" +#include "QueryIndexer.h" +#include "KseqWrapper.h" + +class KmerExtractor { +private: + // Parameters + int spaceNum; + int maskMode; + float maskProb; + + // For masking reads + ProbabilityMatrix * probMatrix; + BaseMatrix * subMat; + + // Extract query k-mer + void fillQueryKmerBufferParallel(KSeqWrapper* kseq1, + QueryKmerBuffer &kmerBuffer, + vector & queryList, + const QuerySplit & currentSplit, + const LocalParameters &par); + + void fillQueryKmerBufferParallel_paired(KSeqWrapper* kseq1, + KSeqWrapper* kseq2, + QueryKmerBuffer &kmerBuffer, + vector &queryList, + const QuerySplit & currentSplit, + const LocalParameters &par); + + static int getMaxCoveredLength(int queryLength) ; + +public: + explicit KmerExtractor(const LocalParameters & par); + ~KmerExtractor(); + void extractQueryKmers(QueryKmerBuffer &kmerBuffer, + vector & queryList, + const QuerySplit & 
currentSplit, + const LocalParameters &par, + KSeqWrapper* kseq1, + KSeqWrapper* kseq2 = nullptr); + + +}; + +static inline bool compareForLinearSearch(const QueryKmer &a, const QueryKmer &b) { + if (a.ADkmer < b.ADkmer) { + return true; + } else if (a.ADkmer == b.ADkmer) { + return (a.info.sequenceID < b.info.sequenceID); + } + return false; +} + +#endif //METABULI_KMEREXTRACTER_H diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp new file mode 100644 index 00000000..7117e3ce --- /dev/null +++ b/src/commons/KmerMatcher.cpp @@ -0,0 +1,466 @@ +#include "KmerMatcher.h" + +KmerMatcher::KmerMatcher(const LocalParameters & par, + NcbiTaxonomy * taxonomy) { + threads = par.threads; + std::string dbDir = par.filenames[1 + (par.seqMode == 2)]; + targetDiffIdxFileName = dbDir + "/diffIdx"; + targetInfoFileName = dbDir + "/info"; + diffIdxSplitFileName = dbDir + "/split"; + + diffIdxSplits = mmapData(diffIdxSplitFileName.c_str(), 3); + + MARKER = 16777215; + MARKER = ~ MARKER; + hammingMargin = par.hammingMargin; + totalMatchCnt = 0; + + // Load the taxonomy ID list + FILE * taxIdFile; + if((taxIdFile = fopen((dbDir + "/taxID_list").c_str(),"r")) == NULL){ + std::cout<<"Cannot open the taxID list file."<taxonNode(taxId); + if (taxId == taxon->taxId) { + TaxID speciesTaxID = taxonomy->getTaxIdAtRank(taxId, "species"); + TaxID genusTaxID = taxonomy->getTaxIdAtRank(taxId, "genus"); + while (taxon->taxId != speciesTaxID) { + taxId2speciesId[taxon->taxId] = speciesTaxID; + taxId2genusId[taxon->taxId] = genusTaxID; + taxon = taxonomy->taxonNode(taxon->parentTaxId); + } + taxId2speciesId[speciesTaxID] = speciesTaxID; + taxId2genusId[speciesTaxID] = genusTaxID; + } else { + TaxID speciesTaxID = taxonomy->getTaxIdAtRank(taxId, "species"); + TaxID genusTaxID = taxonomy->getTaxIdAtRank(taxId, "genus"); + while (taxon->taxId != speciesTaxID) { + taxId2speciesId[taxon->taxId] = speciesTaxID; + taxId2genusId[taxon->taxId] = genusTaxID; + taxon = taxonomy->taxonNode(taxon->parentTaxId); + } + taxId2speciesId[speciesTaxID] = speciesTaxID; + taxId2genusId[speciesTaxID] = genusTaxID; + taxId2speciesId[taxId] = speciesTaxID; + taxId2genusId[taxId] = genusTaxID; + } + } + fclose(taxIdFile); +} + +KmerMatcher::~KmerMatcher() { + munmap(diffIdxSplits.data, diffIdxSplits.fileSize + 1); +} + +int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer, Buffer * matchBuffer) { + size_t queryKmerNum = queryKmerBuffer->startIndexOfReserve; + QueryKmer *queryKmerList = queryKmerBuffer->buffer; + + size_t numOfDiffIdx = FileUtil::getFileSize(targetDiffIdxFileName) / sizeof(uint16_t); + + std::cout << "Comparing query and reference metamers..." << std::endl; + + // Find the first index of garbage query k-mer (UINT64_MAX) and discard from there + for (size_t checkN = queryKmerNum - 1; checkN > 0; checkN--) { + if (queryKmerList[checkN].ADkmer != UINT64_MAX) { + queryKmerNum = checkN + 1; + break; + } + } + + // Filter out meaningless target splits + size_t numOfDiffIdxSplits = diffIdxSplits.fileSize / sizeof(DiffIdxSplit); + size_t numOfDiffIdxSplits_use = numOfDiffIdxSplits; + for (size_t i = 1; i < numOfDiffIdxSplits; i++) { + if (diffIdxSplits.data[i].ADkmer == 0 || diffIdxSplits.data[i].ADkmer == UINT64_MAX) { + diffIdxSplits.data[i] = {UINT64_MAX, UINT64_MAX, UINT64_MAX}; + numOfDiffIdxSplits_use--; + } + } + + // Divide query k-mer list into blocks for multi threading. 
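Review note: the split construction below pairs each block of the sorted query k-mer list with a starting point in the reference difference index. A minimal, self-contained sketch of that pairing with hypothetical names (the actual code below also special-cases one and two threads and the trailing split):

// --- Review sketch, not part of the patch ---------------------------------
#include <cstddef>
#include <cstdint>
#include <vector>

struct RefSplit   { uint64_t firstKmerAA; size_t diffIdxOffset; };
struct QueryBlock { size_t start, end; RefSplit ref; };

// Cut the sorted query k-mer list into equal-width blocks and give each block
// the last reference split whose first k-mer does not exceed the block's
// first query k-mer (compared on the amino-acid part only).
std::vector<QueryBlock> pairBlocksWithSplits(const std::vector<uint64_t> &queryAA,
                                             const std::vector<RefSplit> &refSplits,
                                             size_t threads) {
    std::vector<QueryBlock> blocks;
    size_t width = queryAA.size() / threads;
    for (size_t t = 0; t < threads; ++t) {
        size_t start = t * width;
        size_t end = (t + 1 == threads) ? queryAA.size() - 1 : start + width - 1;
        size_t j = 0; // last split starting at or before the block's first query k-mer
        while (j + 1 < refSplits.size() && refSplits[j + 1].firstKmerAA <= queryAA[start]) {
            ++j;
        }
        blocks.push_back({start, end, refSplits[j]});
    }
    return blocks;
}
// ---------------------------------------------------------------------------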
+    // Each split has start and end points of query list + proper offset point of target k-mer list
+    std::vector<QueryKmerSplit> querySplits;
+    uint64_t queryAA;
+    std::vector<size_t> targetSplitIdxs;
+
+    if (threads == 1) { //Single thread
+        querySplits.emplace_back(0, queryKmerNum - 1, queryKmerNum, diffIdxSplits.data[0]);
+    } else if (threads == 2) { //Two threads
+        size_t splitWidth = queryKmerNum / 2;
+        querySplits.emplace_back(0, splitWidth - 1, splitWidth, diffIdxSplits.data[0]);
+        for (size_t tSplitCnt = 0; tSplitCnt < numOfDiffIdxSplits_use; tSplitCnt++) {
+            queryAA = AminoAcidPart(queryKmerList[splitWidth].ADkmer);
+            if (queryAA <= AminoAcidPart(diffIdxSplits.data[tSplitCnt].ADkmer)) {
+                tSplitCnt = tSplitCnt - (tSplitCnt != 0);
+                querySplits.emplace_back(splitWidth, queryKmerNum - 1, queryKmerNum - splitWidth,
+                                         diffIdxSplits.data[tSplitCnt]);
+                break;
+            }
+        }
+    } else { //More than two threads
+        // Divide query k-mers into blocks
+        size_t splitWidth = queryKmerNum / (threads - 1);
+        querySplits.emplace_back(0, splitWidth - 1, splitWidth, diffIdxSplits.data[0]);
+        for (size_t i = 1; i < threads; i++) {
+            queryAA = AminoAcidPart(queryKmerList[splitWidth * i].ADkmer);
+            bool needLastTargetBlock = true;
+            for (size_t j = 0; j < numOfDiffIdxSplits_use; j++) {
+                if (queryAA <= AminoAcidPart(diffIdxSplits.data[j].ADkmer)) {
+                    j = j - (j != 0);
+                    if (i != threads - 1) {
+                        querySplits.emplace_back(splitWidth * i, splitWidth * (i + 1) - 1, splitWidth,
+                                                 diffIdxSplits.data[j]);
+                    } else {
+                        querySplits.emplace_back(splitWidth * i, queryKmerNum - 1, queryKmerNum - splitWidth * i,
+                                                 diffIdxSplits.data[j]);
+                    }
+                    targetSplitIdxs.emplace_back(j);
+                    needLastTargetBlock = false;
+                    break;
+                }
+            }
+            if (needLastTargetBlock) {
+                if (i != threads - 1) { // If it is not the last split
+                    querySplits.emplace_back(splitWidth * i, splitWidth * (i + 1) - 1, splitWidth,
+                                             diffIdxSplits.data[numOfDiffIdxSplits_use - 2]);
+                    targetSplitIdxs.emplace_back(numOfDiffIdxSplits_use - 2);
+                } else {
+                    querySplits.emplace_back(splitWidth * i, queryKmerNum - 1, queryKmerNum - splitWidth * i,
+                                             diffIdxSplits.data[numOfDiffIdxSplits_use - 2]);
+                    targetSplitIdxs.emplace_back(numOfDiffIdxSplits_use - 2);
+                }
+            }
+        }
+    }
+
+    bool *splitCheckList = (bool *) malloc(sizeof(bool) * threads);
+    std::fill_n(splitCheckList, threads, false);
+    size_t completedSplitCnt = 0;
+
+    time_t beforeSearch = time(nullptr);
+
+    while (completedSplitCnt < threads) {
+        bool hasOverflow = false;
+#pragma omp parallel default(none), shared(completedSplitCnt, splitCheckList, hasOverflow, \
+querySplits, queryKmerList, matchBuffer, cout, par, targetDiffIdxFileName, numOfDiffIdx, targetInfoFileName, targetSplitIdxs)
+        {
+            // FILE
+            FILE * diffIdxFp = fopen(targetDiffIdxFileName.c_str(), "rb");
+            FILE * kmerInfoFp = fopen(targetInfoFileName.c_str(), "rb");
+
+            // Target K-mer buffer
+            uint16_t * diffIdxBuffer = (uint16_t *) malloc(sizeof(uint16_t) * (BufferSize + 1)); // size = 32 Mb
+            TargetKmerInfo * kmerInfoBuffer = (TargetKmerInfo *) malloc(sizeof(TargetKmerInfo) * (BufferSize+1)); // 64 Mb
+            size_t kmerInfoBufferIdx = 0;
+            size_t diffIdxBufferIdx = 0;
+
+            //query variables
+            uint64_t currentQuery = UINT64_MAX;
+            uint64_t currentQueryAA = UINT64_MAX;
+            QueryKmerInfo currentQueryInfo;
+
+            //target variables
+            size_t diffIdxPos = 0;
+            std::vector<uint64_t> candidateTargetKmers; // candidate target k-mers; a subset is selected later based on Hamming distance
+            std::vector<TargetKmerInfo> candidateKmerInfos;
+            uint64_t currentTargetKmer;
+
+            //Match buffer for each thread
+            int
localBufferSize = 2'000'000; // 32 Mb + auto *matches = new Match[localBufferSize]; // 16 * 2'000'000 = 32 Mb + int matchCnt = 0; + + // For debug +// SeqIterator seqIterator(par); + + //vectors for selected target k-mers + std::vector selectedHammingSum; + std::vector selectedMatches; + std::vector selectedHammings; + size_t posToWrite; + + int currMatchNum; + size_t idx; +#pragma omp for schedule(dynamic, 1) + for (size_t i = 0; i < querySplits.size(); i++) { + if (hasOverflow || splitCheckList[i]) { + continue; + } + + currentTargetKmer = querySplits[i].diffIdxSplit.ADkmer; + diffIdxBufferIdx = querySplits[i].diffIdxSplit.diffIdxOffset; + kmerInfoBufferIdx = querySplits[i].diffIdxSplit.infoIdxOffset + - (querySplits[i].diffIdxSplit.ADkmer != 0); + diffIdxPos = querySplits[i].diffIdxSplit.diffIdxOffset; + + fseek(kmerInfoFp, 4 * (long)(kmerInfoBufferIdx), SEEK_SET); + loadBuffer(kmerInfoFp, kmerInfoBuffer, kmerInfoBufferIdx, BufferSize); + fseek(diffIdxFp, 2 * (long) (diffIdxBufferIdx), SEEK_SET); + loadBuffer(diffIdxFp, diffIdxBuffer, diffIdxBufferIdx, BufferSize); + + if (i == 0) { + currentTargetKmer = getNextTargetKmer(currentTargetKmer, diffIdxBuffer, + diffIdxBufferIdx, diffIdxPos); + } + currentQuery = UINT64_MAX; + currentQueryAA = UINT64_MAX; + + size_t lastMovedQueryIdx = 0; + for (size_t j = querySplits[i].start; j < querySplits[i].end + 1; j++) { + querySplits[i].start++; + + // Reuse the comparison data if queries are exactly identical + if (currentQuery == queryKmerList[j].ADkmer + && (currentQueryInfo.frame/3 == queryKmerList[j].info.frame/3)) { + currMatchNum = selectedMatches.size(); + // If local buffer is full, copy them to the shared buffer. + if (matchCnt + currMatchNum > localBufferSize) { + // Check if the shared buffer is full. + posToWrite = matchBuffer->reserveMemory(matchCnt); + if (posToWrite + matchCnt >= matchBuffer->bufferSize) { + hasOverflow = true; + querySplits[i].start = lastMovedQueryIdx + 1; + __sync_fetch_and_sub(& matchBuffer->startIndexOfReserve, matchCnt); + break; + } else { // not full -> copy matches to the shared buffer + moveMatches(matchBuffer->buffer + posToWrite, matches, matchCnt); + lastMovedQueryIdx = j; + } + } + for (int k = 0; k < currMatchNum; k++) { + idx = selectedMatches[k]; + matches[matchCnt] = {queryKmerList[j].info, + candidateKmerInfos[idx].sequenceID, + taxId2genusId[candidateKmerInfos[idx].sequenceID], + taxId2speciesId[candidateKmerInfos[idx].sequenceID], + selectedHammings[k], + selectedHammingSum[k], + (bool) candidateKmerInfos[idx].redundancy}; + matchCnt++; + } + continue; + } + selectedMatches.clear(); + selectedHammingSum.clear(); + selectedHammings.clear(); + + // Reuse the candidate target k-mers to compare in DNA level if queries are the same at amino acid level but not at DNA level + if (currentQueryAA == AminoAcidPart(queryKmerList[j].ADkmer)) { + compareDna(queryKmerList[j].ADkmer, candidateTargetKmers, selectedMatches, + selectedHammingSum, selectedHammings,queryKmerList[j].info.frame); + currMatchNum = selectedMatches.size(); + + // If local buffer is full, copy them to the shared buffer. + if (matchCnt + currMatchNum > localBufferSize) { + // Check if the shared buffer is full. 
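Review note: matchBuffer->reserveMemory() is used here as an atomic reservation with rollback on overflow. A minimal sketch of that pattern (hypothetical SharedBuffer type; the patch itself rolls back with __sync_fetch_and_sub):

// --- Review sketch, not part of the patch ---------------------------------
#include <atomic>
#include <cstddef>

template <typename T>
struct SharedBuffer {
    T *data = nullptr;
    size_t capacity = 0;
    std::atomic<size_t> reserved{0};

    // Returns the write offset, or `capacity` on overflow after rolling the
    // reservation back (mirrors "posToWrite + matchCnt >= bufferSize" above).
    size_t reserve(size_t n) {
        size_t pos = reserved.fetch_add(n);
        if (pos + n >= capacity) {
            reserved.fetch_sub(n); // roll back so a later retry can succeed
            return capacity;
        }
        return pos;
    }
};
// ---------------------------------------------------------------------------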
+ posToWrite = matchBuffer->reserveMemory(matchCnt); + if (posToWrite + matchCnt >= matchBuffer->bufferSize) { + hasOverflow = true; + querySplits[i].start = lastMovedQueryIdx + 1; + __sync_fetch_and_sub(& matchBuffer->startIndexOfReserve, matchCnt); + break; + } else { // not full -> copy matches to the shared buffer + moveMatches(matchBuffer->buffer + posToWrite, matches, matchCnt); + lastMovedQueryIdx = j; + } + } + for (int k = 0; k < currMatchNum; k++) { + idx = selectedMatches[k]; + matches[matchCnt] = {queryKmerList[j].info, + candidateKmerInfos[idx].sequenceID, + taxId2genusId[candidateKmerInfos[idx].sequenceID], + taxId2speciesId[candidateKmerInfos[idx].sequenceID], + selectedHammings[k], + selectedHammingSum[k], + (bool) candidateKmerInfos[idx].redundancy}; + matchCnt++; + } + currentQuery = queryKmerList[j].ADkmer; + currentQueryAA = AminoAcidPart(currentQuery); + currentQueryInfo = queryKmerList[j].info; + continue; + } + candidateTargetKmers.clear(); + candidateKmerInfos.clear(); + + // Get next query, and start to find + currentQuery = queryKmerList[j].ADkmer; + currentQueryAA = AminoAcidPart(currentQuery); + currentQueryInfo = queryKmerList[j].info; + + // Skip target k-mers that are not matched in amino acid level + while (diffIdxPos != numOfDiffIdx + && (currentQueryAA > AminoAcidPart(currentTargetKmer))) { + if (unlikely(BufferSize < diffIdxBufferIdx + 7)){ + loadBuffer(diffIdxFp, diffIdxBuffer, diffIdxBufferIdx, BufferSize, ((int)(BufferSize - diffIdxBufferIdx)) * -1 ); + } + currentTargetKmer = getNextTargetKmer(currentTargetKmer, diffIdxBuffer, + diffIdxBufferIdx, diffIdxPos); + kmerInfoBufferIdx ++; + } + + if (currentQueryAA != AminoAcidPart(currentTargetKmer)) // Move to next query k-mer if there isn't any match. + continue; + + // Load target k-mers that are matched in amino acid level + while (diffIdxPos != numOfDiffIdx && + currentQueryAA == AminoAcidPart(currentTargetKmer)) { + candidateTargetKmers.push_back(currentTargetKmer); + candidateKmerInfos.push_back(getKmerInfo(BufferSize, kmerInfoFp, kmerInfoBuffer, kmerInfoBufferIdx)); + // Print the target k-mer +// if (par.printLog == 1) { +// cout << queryKmerList[j].info.sequenceID << "\t" << queryKmerList[j].info.pos << "\t" +// << (int) queryKmerList[j].info.frame << endl; +// cout << "Query k-mer: "; +// print_binary64(64, currentQuery); +// cout << "\t"; +// seqIterator.printKmerInDNAsequence(currentQuery); +// cout << endl; +// cout << "Target k-mer: "; +// print_binary64(64, currentTargetKmer); +// cout << "\t"; +// seqIterator.printKmerInDNAsequence(currentTargetKmer); +// cout << "\t" << kmerInfoBuffer[kmerInfoBufferIdx].sequenceID +// << "\t" << taxId2speciesId[kmerInfoBuffer[kmerInfoBufferIdx].sequenceID] << endl; +// cout << (int) getHammingDistanceSum(currentQuery, currentTargetKmer) << "\t"; +// print_binary16(16, getHammings(currentQuery, currentTargetKmer)); cout << endl; +// } + + if (unlikely(BufferSize < diffIdxBufferIdx + 7)){ + loadBuffer(diffIdxFp, diffIdxBuffer, diffIdxBufferIdx, + BufferSize, ((int)(BufferSize - diffIdxBufferIdx)) * -1 ); + } + + currentTargetKmer = getNextTargetKmer(currentTargetKmer, diffIdxBuffer, + diffIdxBufferIdx, diffIdxPos); + kmerInfoBufferIdx ++; + } + + // Compare the current query and the loaded target k-mers and select + compareDna(currentQuery, candidateTargetKmers, selectedMatches, selectedHammingSum, + selectedHammings, queryKmerList[j].info.frame); + + // If local buffer is full, copy them to the shared buffer. 
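Review note: the "BufferSize < diffIdxBufferIdx + 7" guards above implement a rolling refill of the difference-index buffer. A standalone sketch of the same idea (hypothetical names; seven uint16_t fragments are used in the patch as a safe upper bound for one encoded delta):

// --- Review sketch, not part of the patch ---------------------------------
#include <cstddef>
#include <cstdint>
#include <cstdio>

// When fewer than `margin` entries remain unread, seek back by the unread
// remainder and refill the whole buffer from that position.
bool refillIfNeeded(FILE *fp, uint16_t *buf, size_t &idx, size_t bufSize,
                    size_t margin = 7) {
    if (bufSize < idx + margin) {
        long remaining = (long) (bufSize - idx);
        fseek(fp, -remaining * (long) sizeof(uint16_t), SEEK_CUR);
        size_t got = fread(buf, sizeof(uint16_t), bufSize, fp);
        idx = 0;
        return got > 0;
    }
    return true;
}
// ---------------------------------------------------------------------------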
+ currMatchNum = selectedMatches.size(); + if (matchCnt + currMatchNum > localBufferSize) { + // Check if the shared buffer is full. + posToWrite = matchBuffer->reserveMemory(matchCnt); + if (posToWrite + matchCnt >= matchBuffer->bufferSize) { // full -> write matches to file first + hasOverflow = true; + querySplits[i].start = lastMovedQueryIdx + 1; + __sync_fetch_and_sub(&matchBuffer->startIndexOfReserve, matchCnt); + break; + } else { // not full -> copy matches to the shared buffer + moveMatches(matchBuffer->buffer + posToWrite, matches, matchCnt); + lastMovedQueryIdx = j; + } + } + + for (int k = 0; k < currMatchNum; k++) { + idx = selectedMatches[k]; + matches[matchCnt] = {queryKmerList[j].info, + candidateKmerInfos[idx].sequenceID, + taxId2genusId[candidateKmerInfos[idx].sequenceID], + taxId2speciesId[candidateKmerInfos[idx].sequenceID], + selectedHammings[k], + selectedHammingSum[k], + (bool) candidateKmerInfos[idx].redundancy}; + matchCnt++; + } + } // End of one split + + // Move matches in the local buffer to the shared buffer + posToWrite = matchBuffer->reserveMemory(matchCnt); + if (posToWrite + matchCnt >= matchBuffer->bufferSize) { + hasOverflow = true; + querySplits[i].start = lastMovedQueryIdx + 1; + __sync_fetch_and_sub(& matchBuffer->startIndexOfReserve, matchCnt); + } else { + moveMatches(matchBuffer->buffer + posToWrite, matches, matchCnt); + } + + // Check whether current split is completed or not + if (querySplits[i].start - 1 == querySplits[i].end) { + splitCheckList[i] = true; + __sync_fetch_and_add(&completedSplitCnt, 1); + } + } // End of omp for (Iterating for splits) + delete[] matches; + fclose(diffIdxFp); + fclose(kmerInfoFp); + free(diffIdxBuffer); + free(kmerInfoBuffer); + } // End of omp parallel + if (hasOverflow) { + std::cout << "overflow!!!" << std::endl; + return 2; + } + } // end of while(completeSplitCnt < threadNum) + std::cout << "Time spent for the comparison: " << double(time(nullptr) - beforeSearch) << std::endl; + munmap(diffIdxSplits.data, diffIdxSplits.fileSize + 1); + free(splitCheckList); + queryKmerNum = 0; + +#ifdef OPENMP + omp_set_num_threads(par.threads); +#endif + + // Sort matches + time_t beforeSortMatches = time(nullptr); + totalMatchCnt += matchBuffer->startIndexOfReserve; + std::cout << "Sorting matches ..." << std::endl; + SORT_PARALLEL(matchBuffer->buffer, matchBuffer->buffer + matchBuffer->startIndexOfReserve, + sortMatch()); + std::cout << "Time spent for sorting matches: " << double(time(nullptr) - beforeSortMatches) << std::endl; + + return 1; +} + +void KmerMatcher::moveMatches(Match *dest, Match *src, int &matchNum) { + memcpy(dest, src, sizeof(Match) * matchNum); + matchNum = 0; +} + +// It compares query k-mers to target k-mers. 
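Review note: the parallel sort above relies on the multi-key sortMatch comparator defined in KmerMatcher.h further down. The same lexicographic order can be written compactly with std::tie; a standalone illustration with a simplified, hypothetical match record:

// --- Review sketch, not part of the patch ---------------------------------
#include <algorithm>
#include <cstdint>
#include <tuple>
#include <vector>

struct M { uint32_t query, genus, species; uint8_t frame; uint32_t pos, hamming; };

// Query first, then genus, species, frame, position, and Hamming sum,
// i.e. the same cascade of comparisons as sortMatch below.
void sortMatches(std::vector<M> &ms) {
    std::sort(ms.begin(), ms.end(), [](const M &a, const M &b) {
        return std::tie(a.query, a.genus, a.species, a.frame, a.pos, a.hamming)
             < std::tie(b.query, b.genus, b.species, b.frame, b.pos, b.hamming);
    });
}
// ---------------------------------------------------------------------------

std::tie builds tuples of references, so the comparison allocates nothing and compiles down to the same cascaded tests as the hand-written comparator.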
+// If a query has matches, the matches with the smallest hamming distance will be selected +void KmerMatcher::compareDna(uint64_t query, + std::vector &targetKmersToCompare, + std::vector &selectedMatches, + std::vector &selectedHammingSum, + std::vector &selectedHammings, uint8_t frame) { + + size_t size = targetKmersToCompare.size(); + auto *hammingSums = new uint8_t[size + 1]; + uint8_t currentHammingSum; + uint8_t minHammingSum = UINT8_MAX; + + // Calculate hamming distance + for (size_t i = 0; i < size; i++) { + currentHammingSum = getHammingDistanceSum(query, targetKmersToCompare[i]); + if (currentHammingSum < minHammingSum) { + minHammingSum = currentHammingSum; + } + hammingSums[i] = currentHammingSum; + } + + // Select target k-mers that passed hamming criteria + for (size_t h = 0; h < size; h++) { + if (hammingSums[h] <= minHammingSum + hammingMargin) { + selectedMatches.push_back(h); + selectedHammingSum.push_back(hammingSums[h]); + if (frame < 3) { + selectedHammings.push_back(getHammings(query, targetKmersToCompare[h])); + } else { + selectedHammings.push_back(getHammings_reverse(query, targetKmersToCompare[h])); + } + } + } + delete[] hammingSums; +} \ No newline at end of file diff --git a/src/commons/KmerMatcher.h b/src/commons/KmerMatcher.h new file mode 100644 index 00000000..56379b7a --- /dev/null +++ b/src/commons/KmerMatcher.h @@ -0,0 +1,196 @@ +#ifndef METABULI_KMERMATCHER_H +#define METABULI_KMERMATCHER_H +#include "KmerBuffer.h" +#include "Match.h" +#include "common.h" +#include "LocalParameters.h" +#include +#include "FileUtil.h" +#include "Mmap.h" +#include "BitManipulateMacros.h" +#include "NcbiTaxonomy.h" + +#define BufferSize 16'777'216 //16 * 1024 * 1024 // 16 M + +// Input +// 1. Query K-mers +// 2. Reference K-mers + +// Output +// 1. Matched K-mers + + + +class KmerMatcher { +protected: + NcbiTaxonomy * taxonomy; + size_t threads; + std::string targetDiffIdxFileName, targetInfoFileName, diffIdxSplitFileName; + MmapedData diffIdxSplits; + uint64_t MARKER; + int bitsForCodon = 3; + uint8_t hammingMargin; + size_t totalMatchCnt; + uint8_t hammingLookup[8][8] = { + {0, 1, 1, 1, 2, 1, 3, 3}, + {1, 0, 1, 1, 2, 2, 3, 2}, + {1, 1, 0, 1, 2, 2, 2, 3}, + {1, 1, 1, 0, 1, 2, 3, 3}, + {2, 2, 2, 1, 0, 1, 4, 4}, + {1, 2, 2, 2, 1, 0, 4, 4}, + {3, 3, 2, 3, 4, 4, 0, 1}, + {3, 2, 3, 3, 4, 4, 1, 0}}; + unordered_map taxId2speciesId; + unordered_map taxId2genusId; + + + struct QueryKmerSplit { + QueryKmerSplit(size_t start, size_t end, size_t length, const DiffIdxSplit& diffIdxSplit) + : start(start), end(end), length(length), diffIdxSplit(diffIdxSplit) {} + + size_t start; // start idx in query k-mer list + size_t end; // end idx in query k-mer list + size_t length; + DiffIdxSplit diffIdxSplit; // index in target k-mer list from where the search begins. 
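Review note: compareDna() above makes two passes over the candidates, one to find the minimum Hamming sum and one to keep everything within hammingMargin of it. The selection rule in isolation (standalone sketch, hypothetical names):

// --- Review sketch, not part of the patch ---------------------------------
#include <cstddef>
#include <cstdint>
#include <vector>

// Return indices of candidates whose Hamming sum is within `margin` of the
// best one; integer promotion makes `best + margin` safe against overflow.
std::vector<size_t> selectWithinMargin(const std::vector<uint8_t> &hammingSums,
                                       uint8_t margin) {
    uint8_t best = UINT8_MAX;
    for (uint8_t h : hammingSums) {
        if (h < best) best = h;
    }
    std::vector<size_t> selected;
    for (size_t i = 0; i < hammingSums.size(); ++i) {
        if (hammingSums[i] <= best + margin) selected.push_back(i);
    }
    return selected;
}
// ---------------------------------------------------------------------------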
+ }; + + size_t AminoAcidPart(size_t kmer) const { return (kmer) & MARKER; } + + template + static void loadBuffer(FILE * fp, T * buffer, size_t & bufferIdx, size_t size){ + fread(buffer, sizeof(T), size, fp); + bufferIdx = 0; + } + + template + static void loadBuffer(FILE * fp, T * buffer, size_t & bufferIdx, size_t size, int cnt){ + fseek(fp, cnt * sizeof(T), SEEK_CUR); + fread(buffer, sizeof(T), size, fp); + bufferIdx = 0; + } + + static uint64_t getNextTargetKmer(uint64_t lookingTarget, + const uint16_t * diffIdxBuffer, + size_t & diffBufferIdx, + size_t & totalPos); + + + static TargetKmerInfo getKmerInfo(size_t bufferSize, + FILE *kmerInfoFp, + TargetKmerInfo *infoBuffer, + size_t &infoBufferIdx); + + void moveMatches(Match *dest, + Match *src, + int &matchNum); + + void compareDna(uint64_t query, + std::vector &targetKmersToCompare, + std::vector &selectedMatches, + std::vector &selectedHammingSum, + std::vector &rightEndHammings, + uint8_t frame); + + virtual uint8_t getHammingDistanceSum(uint64_t kmer1, uint64_t kmer2); + + virtual uint16_t getHammings(uint64_t kmer1, uint64_t kmer2); + + virtual uint16_t getHammings_reverse(uint64_t kmer1, uint64_t kmer2); + +public: + KmerMatcher(const LocalParameters & par, + NcbiTaxonomy * taxonomy); + + virtual ~KmerMatcher(); + + int matchKmers(QueryKmerBuffer * queryKmerBuffer, Buffer * matchBuffer); + +}; + +inline +uint64_t KmerMatcher::getNextTargetKmer(uint64_t lookingTarget, + const uint16_t *diffIdxBuffer, + size_t &diffBufferIdx, + size_t &totalPos) { + uint16_t fragment; + uint16_t check = 32768; // 2^15 + uint64_t diffIn64bit = 0; + fragment = diffIdxBuffer[diffBufferIdx++]; + totalPos++; + while (!(fragment & check)) { // 27 % + diffIn64bit |= fragment; + diffIn64bit <<= 15u; + fragment = diffIdxBuffer[diffBufferIdx++]; + totalPos++; + } + fragment &= ~check; // not; 8.47 % + diffIn64bit |= fragment; // or : 23.6% + return diffIn64bit + lookingTarget; +} + +inline +TargetKmerInfo KmerMatcher::getKmerInfo(size_t bufferSize, + FILE * kmerInfoFp, + TargetKmerInfo * infoBuffer, + size_t & infoBufferIdx){ + if (unlikely(infoBufferIdx >= bufferSize)) { + loadBuffer(kmerInfoFp, infoBuffer, infoBufferIdx, bufferSize, (int) (infoBufferIdx - bufferSize)); + } + return infoBuffer[infoBufferIdx]; +} + +inline uint8_t KmerMatcher::getHammingDistanceSum(uint64_t kmer1, uint64_t kmer2) {//12345678 + uint8_t hammingSum = 0; + hammingSum += hammingLookup[GET_3_BITS(kmer1)][GET_3_BITS(kmer2)]; + hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 3U)][GET_3_BITS(kmer2 >> 3U)]; + hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 6U)][GET_3_BITS(kmer2 >> 6U)]; + hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 9U)][GET_3_BITS(kmer2 >> 9U)]; + hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 12U)][GET_3_BITS(kmer2 >> 12U)]; + hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 15U)][GET_3_BITS(kmer2 >> 15U)]; + hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 18U)][GET_3_BITS(kmer2 >> 18U)]; + hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 21U)][GET_3_BITS(kmer2 >> 21U)]; + return hammingSum; +} + +inline uint16_t KmerMatcher::getHammings(uint64_t kmer1, uint64_t kmer2) { //hammings 87654321 + uint16_t hammings = 0; + for (int i = 0; i < 8; i++) { + hammings |= hammingLookup[GET_3_BITS(kmer1)][GET_3_BITS(kmer2)] << 2U * i; + kmer1 >>= bitsForCodon; + kmer2 >>= bitsForCodon; + } + return hammings; +} + +inline uint16_t KmerMatcher::getHammings_reverse(uint64_t kmer1, uint64_t kmer2) { //hammings 87654321 + uint16_t hammings = 0; + for (int i = 0; i < 8; i++) { + 
hammings |= hammingLookup[GET_3_BITS(kmer1)][GET_3_BITS(kmer2)] << 2U * (7-i); + kmer1 >>= bitsForCodon; + kmer2 >>= bitsForCodon; + } + return hammings; +} + +struct sortMatch { + bool operator() (const Match& a, const Match& b) const { + if (a.qInfo.sequenceID != b.qInfo.sequenceID) + return a.qInfo.sequenceID < b.qInfo.sequenceID; + + if (a.genusId != b.genusId) + return a.genusId < b.genusId; + + if (a.speciesId != b.speciesId) + return a.speciesId < b.speciesId; + + if (a.qInfo.frame != b.qInfo.frame) + return a.qInfo.frame < b.qInfo.frame; + + if (a.qInfo.pos != b.qInfo.pos) + return a.qInfo.pos < b.qInfo.pos; + + return a.hamming < b.hamming; + } +}; + +#endif //METABULI_KMERMATCHER_H diff --git a/src/commons/LocalUtil.cpp b/src/commons/LocalUtil.cpp new file mode 100644 index 00000000..039c635a --- /dev/null +++ b/src/commons/LocalUtil.cpp @@ -0,0 +1,40 @@ +#include "LocalUtil.h" + + +std::string LocalUtil::getQueryBaseName(const std::string queryPath) { + std::vector splits = Util::split(queryPath, "."); + std::string baseName; + int extentionNum = 1; + if (Util::endsWith(".gz", queryPath)) { + extentionNum = 2; + } + for (size_t i = 0; i < splits.size() - extentionNum; ++i) { + if (i == splits.size() - extentionNum - 1) { + baseName += splits[i]; + } else { + baseName += splits[i] + "."; + } + } + return baseName; +} + +template +T LocalUtil::getQueryKmerNumber(T queryLength, int spaceNum) { + return (getMaxCoveredLength(queryLength) / 3 - kmerLength - spaceNum + 1) * 6; +} + + +void LocalUtil::splitQueryFile(std::vector & sequences, const std::string &queryPath) { + KSeqWrapper* kseq = nullptr; + kseq = KSeqFactory(queryPath.c_str()); + while (kseq->ReadEntry()) { + const KSeqWrapper::KSeqEntry & e = kseq->entry; + sequences.emplace_back(e.headerOffset - 1, + e.sequenceOffset + e.sequence.l, + e.sequenceOffset + e.sequence.l - e.headerOffset + 2, + e.sequence.l); + } + delete kseq; +} + + diff --git a/src/commons/LocalUtil.h b/src/commons/LocalUtil.h new file mode 100644 index 00000000..1d34a45c --- /dev/null +++ b/src/commons/LocalUtil.h @@ -0,0 +1,23 @@ +#ifndef METABULI_LOCALUTIL_H +#define METABULI_LOCALUTIL_H + +#include "Util.h" +#include +#include "common.h" +#include "KSeqWrapper.h" + +class LocalUtil : public Util { +public: + LocalUtil() = default; + + static std::string getQueryBaseName(const std::string queryPath); + + template + static T getQueryKmerNumber(T queryLength, int spaceNum); + + static void splitQueryFile(vector & seqSegments, const string & queryPath); + +}; + + +#endif //METABULI_LOCALUTIL_H diff --git a/src/commons/Mmap.h b/src/commons/Mmap.h index 8b680e55..5d530716 100644 --- a/src/commons/Mmap.h +++ b/src/commons/Mmap.h @@ -1,7 +1,3 @@ -// -// Created by KJB on 26/08/2020. 
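Review note: a worked, standalone check of the counting in LocalUtil::getQueryKmerNumber() above (assumes the default k = 8 and a contiguous mask, i.e. spaceNum = 0):

// --- Review sketch, not part of the patch ---------------------------------
#include <cassert>

// Same truncation rule as KmerExtractor::getMaxCoveredLength().
int maxCoveredLength(int len) {
    if (len % 3 == 2) return len - 2;
    if (len % 3 == 1) return len - 4;
    return len - 3;
}

// Each of the six frames yields L/3 amino acids, hence L/3 - k - spaceNum + 1 k-mers.
int queryKmerNumber(int len, int spaceNum, int k = 8) {
    return (maxCoveredLength(len) / 3 - k - spaceNum + 1) * 6;
}

int main() {
    // A 150 nt read: 147 covered nt -> 49 aa per frame -> 42 k-mers per frame.
    assert(queryKmerNumber(150, 0) == 42 * 6);
    return 0;
}
// ---------------------------------------------------------------------------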
-// - #ifndef ADKMER3_MMAP_H #define ADKMER3_MMAP_H #include diff --git a/src/commons/QueryFilter.cpp b/src/commons/QueryFilter.cpp new file mode 100644 index 00000000..636f65d2 --- /dev/null +++ b/src/commons/QueryFilter.cpp @@ -0,0 +1,52 @@ +#include "QueryFilter.h" + +QueryFilter::QueryFilter(LocalParameters & par) { + if (par.reducedAA == 1) { + classifier = new ReducedClassifier(par); + } else { + classifier = new Classifier(par); + } + queryIndexer = new QueryIndexer(par); + + setInputAndOutputFiles(par); +} + +QueryFilter::~QueryFilter() { + delete queryIndexer; + delete classifier; +} + +void QueryFilter::setInputAndOutputFiles(const LocalParameters & par) { + // Get the base name of in1 + in1 = par.filenames[0]; + string baseName = LocalUtil::getQueryBaseName(in1); + + // Set the output file names + out1 = baseName + "_filtered.fna.gz"; + reportFileName = baseName + "_filter_report.tsv"; + + // For paired-end reads + if (par.seqMode == 2) { + in2 = par.filenames[1]; + out2 = LocalUtil::getQueryBaseName(in2) + "_filtered.fna.gz"; + } +} + +void QueryFilter::filterReads(LocalParameters & par) { + + cout << "Indexing query file ..."; + queryIndexer->indexQueryFile(); + size_t numOfSeq = queryIndexer->getReadNum_1(); + size_t totalReadLength = queryIndexer->getTotalReadLength(); + const vector & queryReadSplit = queryIndexer->getQuerySplits(); + cout << "Done" << endl; + cout << "Total number of sequences: " << numOfSeq << endl; + cout << "Total read length: " << totalReadLength << "nt" << endl; + + QueryKmerBuffer kmerBuffer; + Buffer matchBuffer; + vector queryList; + + +} + diff --git a/src/commons/QueryFilter.h b/src/commons/QueryFilter.h new file mode 100644 index 00000000..33fa7de1 --- /dev/null +++ b/src/commons/QueryFilter.h @@ -0,0 +1,23 @@ +#ifndef METABULI_FILTERER_H +#define METABULI_FILTERER_H + +#include "LocalUtil.h" +#include "QueryIndexer.h" +#include "ReducedKmerMatcher.h" +class QueryFilter { +private: + QueryIndexer * queryIndexer; + KmerMatcher * kmerMatcher; + + std::string in1, in2, out1, out2, reportFileName; // input and output file names + + void setInputAndOutputFiles(const LocalParameters & par); + +public: + void filterReads(LocalParameters & par); + explicit QueryFilter(LocalParameters & par); + ~QueryFilter(); +}; + + +#endif //METABULI_FILTERER_H diff --git a/src/commons/QueryIndexer.cpp b/src/commons/QueryIndexer.cpp new file mode 100644 index 00000000..f5a30ff9 --- /dev/null +++ b/src/commons/QueryIndexer.cpp @@ -0,0 +1,102 @@ +#include "QueryIndexer.h" + +QueryIndexer::QueryIndexer(const LocalParameters & par) { + seqMode = par.seqMode; + if (seqMode == 1 || seqMode == 3) { + queryPath_1 = par.filenames[0]; + queryPath_2 = ""; + } else { + queryPath_1 = par.filenames[0]; + queryPath_2 = par.filenames[1]; + } + + matchPerKmer = par.matchPerKmer; + maxRam = par.ramUsage; + threads = par.threads; + bytesPerKmer = sizeof(QueryKmer) + matchPerKmer * sizeof(Match); + readNum_1 = 0; + readNum_2 = 0; + spaceNum = par.spaceMask.length() - kmerLength; + + setAvailableRam(); +} + +void QueryIndexer::setAvailableRam() { + availableRam = ((size_t) maxRam * (size_t) 1024 * 1024 * 1024) + - ((size_t) 134217728 * (size_t) threads); +} + +void QueryIndexer::indexQueryFile() { + // Read 1 + KSeqWrapper* kseq; + kseq = KSeqFactory(queryPath_1.c_str()); + size_t kmerCnt = 0; + size_t seqCnt = 0; + size_t start = 0; + while (kseq->ReadEntry()) { + readNum_1++; + const KSeqWrapper::KSeqEntry &e = kseq->entry; + totalReadLength += e.sequence.l; + size_t currentKmerCnt = 
LocalUtil::getQueryKmerNumber(e.sequence.l, spaceNum);
+            kmerCnt += currentKmerCnt;
+            seqCnt++;
+            if (bytesPerKmer * kmerCnt + ((size_t) 200 * seqCnt) > availableRam) {
+                querySplits.emplace_back(start, readNum_2, kmerCnt - currentKmerCnt);
+                kmerCnt = currentKmerCnt;
+                start = readNum_2;
+                seqCnt = 1;
+            }
+        }
+        querySplits.emplace_back(start, readNum_2, kmerCnt);
+        delete kseq;
+
+        // Check if the number of reads in the two files is equal
+        if (readNum_1 != readNum_2) {
+            Debug(Debug::ERROR) << "The number of reads in the two files is not equal." << "\n";
+            EXIT(EXIT_FAILURE);
+        }
+    }
+}
+
+size_t QueryIndexer::getReadNum_1() const {
+    return readNum_1;
+}
+
+size_t QueryIndexer::getReadNum_2() const {
+    return readNum_2;
+}
+
+const std::vector<QuerySplit> & QueryIndexer::getQuerySplits() const {
+    return querySplits;
+}
+
+std::size_t QueryIndexer::getTotalReadLength() const {
+    return totalReadLength;
+}
+
+size_t QueryIndexer::getAvailableRam() const {
+    return availableRam;
+}
\ No newline at end of file
diff --git a/src/commons/QueryIndexer.h b/src/commons/QueryIndexer.h
new file mode 100644
index 00000000..93e308eb
--- /dev/null
+++ b/src/commons/QueryIndexer.h
@@ -0,0 +1,67 @@
+#ifndef METABULI_QUERYINDEXOR_H
+#define METABULI_QUERYINDEXOR_H
+
+#include "LocalParameters.h"
+#include "Kmer.h"
+#include "Match.h"
+#include "KSeqWrapper.h"
+#include "LocalUtil.h"
+#include "Debug.h"
+
+struct QuerySplit {
+    size_t start;
+    size_t end;
+    size_t kmerCnt;
+
+    QuerySplit(size_t start, size_t end, size_t kmerCnt) : start(start), end(end), kmerCnt(kmerCnt) {}
+};
+
+// Input
+// 1. A set of reads
+
+// Output
+// 1. size_t numOfSeq;
+// 2. vector<QuerySplit> querySplits;
+
+class QueryIndexer {
+private:
+    // Input
+    std::string queryPath_1;
+    std::string queryPath_2;
+    size_t seqMode;
+    size_t matchPerKmer;
+    size_t maxRam;
+    size_t threads;
+    int spaceNum;
+
+    // Internal
+    size_t availableRam;
+    size_t bytesPerKmer;
+
+    // Output
+    std::size_t readNum_1;
+    std::size_t readNum_2;
+    std::vector<QuerySplit> querySplits;
+    std::size_t totalReadLength;
+
+public:
+    explicit QueryIndexer(const LocalParameters & par);
+    ~QueryIndexer() = default;
+
+    void indexQueryFile();
+
+    // Getters
+    size_t getReadNum_1() const;
+    size_t getReadNum_2() const;
+    const std::vector<QuerySplit> & getQuerySplits() const;
+    std::size_t getTotalReadLength() const;
+    size_t getAvailableRam() const;
+
+
+    // Setters
+    void setAvailableRam();
+
+};
+
+
+#endif //METABULI_QUERYINDEXOR_H
diff --git a/src/commons/ReducedClassifier.cpp b/src/commons/ReducedClassifier.cpp
deleted file mode 100644
index bd5b5f93..00000000
--- a/src/commons/ReducedClassifier.cpp
+++ /dev/null
@@ -1,11 +0,0 @@
-//
-// Created by 김재범 on 2022/06/28.
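Review note: indexQueryFile() above sizes query splits against the RAM budget from setAvailableRam(). The policy in isolation (standalone sketch; the 200-byte constant is the per-read bookkeeping estimate used in the patch, and `end` is exclusive here):

// --- Review sketch, not part of the patch ---------------------------------
#include <cstddef>
#include <vector>

struct SplitSketch { size_t start, end, kmerCnt; };

// Accumulate reads until the projected use (k-mer buffer + per-read overhead)
// would exceed the budget, then close the split before the current read.
std::vector<SplitSketch> splitByRam(const std::vector<size_t> &kmersPerRead,
                                    size_t bytesPerKmer, size_t availableRam) {
    std::vector<SplitSketch> splits;
    size_t start = 0, kmerCnt = 0, seqCnt = 0;
    for (size_t r = 0; r < kmersPerRead.size(); ++r) {
        size_t c = kmersPerRead[r];
        if (seqCnt > 0 &&
            bytesPerKmer * (kmerCnt + c) + 200 * (seqCnt + 1) > availableRam) {
            splits.push_back({start, r, kmerCnt});
            start = r;
            kmerCnt = 0;
            seqCnt = 0;
        }
        kmerCnt += c;
        ++seqCnt;
    }
    splits.push_back({start, kmersPerRead.size(), kmerCnt});
    return splits;
}
// ---------------------------------------------------------------------------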
-// - -#include "ReducedClassifier.h" - -ReducedClassifier::ReducedClassifier(LocalParameters & par) -: Classifier(par){ - setMarker(0Xffffffff); - setNumOfBitsForCodon(4); -} \ No newline at end of file diff --git a/src/commons/ReducedClassifier.h b/src/commons/ReducedKmerMatcher.h similarity index 80% rename from src/commons/ReducedClassifier.h rename to src/commons/ReducedKmerMatcher.h index b3e4ab0c..dd43e646 100644 --- a/src/commons/ReducedClassifier.h +++ b/src/commons/ReducedKmerMatcher.h @@ -1,13 +1,11 @@ -// -// Created by 김재범 on 2022/06/28. -// +#ifndef METABULI_REDUCEDKMERMATCHER_H +#define METABULI_REDUCEDKMERMATCHER_H -#ifndef METABULI_REDUCEDCLASSIFIER_H -#define METABULI_REDUCEDCLASSIFIER_H +#include "KmerMatcher.h" +#include +#include "NcbiTaxonomy.h" -#include "Classifier.h" - -class ReducedClassifier : public Classifier { +class ReducedKmerMatcher : public KmerMatcher { protected: uint8_t hammingLookup[11][11] = { {0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3}, @@ -22,7 +20,6 @@ class ReducedClassifier : public Classifier { {3, 2, 3, 3, 4, 4, 4, 4, 4, 0, 4}, {3, 3, 2, 3, 4, 4, 4, 4, 4, 4, 0}}; - public: uint8_t getHammingDistanceSum(uint64_t kmer1, uint64_t kmer2) override { uint8_t hammingSum = 0; @@ -58,8 +55,14 @@ class ReducedClassifier : public Classifier { return hammings; } - ReducedClassifier(LocalParameters & par); + explicit ReducedKmerMatcher(LocalParameters & par, + NcbiTaxonomy * taxonomy) + : KmerMatcher(par,taxonomy) { + MARKER = 0Xffffffff; + } + + ~ReducedKmerMatcher() override = default; }; -#endif //METABULI_REDUCEDCLASSIFIER_H +#endif //METABULI_REDUCEDKMERMATCHER_H diff --git a/src/commons/Reporter.cpp b/src/commons/Reporter.cpp new file mode 100644 index 00000000..288aecdb --- /dev/null +++ b/src/commons/Reporter.cpp @@ -0,0 +1,94 @@ +#include "Reporter.h" +#include "taxonomyreport.cpp" + +Reporter::Reporter(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxonomy(taxonomy){ + if (par.seqMode == 2) { + outDir = par.filenames[3]; + jobId = par.filenames[4]; + } else { + outDir = par.filenames[2]; + jobId = par.filenames[3]; + } +} + +void Reporter::openReadClassificationFile() { + readClassificationFile.open(outDir + "/" + jobId + "_classifications.tsv"); +} + +void Reporter::writeReadClassification(const vector & queryList) { + for (size_t i = 0; i < queryList.size(); i++) { + readClassificationFile << queryList[i].isClassified << "\t" << queryList[i].name << "\t" + << queryList[i].classification << "\t" + << queryList[i].queryLength + queryList[i].queryLength2 << "\t" + << queryList[i].score << "\t" + << queryList[i].coverage << "\t" + << queryList[i].hammingDist << "\t" + << taxonomy->getString(taxonomy->taxonNode(queryList[i].classification)->rankIdx) << "\t"; + for (auto it = queryList[i].taxCnt.begin(); it != queryList[i].taxCnt.end(); ++it) { + readClassificationFile << it->first << ":" << it->second << " "; + } + readClassificationFile << "\n"; + } +} + +void Reporter::closeReadClassificationFile() { + readClassificationFile.close(); +} + +void Reporter::writeReportFile(int numOfQuery, unordered_map &taxCnt) { + unordered_map cladeCounts = taxonomy->getCladeCounts(taxCnt); + FILE *fp; + fp = fopen((outDir + + "/" + jobId + "_report.tsv").c_str(), "w"); + writeReport(fp, cladeCounts, numOfQuery); + fclose(fp); + + // Write Krona chart + FILE *kronaFile = fopen((outDir + "/" + jobId + "_krona.html").c_str(), "w"); + fwrite(krona_prelude_html, krona_prelude_html_len, sizeof(char), kronaFile); + fprintf(kronaFile, "%zu", numOfQuery); + 
kronaReport(kronaFile, *taxonomy, cladeCounts, numOfQuery); + fprintf(kronaFile, ""); + +} + +void Reporter::writeReport(FILE *FP, const std::unordered_map &cladeCounts, + unsigned long totalReads, TaxID taxID, int depth) { + std::unordered_map::const_iterator it = cladeCounts.find(taxID); + unsigned int cladeCount = it == cladeCounts.end() ? 0 : it->second.cladeCount; + unsigned int taxCount = it == cladeCounts.end() ? 0 : it->second.taxCount; + if (taxID == 0) { + if (cladeCount > 0) { + fprintf(FP, "%.4f\t%i\t%i\tno rank\t0\tunclassified\n", + 100 * cladeCount / double(totalReads), + cladeCount, taxCount); + } + writeReport(FP, cladeCounts, totalReads, 1); + } else { + if (cladeCount == 0) { + return; + } + const TaxonNode *taxon = taxonomy->taxonNode(taxID); + fprintf(FP, "%.4f\t%i\t%i\t%s\t%i\t%s%s\n", + 100 * cladeCount / double(totalReads), cladeCount, taxCount, + taxonomy->getString(taxon->rankIdx), taxID, std::string(2 * depth, ' ').c_str(), taxonomy->getString(taxon->nameIdx)); + std::vector children = it->second.children; + SORT_SERIAL(children.begin(), children.end(), [&](int a, int b) { return cladeCountVal(cladeCounts, a) > cladeCountVal(cladeCounts, b); }); + for (size_t i = 0; i < children.size(); ++i) { + TaxID childTaxId = children[i]; + if (cladeCounts.count(childTaxId)) { + writeReport(FP, cladeCounts, totalReads, childTaxId, depth + 1); + } else { + break; + } + } + } +} + +unsigned int Reporter::cladeCountVal(const std::unordered_map &map, TaxID key) { + typename std::unordered_map::const_iterator it = map.find(key); + if (it == map.end()) { + return 0; + } else { + return it->second.cladeCount; + } +} \ No newline at end of file diff --git a/src/commons/Reporter.h b/src/commons/Reporter.h new file mode 100644 index 00000000..4de0f32c --- /dev/null +++ b/src/commons/Reporter.h @@ -0,0 +1,42 @@ +#ifndef METABULI_REPORTER_H +#define METABULI_REPORTER_H +#include "common.h" +#include "iostream" +#include "fstream" +#include +#include "NcbiTaxonomy.h" +#include "LocalParameters.h" + +using namespace std; + + +class Reporter { +private: + string outDir; + string jobId; + NcbiTaxonomy * taxonomy; + + // Output + ofstream readClassificationFile; + + +public: + Reporter(const LocalParameters &par, NcbiTaxonomy *taxonomy); + // Write report + + // Read by read classification results + void openReadClassificationFile(); + void writeReadClassification(const vector & queryList); + void closeReadClassificationFile(); + + void writeReportFile(int numOfQuery, unordered_map &taxCnt); + + void writeReport(FILE *FP, const std::unordered_map &cladeCounts, + unsigned long totalReads, TaxID taxID = 0, int depth = 0); + + unsigned int cladeCountVal(const std::unordered_map &map, TaxID key); + +}; + + +#endif //METABULI_REPORTER_H diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp new file mode 100644 index 00000000..c41ddea9 --- /dev/null +++ b/src/commons/Taxonomer.cpp @@ -0,0 +1,1164 @@ +#include "Taxonomer.h" + + +Taxonomer::Taxonomer(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxonomy(taxonomy) { + // Parameters + auto mask = new uint32_t[par.spaceMask.length()]; + for(size_t i = 0, j = 0; i < par.spaceMask.length(); i++){ + mask[i] = par.spaceMask[i] - 48; + spaceNum += (mask[i] == 0); + if(par.spaceMask[i]==1){ + unmaskedPos[j] = (int) i; + j++; + } + } + delete[] mask; + maxGap = par.maxGap; + minCoveredPos = par.minCoveredPos; +} + +Taxonomer::~Taxonomer() { + +} + +void Taxonomer::assignTaxonomy(const Match *matchList, + size_t numOfMatches, + 
std::vector<Query> &queryList,
+                               const LocalParameters &par) {
+    time_t beforeAnalyze = time(nullptr);
+    cout << "Analyzing matches ..." << endl;
+
+    // Divide matches into blocks for multithreading
+    size_t seqNum = queryList.size();
+    MatchBlock *matchBlocks = new MatchBlock[seqNum];
+    size_t matchIdx = 0;
+    size_t blockIdx = 0;
+    uint32_t currentQuery;
+    while (matchIdx < numOfMatches) {
+        currentQuery = matchList[matchIdx].qInfo.sequenceID;
+        matchBlocks[blockIdx].id = currentQuery;
+        matchBlocks[blockIdx].start = matchIdx;
+        while ((matchIdx < numOfMatches) && (currentQuery == matchList[matchIdx].qInfo.sequenceID)) ++matchIdx;
+        matchBlocks[blockIdx].end = matchIdx - 1;
+        blockIdx++;
+    }
+
+    // Process each block
+#pragma omp parallel default(none), shared(cout, matchBlocks, matchList, seqNum, queryList, blockIdx, par)
+    {
+#pragma omp for schedule(dynamic, 1)
+        for (size_t i = 0; i < blockIdx; ++i) {
+            chooseBestTaxon(matchBlocks[i].id,
+                            matchBlocks[i].start,
+                            matchBlocks[i].end,
+                            matchList,
+                            queryList,
+                            par);
+        }
+    }
+
+    for (size_t i = 0; i < seqNum; i++) {
+        ++taxCounts[queryList[i].classification];
+    }
+    delete[] matchBlocks;
+    cout << "Time spent for analyzing: " << double(time(nullptr) - beforeAnalyze) << endl;
+
+}
+
+void Taxonomer::chooseBestTaxon(uint32_t currentQuery,
+                                size_t offset,
+                                size_t end,
+                                const Match *matchList,
+                                vector<Query> & queryList,
+                                const LocalParameters &par) {
+    TaxID selectedTaxon;
+//    if (par.printLog) {
+//        cout << "# " << currentQuery << " " << queryList[currentQuery].name << endl;
+//        for (size_t i = offset; i < end + 1; i++) {
+//            cout << taxId2genusId[matchList[i].targetId] << " " << taxId2speciesId[matchList[i].targetId] <<
+//            " " << matchList[i].targetId << " " << matchList[i].qInfo.frame << " ";
+//            print_binary16(16, matchList[i].rightEndHamming);
+//            cout << " " << matchList[i].qInfo.pos << " " << int(matchList[i].hamming) << " " << int(matchList[i].redundancy) << endl;
+//        }
+//    }
+
+    // Get the best genus for current query
+    vector<Match> genusMatches;
+    genusMatches.reserve(end - offset + 1);
+
+    int res;
+    TaxonScore genusScore(0, 0, 0, 0);
+    if (par.seqMode == 2) {
+        if (par.spaceMask != "11111111"){
+            genusScore = getBestGenusMatches_spaced(genusMatches, matchList, end, offset,
+                                                    queryList[currentQuery].queryLength,
+                                                    queryList[currentQuery].queryLength2);
+        } else {
+            genusScore = getBestGenusMatches(genusMatches, matchList, end, offset,
+                                             queryList[currentQuery].queryLength,
+                                             queryList[currentQuery].queryLength2, par);
+        }
+    } else {
+        if (par.spaceMask != "11111111") {
+            genusScore = getBestGenusMatches_spaced(genusMatches, matchList, end, offset,
+                                                    queryList[currentQuery].queryLength);
+        } else {
+            genusScore = getBestGenusMatches(genusMatches, matchList, end, offset,
+                                             queryList[currentQuery].queryLength, par);
+        }
+    }
+
+//    if (par.printLog) {
+//        cout << "# " << currentQuery << " " << queryList[currentQuery].name << " filtered\n";
+//        for (size_t i = 0; i < genusMatches.size(); i++) {
+//            cout << taxId2genusId[genusMatches[i].targetId] << " " << taxId2speciesId[genusMatches[i].targetId] <<
+//            " " << genusMatches[i].targetId << " " << genusMatches[i].qInfo.frame << " ";
+//            print_binary16(16, genusMatches[i].rightEndHamming);
+//            cout << " " << genusMatches[i].qInfo.pos << " " << int(genusMatches[i].hamming) << " " << int(genusMatches[i].redundancy) << endl;
+//        }
+//        cout << "Genus score: " << genusScore.score << "\n";
+//    }
+
+    // If there is no proper genus for current query, it is unclassified.
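Review note: the blocking loop at the top of assignTaxonomy() depends on matches being sorted by query ID. A standalone sketch with the bounds test placed before the element access, matching the condition order fixed above (hypothetical types):

// --- Review sketch, not part of the patch ---------------------------------
#include <cstddef>
#include <cstdint>
#include <vector>

struct BlockSketch { uint32_t queryId; size_t start, end; }; // end is inclusive

// One linear scan over query IDs yields a contiguous block per query,
// which the OpenMP loop above then processes independently.
std::vector<BlockSketch> blockByQuery(const std::vector<uint32_t> &matchQueryIds) {
    std::vector<BlockSketch> blocks;
    size_t i = 0;
    while (i < matchQueryIds.size()) {
        uint32_t q = matchQueryIds[i];
        size_t start = i;
        while (i < matchQueryIds.size() && matchQueryIds[i] == q) ++i;
        blocks.push_back({q, start, i - 1});
    }
    return blocks;
}
// ---------------------------------------------------------------------------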
+ if (genusScore.score == 0 || genusScore.coverage < par.minCoverage || genusScore.score < par.minScore) { + queryList[currentQuery].isClassified = false; + queryList[currentQuery].classification = 0; + queryList[currentQuery].score = genusScore.score; + queryList[currentQuery].coverage = genusScore.coverage; + queryList[currentQuery].hammingDist = genusScore.hammingDist; + queryList[currentQuery].newSpecies = false; + return; + } + + // If there are two or more good genus level candidates, find the LCA. + if (genusScore.taxId == 0) { + vector genusList; + genusList.reserve(genusMatches.size()); + for (auto & genusMatch : genusMatches) { + genusList.push_back(genusMatch.genusId); + } + selectedTaxon = taxonomy->LCA(genusList)->taxId; + queryList[currentQuery].isClassified = true; + queryList[currentQuery].classification = selectedTaxon; + queryList[currentQuery].score = genusScore.score; + queryList[currentQuery].coverage = genusScore.coverage; + queryList[currentQuery].hammingDist = genusScore.hammingDist; + for (auto & genusMatch : genusMatches) { + queryList[currentQuery].taxCnt[genusMatch.targetId]++; + } + return; + } + + // Choose the species with the highest coverage. + TaxID selectedSpecies; + TaxonScore speciesScore; + vector species; + unordered_map> speciesMatchRange; + if (par.seqMode == 2) { + speciesScore = chooseSpecies(genusMatches, + queryList[currentQuery].queryLength, + queryList[currentQuery].queryLength2, + species, + speciesMatchRange); + } else { + speciesScore = chooseSpecies(genusMatches, + queryList[currentQuery].queryLength, + species, + speciesMatchRange); + } + + + // Classify to LCA if more than one species are selected + if (species.size() > 1) { + queryList[currentQuery].isClassified = true; + queryList[currentQuery].classification = taxonomy->LCA(species)->taxId; + queryList[currentQuery].score = genusScore.score; + queryList[currentQuery].coverage = genusScore.coverage; + queryList[currentQuery].hammingDist = genusScore.hammingDist; + for (auto & genusMatch : genusMatches) { + queryList[currentQuery].taxCnt[genusMatch.targetId]++; + } + return; + } + + // If score is not enough, classify to the parent of the selected species + if (speciesScore.score < par.minSpScore) { + queryList[currentQuery].isClassified = true; + queryList[currentQuery].classification = taxonomy->taxonNode( + taxonomy->getTaxIdAtRank(species[0], "species"))->parentTaxId; + queryList[currentQuery].score = genusScore.score; + queryList[currentQuery].coverage = genusScore.coverage; + queryList[currentQuery].hammingDist = genusScore.hammingDist; + for (auto & genusMatch : genusMatches) { + if(genusMatch.speciesId == species[0]){ + queryList[currentQuery].taxCnt[genusMatch.targetId]++; + } + } + return; + } + + // Sort matches by the position of the query sequence + selectedSpecies = species[0]; +// sort(genusMatches.begin() + speciesMatchRange[selectedSpecies].first, +// genusMatches.begin() + speciesMatchRange[selectedSpecies].second, +// [](const Match & a, const Match & b) { +// if (a.qInfo.position / 3 == b.qInfo.position / 3) +// return a.hamming < b.hamming; +// else +// return a.qInfo.position / 3 < b.qInfo.position / 3; +// }); + + sort(genusMatches.begin() + speciesMatchRange[selectedSpecies].first, + genusMatches.begin() + speciesMatchRange[selectedSpecies].second, + [](const Match & a, const Match & b) { return a.qInfo.pos > b.qInfo.pos; }); + + + TaxID result = lowerRankClassification(genusMatches, speciesMatchRange[selectedSpecies], selectedSpecies); + + // Record matches of 
+    TaxID result = lowerRankClassification(genusMatches, speciesMatchRange[selectedSpecies], selectedSpecies);
+
+    // Record matches of the selected species
+    for (int i = speciesMatchRange[selectedSpecies].first; i < speciesMatchRange[selectedSpecies].second; i++) {
+        queryList[currentQuery].taxCnt[genusMatches[i].targetId]++;
+    }
+
+    // Store classification results
+    queryList[currentQuery].isClassified = true;
+    queryList[currentQuery].classification = result;
+    queryList[currentQuery].score = speciesScore.score;
+    queryList[currentQuery].coverage = speciesScore.coverage;
+    queryList[currentQuery].hammingDist = speciesScore.hammingDist;
+    queryList[currentQuery].newSpecies = false;
+}
+
+TaxID Taxonomer::lowerRankClassification(vector<Match> &matches, pair<int, int> &matchRange, TaxID spTaxId) {
+    int i = matchRange.second - 1;
+    unordered_map<TaxID, int> taxCnt;
+
+    while (i >= matchRange.first) {
+        size_t currQuotient = matches[i].qInfo.pos / 3;
+        uint8_t minHamming = matches[i].hamming;
+        Match * minHammingMatch = & matches[i];
+        TaxID minHammingTaxId = minHammingMatch->targetId;
+        i--;
+        while ((i >= matchRange.first) && (currQuotient == matches[i].qInfo.pos / 3)) {
+            if (matches[i].hamming < minHamming) {
+                minHamming = matches[i].hamming;
+                minHammingMatch = & matches[i];
+                minHammingTaxId = minHammingMatch->targetId;
+            } else if (matches[i].hamming == minHamming) {
+                minHammingTaxId = taxonomy->LCA(minHammingTaxId, matches[i].targetId);
+                minHammingMatch->redundancy = true;
+                matches[i].redundancy = true;
+            }
+            i--;
+        }
+        taxCnt[minHammingTaxId]++;
+    }
+
+    unordered_map<TaxID, TaxonCounts> cladeCnt;
+    getSpeciesCladeCounts(taxCnt, cladeCnt, spTaxId);
+
+    return BFS(cladeCnt, spTaxId);
+}
+
+void Taxonomer::getSpeciesCladeCounts(const unordered_map<TaxID, int> &taxCnt,
+                                      unordered_map<TaxID, TaxonCounts> & cladeCount,
+                                      TaxID speciesTaxID) {
+    for (auto it = taxCnt.begin(); it != taxCnt.end(); ++it) {
+        TaxonNode const * taxon = taxonomy->taxonNode(it->first);
+        cladeCount[taxon->taxId].taxCount = it->second;
+        cladeCount[taxon->taxId].cladeCount += it->second;
+        while (taxon->taxId != speciesTaxID) {
+            if (find(cladeCount[taxon->parentTaxId].children.begin(),
+                     cladeCount[taxon->parentTaxId].children.end(),
+                     taxon->taxId) == cladeCount[taxon->parentTaxId].children.end()) {
+                cladeCount[taxon->parentTaxId].children.push_back(taxon->taxId);
+            }
+            cladeCount[taxon->parentTaxId].cladeCount += it->second;
+            taxon = taxonomy->taxonNode(taxon->parentTaxId);
+        }
+    }
+}
+
+TaxID Taxonomer::BFS(const unordered_map<TaxID, TaxonCounts> & cladeCnt, TaxID root) {
+    if (cladeCnt.at(root).children.empty()) { // root is a leaf
+        return root;
+    }
+    unsigned int maxCnt = 3;
+    unsigned int currentCnt;
+    vector<TaxID> bestChildren;
+    for (auto it = cladeCnt.at(root).children.begin(); it != cladeCnt.at(root).children.end(); it++) {
+        currentCnt = cladeCnt.at(*it).cladeCount;
+        if (currentCnt > maxCnt) {
+            bestChildren.clear();
+            bestChildren.push_back(*it);
+            maxCnt = currentCnt;
+        } else if (currentCnt == maxCnt) {
+            bestChildren.push_back(*it);
+        }
+    }
+    if (bestChildren.size() == 1) {
+        return BFS(cladeCnt, bestChildren[0]);
+    } else {
+        return root;
+    }
+}
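+
+// A hypothetical example of the descent above: given cladeCnt with
+//   species X (cladeCount 10) -> strain A (7) and strain B (3),
+// strain A is the unique child with the highest clade count (at least the
+// initial support of 3), so the recursion continues into A; if A and B were
+// tied, or if no child reached that support, X itself would be returned.
+
+TaxonScore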
Taxonomer::getBestGenusMatches(vector &genusMatches, const Match *matchList, size_t end, + size_t offset, int readLength1, int readLength2, const LocalParameters & par) { + TaxID currentGenus; + TaxID currentSpecies; + + vector filteredMatches; + vector> matchesForEachGenus; + vector genusScores; + TaxonScore bestScore; + size_t i = offset; + uint8_t curFrame; + vector curFrameMatches; + while (i < end + 1) { +// currentGenus = taxId2genusId[matchList[i].targetId]; + currentGenus = matchList[i].genusId; + // For current genus + while ((i < end + 1) && currentGenus == matchList[i].genusId) { +// currentSpecies = taxId2speciesId[matchList[i].targetId]; + currentSpecies = matchList[i].speciesId; +// if (par.printLog) { +// cout << currentGenus << " " << currentSpecies << endl; +// } + // For current species + while ((i < end + 1) && currentSpecies == matchList[i].speciesId) { + curFrame = matchList[i].qInfo.frame; + curFrameMatches.clear(); + + // For current frame + while ((i < end + 1) && currentSpecies == matchList[i].speciesId + && curFrame == matchList[i].qInfo.frame) { + curFrameMatches.push_back(&matchList[i]); + i ++; + } + if (curFrameMatches.size() > 1) { + remainConsecutiveMatches(curFrameMatches, filteredMatches, currentGenus, par); + } + } + } + + // Construct a match combination using filtered matches of current genus + // so that it can best cover the query, and score the combination + if (!filteredMatches.empty()) { + matchesForEachGenus.push_back(filteredMatches); + genusScores.push_back(scoreGenus(filteredMatches, readLength1, readLength2)); + } + filteredMatches.clear(); + } + + // If there are no meaningful genus + if (genusScores.empty()) { + bestScore.score = 0; + return bestScore; + } + + TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), + [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); + + vector maxIdx; + for (size_t g = 0; g < genusScores.size(); g++) { + if (genusScores[g].score > maxScore.score * 0.95f) { + maxIdx.push_back(g); + } + } + bestScore = maxScore; + + for (unsigned long g : maxIdx) { + for (const Match * m : matchesForEachGenus[g]) { + genusMatches.push_back(*m); + } + } + + + + // More than one genus + if (maxIdx.size() > 1) { + bestScore.taxId = 0; + return bestScore; + } + + return bestScore; + + //Three cases + //1. one genus + //2. more than one genus + //4. 
no genus +} + +void Taxonomer::remainConsecutiveMatches(vector & curFrameMatches, + vector & filteredMatches, + TaxID genusId, + const LocalParameters & par) { + size_t i = 0; + size_t end = curFrameMatches.size(); + vector> curPosMatches; // + vector> nextPosMatches; + map> linkedMatches; // + + size_t currPos = curFrameMatches[0]->qInfo.pos; + while ( i < end && curFrameMatches[i]->qInfo.pos == currPos) { + curPosMatches.emplace_back(curFrameMatches[i], i); + i++; + } + while (i < end) { + uint32_t nextPos = curFrameMatches[i]->qInfo.pos; + while (i < end && nextPos == curFrameMatches[i]->qInfo.pos) { + nextPosMatches.emplace_back(curFrameMatches[i], i); + ++ i; + } + // Check if current position and next position are consecutive + if (currPos + 3 == nextPos) { + // Compare curPosMatches and nextPosMatches + for (auto &curPosMatch: curPosMatches) { + for (auto &nextPosMatch: nextPosMatches) { + if (isConsecutive(curPosMatch.first, nextPosMatch.first)) { + linkedMatches[curPosMatch.second].push_back(nextPosMatch.second); + } + } + } + + } + // Update curPosMatches and nextPosMatches + curPosMatches = nextPosMatches; + nextPosMatches.clear(); + currPos = nextPos; + } + // Print linkedMatches +// if (par.printLog) { +// cout << "linkedMatches: " << endl; +// for (const auto &entry: linkedMatches) { +// cout << entry.first << ": "; +// for (auto &idx: entry.second) { +// cout << idx << " "; +// } +// cout << endl; +// } +// } + + // Iterate linkedMatches to get filteredMatches + int MIN_DEPTH = par.minConsCnt - 1; + if (taxonomy->IsAncestor(par.eukaryotaTaxId, genusId)) { + MIN_DEPTH = par.minConsCntEuk - 1; + } + unordered_set used; + vector filteredMatchIdx; + unordered_map idx2depth; + for (const auto& entry : linkedMatches) { + if (!used.count(entry.first)) { + used.insert(entry.first); + vector curMatches; + DFS(entry.first, linkedMatches, filteredMatchIdx, 0, MIN_DEPTH, used, idx2depth); + } + } + +// if (par.printLog) { +// cout << "filteredMatchIdx: "; +// for (auto &idx: filteredMatchIdx) { +// cout << idx << " "; +// } +// cout << endl; +// } + + for (auto &idx: filteredMatchIdx) { + filteredMatches.push_back(curFrameMatches[idx]); + } +} + + +size_t Taxonomer::DFS(size_t curMatchIdx, const map> & linkedMatches, + vector& filteredMatches, size_t depth, size_t MIN_DEPTH, unordered_set& used, + unordered_map & idx2depth) { + depth++; + size_t maxDepth = 0; + size_t returnDepth = 0; + if (linkedMatches.find(curMatchIdx) == linkedMatches.end()) { //|| linkedMatches.at(curMatchIdx).empty()) { + // reached a leaf node + idx2depth[curMatchIdx] = depth; + if (depth > MIN_DEPTH) { + filteredMatches.push_back(curMatchIdx); + } + return depth; + } else { // not a leaf node + for (auto &nextMatchIdx: linkedMatches.at(curMatchIdx)) { + used.insert(nextMatchIdx); + if (idx2depth.find(nextMatchIdx) != idx2depth.end()) { + returnDepth = idx2depth[nextMatchIdx]; + maxDepth = max(maxDepth, returnDepth); + continue; + } + returnDepth = DFS(nextMatchIdx, linkedMatches, filteredMatches, depth, MIN_DEPTH, used, idx2depth); + maxDepth = max(maxDepth, returnDepth); + } + if (maxDepth > MIN_DEPTH) { + filteredMatches.push_back(curMatchIdx); + idx2depth[curMatchIdx] = maxDepth; + } + } + return maxDepth; +} + +TaxonScore Taxonomer::getBestGenusMatches_spaced(vector &genusMatches, const Match *matchList, size_t end, + size_t offset, int readLength1, int readLength2) { + TaxID currentGenus; + TaxID currentSpecies; + + vector tempMatchContainer; + vector filteredMatches; + vector> matchesForEachGenus; + vector 
conservedWithinGenus; + vector genusScores; + TaxonScore bestScore; + size_t i = offset; + bool lastIn; + while (i + 1 < end + 1) { + currentGenus = matchList[i].genusId; + // For current genus + while ((i + 1 < end + 1) && currentGenus == matchList[i].genusId) { +// currentSpecies = taxId2speciesId[matchList[i].targetId]; + currentSpecies = matchList[i].speciesId; + // For current species + // Filter un-consecutive matches (probably random matches) + lastIn = false; + int distance = 0; + int diffPosCntOfCurrRange = 1; + int dnaDist = 0; + + // For the same species + while ((i + 1 < end + 1) && currentSpecies == matchList[i + 1].speciesId) { + distance = matchList[i+1].qInfo.pos / 3 - matchList[i].qInfo.pos / 3; + dnaDist = matchList[i+1].qInfo.pos - matchList[i].qInfo.pos; + if (distance == 0) { // At the same position + tempMatchContainer.push_back(matchList + i); + } else if (dnaDist < (8 + spaceNum + maxGap) * 3) { // Overlapping + lastIn = true; + tempMatchContainer.push_back(matchList + i); + diffPosCntOfCurrRange ++; + } else { // Not consecutive --> End range + if (lastIn){ + tempMatchContainer.push_back(matchList + i); + if (diffPosCntOfCurrRange >= minCoveredPos) { + filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), + tempMatchContainer.end()); + } + } + lastIn = false; + // Initialize range info + tempMatchContainer.clear(); + diffPosCntOfCurrRange = 1; + } + i++; + } + + // Met next species + if (lastIn) { + tempMatchContainer.push_back(matchList + i); + if (diffPosCntOfCurrRange >= minCoveredPos) { + filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), + tempMatchContainer.end()); + } + } + tempMatchContainer.clear(); + i++; + } + + // Construct a match combination using filtered matches of current genus + // so that it can best cover the query, and score the combination + if (!filteredMatches.empty()) { + genusScores.push_back(scoreGenus(filteredMatches, readLength1, readLength2)); + } + filteredMatches.clear(); + } + + // If there are no meaningful genus + if (genusScores.empty()) { + bestScore.score = 0; + return bestScore; + } + + TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), + [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); + + vector maxIdx; + for (size_t g = 0; g < genusScores.size(); g++) { + if (genusScores[g].score > maxScore.score * 0.95f) { + maxIdx.push_back(g); + } + } + bestScore = maxScore; + + for (unsigned long g : maxIdx) { + for (const Match * m : matchesForEachGenus[g]) { + genusMatches.push_back(*m); + } + } + + // More than one genus + if (maxIdx.size() > 1) { + bestScore.taxId = 0; + return bestScore; + } + return bestScore; + + //Three cases + //1. one genus + //2. more than one genus + //4. 
no genus +} + +TaxonScore Taxonomer::getBestGenusMatches(vector &genusMatches, const Match *matchList, size_t end, + size_t offset, int queryLength, const LocalParameters & par) { + TaxID currentGenus; + TaxID currentSpecies; + + vector filteredMatches; + vector> matchesForEachGenus; + vector genusScores; + TaxonScore bestScore; + size_t i = offset; + uint8_t curFrame; + vector curFrameMatches; + while (i < end + 1) { + currentGenus = matchList[i].genusId; + // For current genus + while ((i < end + 1) && currentGenus == matchList[i].genusId) { + currentSpecies = matchList[i].speciesId; + + // For current species + while ((i < end + 1) && currentSpecies == matchList[i].speciesId) { + curFrame = matchList[i].qInfo.frame; + curFrameMatches.clear(); + + // For current frame + while ((i < end + 1) && currentSpecies == matchList[i].speciesId + && curFrame == matchList[i].qInfo.frame) { + curFrameMatches.push_back(&matchList[i]); + i ++; + } + if (curFrameMatches.size() > 1) { + remainConsecutiveMatches(curFrameMatches, filteredMatches, currentGenus, par); + } + } + } + + // Construct a match combination using filtered matches of current genus + // so that it can best cover the query, and score the combination + + if (!filteredMatches.empty()) { + matchesForEachGenus.push_back(filteredMatches); + genusScores.push_back(scoreGenus(filteredMatches, queryLength)); + } + filteredMatches.clear(); + } + + // If there are no meaningful genus + if (genusScores.empty()) { + bestScore.score = 0; + return bestScore; + } + + TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), + [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); + + vector maxIdx; + for (size_t g = 0; g < genusScores.size(); g++) { + if (genusScores[g].score > maxScore.score * 0.95f) { + maxIdx.push_back(g); + } + } + + bestScore = maxScore; + + for (unsigned long g : maxIdx) { + for (const Match * m : matchesForEachGenus[g]) { + genusMatches.push_back(*m); + } + } + + // More than one genus + if (maxIdx.size() > 1) { + bestScore.taxId = 0; + return bestScore; + } + return bestScore; + + //Three cases + //1. one genus + //2. more than one genus + //4. 
no genus +} + +TaxonScore Taxonomer::getBestGenusMatches_spaced(vector &genusMatches, const Match *matchList, size_t end, + size_t offset, int readLength) { + TaxID currentGenus; + TaxID currentSpecies; + + vector tempMatchContainer; + vector filteredMatches; + vector> matchesForEachGenus; + vector conservedWithinGenus; + vector genusScores; + TaxonScore bestScore; + size_t i = offset; + bool lastIn; + size_t speciesMatchCnt; + while (i + 1 < end + 1) { + currentGenus = matchList[i].genusId; + // For current genus + while ((i + 1 < end + 1) && currentGenus == matchList[i].genusId) { + currentSpecies = matchList[i].speciesId; + // For current species + // Filter un-consecutive matches (probably random matches) + lastIn = false; + int distance = 0; + int diffPosCntOfCurrRange = 1; + int dnaDist = 0; + + // For the same species + while ((i + 1 < end + 1) && currentSpecies == matchList[i + 1].speciesId) { + distance = matchList[i + 1].qInfo.pos / 3 - matchList[i].qInfo.pos / 3; + dnaDist = matchList[i + 1].qInfo.pos - matchList[i].qInfo.pos; + if (distance == 0) { // At the same position + tempMatchContainer.push_back(matchList + i); + } else if (dnaDist < (8 + spaceNum + maxGap) * 3) { // Overlapping + lastIn = true; + tempMatchContainer.push_back(matchList + i); + diffPosCntOfCurrRange++; + } else { // Not consecutive --> End range + if (lastIn) { + tempMatchContainer.push_back(matchList + i); + if (diffPosCntOfCurrRange >= minCoveredPos) { + filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), + tempMatchContainer.end()); + } + } + lastIn = false; + // Initialize range info + tempMatchContainer.clear(); + diffPosCntOfCurrRange = 1; + } + i++; + } + + // Met next species + if (lastIn) { + tempMatchContainer.push_back(matchList + i); + if (diffPosCntOfCurrRange >= minCoveredPos) { + filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), + tempMatchContainer.end()); + } + } + tempMatchContainer.clear(); + i++; + } + + // Construct a match combination using filtered matches of current genus + // so that it can best cover the query, and score the combination + if (!filteredMatches.empty()) { + genusScores.push_back(scoreGenus(filteredMatches, readLength)); + } + filteredMatches.clear(); + } + + // If there are no meaningful genus + if (genusScores.empty()) { + bestScore.score = 0; + return bestScore; + } + + TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), + [](const TaxonScore &a, const TaxonScore &b) { return a.score < b.score; }); + + vector maxIdx; + for (size_t g = 0; g < genusScores.size(); g++) { + if (genusScores[g].score > maxScore.score * 0.95f) { + maxIdx.push_back(g); + } + } + bestScore = maxScore; + + for (unsigned long g: maxIdx) { + genusMatches.insert(genusMatches.end(), + matchesForEachGenus[g].begin(), + matchesForEachGenus[g].end()); + } + + // More than one genus + if (maxIdx.size() > 1) { + bestScore.taxId = 0; + return bestScore; + } + return bestScore; + + //Three cases + //1. one genus + //2. more than one genus + //4. 
no genus +} + +TaxonScore Taxonomer::scoreGenus(vector &filteredMatches, + int queryLength) { + // Calculate Hamming distance & covered length + int coveredPosCnt = 0; + uint16_t currHammings; + int aminoAcidNum = (int) queryLength / 3; + int currPos; + size_t matchNum = filteredMatches.size(); + size_t f = 0; + + // Get the largest hamming distance at each position of query + auto *hammingsAtEachPos = new signed char[aminoAcidNum + 1]; + memset(hammingsAtEachPos, -1, (aminoAcidNum + 1)); + while (f < matchNum) { + currPos = filteredMatches[f]->qInfo.pos / 3; + currHammings = filteredMatches[f]->rightEndHamming; + if (GET_2_BITS(currHammings) > hammingsAtEachPos[currPos + unmaskedPos[0]]) + hammingsAtEachPos[currPos + unmaskedPos[0]] = GET_2_BITS(currHammings); + if (GET_2_BITS(currHammings >> 2) > hammingsAtEachPos[currPos + unmaskedPos[1]]) + hammingsAtEachPos[currPos + unmaskedPos[1]] = GET_2_BITS(currHammings >> 2); + if (GET_2_BITS(currHammings >> 4) > hammingsAtEachPos[currPos + unmaskedPos[2]]) + hammingsAtEachPos[currPos + unmaskedPos[2]] = GET_2_BITS(currHammings >> 4); + if (GET_2_BITS(currHammings >> 6) > hammingsAtEachPos[currPos + unmaskedPos[3]]) + hammingsAtEachPos[currPos + unmaskedPos[3]] = GET_2_BITS(currHammings >> 6); + if (GET_2_BITS(currHammings >> 8) > hammingsAtEachPos[currPos + unmaskedPos[4]]) + hammingsAtEachPos[currPos + unmaskedPos[4]] = GET_2_BITS(currHammings >> 8); + if (GET_2_BITS(currHammings >> 10) > hammingsAtEachPos[currPos + unmaskedPos[5]]) + hammingsAtEachPos[currPos + unmaskedPos[5]] = GET_2_BITS(currHammings >> 10); + if (GET_2_BITS(currHammings >> 12) > hammingsAtEachPos[currPos + unmaskedPos[6]]) + hammingsAtEachPos[currPos + unmaskedPos[6]] = GET_2_BITS(currHammings >> 12); + if (GET_2_BITS(currHammings >> 14) > hammingsAtEachPos[currPos + unmaskedPos[7]]) + hammingsAtEachPos[currPos + unmaskedPos[7]] = GET_2_BITS(currHammings >> 14); + f++; + } + + // Sum up hamming distances and count the number of position covered by the matches. + float hammingSum = 0; + for (int h = 0; h < aminoAcidNum; h++) { + if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist. + coveredPosCnt++; + } else if (hammingsAtEachPos[h] != -1) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. 
respectively + hammingSum += 1.0f + (0.5f * hammingsAtEachPos[h]); + coveredPosCnt++; + } + } + delete[] hammingsAtEachPos; + + // Score current genus + int coveredLength = coveredPosCnt * 3; + if (coveredLength > queryLength) coveredLength = queryLength; + float score = ((float) coveredLength - hammingSum) / (float) queryLength; + float coverage = (float) (coveredLength) / (float) (queryLength); + + return {filteredMatches[0]->genusId, score, coverage, (int) hammingSum}; +} + +TaxonScore Taxonomer::scoreGenus(vector &filteredMatches, + int readLength1, + int readLength2) { + + // Calculate Hamming distance & covered length + uint16_t currHammings; + int aminoAcidNum_total = ((int) readLength1 / 3) + ((int) readLength2 / 3); + int aminoAcidNum_read1 = ((int) readLength1 / 3); + int currPos; + size_t matchNum = filteredMatches.size(); + size_t f = 0; + + // Get the largest hamming distance at each position of query + auto *hammingsAtEachPos = new signed char[aminoAcidNum_total + 3]; + memset(hammingsAtEachPos, -1, (aminoAcidNum_total + 3)); + while (f < matchNum) { + currPos = (int) filteredMatches[f]->qInfo.pos / 3; + currHammings = filteredMatches[f]->rightEndHamming; + if (GET_2_BITS(currHammings) > hammingsAtEachPos[currPos + unmaskedPos[0]]) + hammingsAtEachPos[currPos + unmaskedPos[0]] = GET_2_BITS(currHammings); + if (GET_2_BITS(currHammings >> 2) > hammingsAtEachPos[currPos + unmaskedPos[1]]) + hammingsAtEachPos[currPos + unmaskedPos[1]] = GET_2_BITS(currHammings >> 2); + if (GET_2_BITS(currHammings >> 4) > hammingsAtEachPos[currPos + unmaskedPos[2]]) + hammingsAtEachPos[currPos + unmaskedPos[2]] = GET_2_BITS(currHammings >> 4); + if (GET_2_BITS(currHammings >> 6) > hammingsAtEachPos[currPos + unmaskedPos[3]]) + hammingsAtEachPos[currPos + unmaskedPos[3]] = GET_2_BITS(currHammings >> 6); + if (GET_2_BITS(currHammings >> 8) > hammingsAtEachPos[currPos + unmaskedPos[4]]) + hammingsAtEachPos[currPos + unmaskedPos[4]] = GET_2_BITS(currHammings >> 8); + if (GET_2_BITS(currHammings >> 10) > hammingsAtEachPos[currPos + unmaskedPos[5]]) + hammingsAtEachPos[currPos + unmaskedPos[5]] = GET_2_BITS(currHammings >> 10); + if (GET_2_BITS(currHammings >> 12) > hammingsAtEachPos[currPos + unmaskedPos[6]]) + hammingsAtEachPos[currPos + unmaskedPos[6]] = GET_2_BITS(currHammings >> 12); + if (GET_2_BITS(currHammings >> 14) > hammingsAtEachPos[currPos + unmaskedPos[7]]) + hammingsAtEachPos[currPos + unmaskedPos[7]] = GET_2_BITS(currHammings >> 14); + f++; + } + + // Sum up hamming distances and count the number of position covered by the matches. + float hammingSum = 0; + int coveredPosCnt_read1 = 0; + int coveredPosCnt_read2 = 0; + for (int h = 0; h < aminoAcidNum_total; h++) { + // Read 1 + if (h < aminoAcidNum_read1) { + if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist. + coveredPosCnt_read1++; + } else if (hammingsAtEachPos[h] != -1) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively + hammingSum += 1.0f + (0.5f * (float) hammingsAtEachPos[h]); + coveredPosCnt_read1++; + } + } + // Read 2 + else { + if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist. + coveredPosCnt_read2++; + } else if (hammingsAtEachPos[h] != -1) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. 
respectively + hammingSum += 1.0f + (0.5f * (float) hammingsAtEachPos[h]); + coveredPosCnt_read2++; + } + } + } + delete[] hammingsAtEachPos; + + // Score current genus + int coveredLength_read1 = coveredPosCnt_read1 * 3; + int coveredLength_read2 = coveredPosCnt_read2 * 3; + if (coveredLength_read1 > readLength1) coveredLength_read1 = readLength1; + if (coveredLength_read2 > readLength2) coveredLength_read2 = readLength2; + float score = + ((float) (coveredLength_read1 + coveredLength_read2) - hammingSum) / (float) (readLength1 + readLength2); + float coverage = (float) (coveredLength_read1 + coveredLength_read2) / (float) (readLength1 + readLength2); + +// matchesForEachGenus.push_back(move(filteredMatches)); + return {filteredMatches[0]->genusId, score, coverage, (int) hammingSum}; +} + +TaxonScore Taxonomer::chooseSpecies(const vector &matches, + int queryLength, + vector &species, + unordered_map> & speciesMatchRange) { + // Score each species + std::unordered_map speciesScores; + size_t i = 0; + TaxID currentSpeices; + size_t numOfMatch = matches.size(); + size_t speciesBegin, speciesEnd; + while (i < numOfMatch) { + currentSpeices = matches[i].speciesId; + speciesBegin = i; + while ((i < numOfMatch) && currentSpeices == matches[i].speciesId) { + i++; + } + speciesEnd = i; + speciesScores[currentSpeices] = scoreSpecies(matches, speciesBegin, speciesEnd, queryLength); + speciesMatchRange[currentSpeices] = {(int) speciesBegin, (int) speciesEnd}; + speciesScores[currentSpeices].taxId = currentSpeices; + } + + // Get the best species + TaxonScore bestScore; + for (auto & sp : speciesScores) { + if (sp.second.score > bestScore.score) { + species.clear(); + species.push_back(sp.first); + bestScore = sp.second; + } else if (sp.second.coverage == bestScore.coverage) { + species.push_back(sp.first); + } + } + return bestScore; +} + +TaxonScore Taxonomer::chooseSpecies(const vector &matches, + int read1Length, + int read2Length, + vector &species, + unordered_map> & speciesMatchRange) { + // Score each species + std::unordered_map speciesScores; + + + size_t i = 0; + TaxID currentSpeices; + size_t numOfMatch = matches.size(); + size_t speciesBegin, speciesEnd; + while (i < numOfMatch) { + currentSpeices = matches[i].speciesId; + speciesBegin = i; + while ((i < numOfMatch) && currentSpeices == matches[i].speciesId) { + i++; + } + speciesEnd = i; + speciesScores[currentSpeices] = scoreSpecies(matches, speciesBegin, speciesEnd, read1Length, read2Length); + speciesMatchRange[currentSpeices] = {(int) speciesBegin, (int) speciesEnd}; + speciesScores[currentSpeices].taxId = currentSpeices; + } + + // Get the best species + TaxonScore bestScore; + for (auto & sp : speciesScores) { + if (sp.second.score > bestScore.score) { + species.clear(); + species.push_back(sp.first); + bestScore = sp.second; + } else if (sp.second.coverage == bestScore.coverage) { + species.push_back(sp.first); + } + } + return bestScore; +} + +TaxonScore Taxonomer::scoreSpecies(const vector &matches, + size_t begin, + size_t end, + int queryLength) { + + // Get the largest hamming distance at each position of query + int aminoAcidNum = queryLength / 3; + auto *hammingsAtEachPos = new signed char[aminoAcidNum + 1]; + memset(hammingsAtEachPos, -1, (aminoAcidNum + 1)); + int currPos; + size_t walker = begin; + uint16_t currHammings; + while (walker < end) { + currPos = matches[walker].qInfo.pos / 3; + currHammings = matches[walker].rightEndHamming; + if (GET_2_BITS(currHammings) > hammingsAtEachPos[currPos + unmaskedPos[0]]) + 
hammingsAtEachPos[currPos + unmaskedPos[0]] = GET_2_BITS(currHammings); + if (GET_2_BITS(currHammings >> 2) > hammingsAtEachPos[currPos + unmaskedPos[1]]) + hammingsAtEachPos[currPos + unmaskedPos[1]] = GET_2_BITS(currHammings >> 2); + if (GET_2_BITS(currHammings >> 4) > hammingsAtEachPos[currPos + unmaskedPos[2]]) + hammingsAtEachPos[currPos + unmaskedPos[2]] = GET_2_BITS(currHammings >> 4); + if (GET_2_BITS(currHammings >> 6) > hammingsAtEachPos[currPos + unmaskedPos[3]]) + hammingsAtEachPos[currPos + unmaskedPos[3]] = GET_2_BITS(currHammings >> 6); + if (GET_2_BITS(currHammings >> 8) > hammingsAtEachPos[currPos + unmaskedPos[4]]) + hammingsAtEachPos[currPos + unmaskedPos[4]] = GET_2_BITS(currHammings >> 8); + if (GET_2_BITS(currHammings >> 10) > hammingsAtEachPos[currPos + unmaskedPos[5]]) + hammingsAtEachPos[currPos + unmaskedPos[5]] = GET_2_BITS(currHammings >> 10); + if (GET_2_BITS(currHammings >> 12) > hammingsAtEachPos[currPos + unmaskedPos[6]]) + hammingsAtEachPos[currPos + unmaskedPos[6]] = GET_2_BITS(currHammings >> 12); + if (GET_2_BITS(currHammings >> 14) > hammingsAtEachPos[currPos + unmaskedPos[7]]) + hammingsAtEachPos[currPos + unmaskedPos[7]] = GET_2_BITS(currHammings >> 14); + walker++; + } + + // Sum up hamming distances and count the number of position covered by the matches. + float hammingSum = 0; + int hammingDist = 0; + int coveredPosCnt = 0; + for (int h = 0; h < aminoAcidNum; h++) { + if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist. + coveredPosCnt++; + } else if (hammingsAtEachPos[h] != -1) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively + hammingSum += 1.0f + (0.5f * (float) hammingsAtEachPos[h]); + hammingDist += hammingsAtEachPos[h]; + coveredPosCnt++; + } + } + delete[] hammingsAtEachPos; + // Score + int coveredLength = coveredPosCnt * 3; + if (coveredLength >= queryLength) coveredLength = queryLength; + + float score = ((float)coveredLength - hammingSum) / (float) queryLength; + float coverage = (float) coveredLength / (float) (queryLength); + + return {0, score, coverage, hammingDist}; +} + +TaxonScore Taxonomer::scoreSpecies(const vector &matches, + size_t begin, + size_t end, + int queryLength, + int queryLength2) { + + // Get the smallest hamming distance at each position of query + int aminoAcidNum_total = queryLength / 3 + queryLength2 / 3; + int aminoAcidNum_read1 = queryLength / 3; + auto *hammingsAtEachPos = new signed char[aminoAcidNum_total + 3]; + memset(hammingsAtEachPos, -1, (aminoAcidNum_total + 3)); + + int currPos; + size_t walker = begin; + uint16_t currHammings; + + while (walker < end) { + currPos = matches[walker].qInfo.pos / 3; + currHammings = matches[walker].rightEndHamming; + if (GET_2_BITS(currHammings) > hammingsAtEachPos[currPos + unmaskedPos[0]]) + hammingsAtEachPos[currPos + unmaskedPos[0]] = GET_2_BITS(currHammings); + if (GET_2_BITS(currHammings >> 2) > hammingsAtEachPos[currPos + unmaskedPos[1]]) + hammingsAtEachPos[currPos + unmaskedPos[1]] = GET_2_BITS(currHammings >> 2); + if (GET_2_BITS(currHammings >> 4) > hammingsAtEachPos[currPos + unmaskedPos[2]]) + hammingsAtEachPos[currPos + unmaskedPos[2]] = GET_2_BITS(currHammings >> 4); + if (GET_2_BITS(currHammings >> 6) > hammingsAtEachPos[currPos + unmaskedPos[3]]) + hammingsAtEachPos[currPos + unmaskedPos[3]] = GET_2_BITS(currHammings >> 6); + if (GET_2_BITS(currHammings >> 8) > hammingsAtEachPos[currPos + unmaskedPos[4]]) + hammingsAtEachPos[currPos + unmaskedPos[4]] = GET_2_BITS(currHammings >> 8); + if (GET_2_BITS(currHammings >> 10) > 
hammingsAtEachPos[currPos + unmaskedPos[5]]) + hammingsAtEachPos[currPos + unmaskedPos[5]] = GET_2_BITS(currHammings >> 10); + if (GET_2_BITS(currHammings >> 12) > hammingsAtEachPos[currPos + unmaskedPos[6]]) + hammingsAtEachPos[currPos + unmaskedPos[6]] = GET_2_BITS(currHammings >> 12); + if (GET_2_BITS(currHammings >> 14) > hammingsAtEachPos[currPos + unmaskedPos[7]]) + hammingsAtEachPos[currPos + unmaskedPos[7]] = GET_2_BITS(currHammings >> 14); + walker++; + } + + // Sum up hamming distances and count the number of position covered by the matches. + float hammingSum = 0; + int hammingDist = 0; + int coveredPosCnt_read1 = 0; + int coveredPosCnt_read2 = 0; + for (int h = 0; h < aminoAcidNum_total; h++) { + // Read 1 + if (h < aminoAcidNum_read1) { + if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist. + coveredPosCnt_read1++; + } else if (hammingsAtEachPos[h] != -1) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively + hammingSum += 1.0f + (0.5f * (float) hammingsAtEachPos[h]); + hammingDist += hammingsAtEachPos[h]; + coveredPosCnt_read1++; + } + } + // Read 2 + else { + if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist. + coveredPosCnt_read2++; + } else if (hammingsAtEachPos[h] != -1) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively + hammingSum += 1.0f + (0.5f * (float) hammingsAtEachPos[h]); + hammingDist += hammingsAtEachPos[h]; + coveredPosCnt_read2++; + } + } + } + delete[] hammingsAtEachPos; + + // Score + int coveredLength_read1 = coveredPosCnt_read1 * 3; + int coveredLength_read2 = coveredPosCnt_read2 * 3; + if (coveredLength_read1 >= queryLength) coveredLength_read1 = queryLength; + if (coveredLength_read2 >= queryLength2) coveredLength_read2 = queryLength2; + + float score = ((float) (coveredLength_read1 + coveredLength_read2) - hammingSum) / (float) (queryLength + queryLength2); + float coverage = (float) (coveredLength_read1 + coveredLength_read2) / (float) (queryLength + queryLength2); + + return {0, score, coverage, hammingDist}; +} + +bool Taxonomer::isConsecutive(const Match * match1, const Match * match2) { + return (match1->rightEndHamming >> 2) == (match2->rightEndHamming & 0x3FFF); +} \ No newline at end of file diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h new file mode 100644 index 00000000..d7de78ac --- /dev/null +++ b/src/commons/Taxonomer.h @@ -0,0 +1,133 @@ +#ifndef METABULI_TAXONOMER_H +#define METABULI_TAXONOMER_H +#include "NcbiTaxonomy.h" +#include "LocalParameters.h" +#include "Match.h" +#include "common.h" +#include "BitManipulateMacros.h" +#include + +using namespace std; + +struct TaxonScore { + TaxID taxId; + float score; + float coverage; + int hammingDist; + TaxonScore(TaxID taxId, float score, float coverage, int hammingDist) : + taxId(taxId), score(score), coverage(coverage), hammingDist(hammingDist) {} + TaxonScore() : taxId(0), score(0.0f), coverage(0.0f), hammingDist(0) {} +}; + +class Taxonomer { +private: + NcbiTaxonomy * taxonomy; + + // spaced k-mer + int unmaskedPos[9]; + int spaceNum; + + // Parameters + int maxGap; + int minCoveredPos; + + struct MatchBlock { + MatchBlock(size_t start, size_t end, int id) : start(start), end(end), id(id) {} + MatchBlock() : start(0), end(0), id(0) {} + size_t start; + size_t end; + uint32_t id; + }; + + + + // Output + unordered_map taxCounts; + + +public: + Taxonomer(const LocalParameters & par, NcbiTaxonomy * taxonomy); + ~Taxonomer(); + + void assignTaxonomy(const Match *matchList, + size_t numOfMatches, + std::vector & queryList, + const 
LocalParameters &par); + + void chooseBestTaxon(uint32_t currentQuery, + size_t offset, + size_t end, + const Match *matchList, + vector & queryList, + const LocalParameters &par); + + void remainConsecutiveMatches(vector & curFrameMatches, + vector & filteredMatches, + TaxID genusId, + const LocalParameters & par); + + size_t DFS(size_t curMatchIdx, const map>& linkedMatches, + vector& fiteredMatchIdx, size_t depth, size_t MIN_DEPTH, unordered_set& used, + unordered_map & idx2depth); + + static bool isConsecutive(const Match * match1, const Match * match2); + + TaxonScore getBestGenusMatches(vector &matchesForMajorityLCA, const Match *matchList, size_t end, + size_t offset, int queryLength, const LocalParameters &par); + + TaxonScore getBestGenusMatches(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, + int readLength1, int readLength2, const LocalParameters &par); + + TaxonScore getBestGenusMatches_spaced(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, + int readLength1, int readLength2); + TaxonScore getBestGenusMatches_spaced(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, + int readLength1); + + TaxonScore scoreGenus(vector &filteredMatches, + int queryLength); + + TaxonScore scoreGenus(vector &filteredMatches, + int readLength1, + int readLength2); + + void scoreGenus_ExtensionScore(vector &filteredMatches, + vector> &matchesForEachGenus, + vector &scoreOfEachGenus, + int readLength1, int readLength2); + + TaxonScore chooseSpecies(const std::vector &matches, + int queryLength, + vector &species, + unordered_map> & speciesMatchRange); + + TaxonScore chooseSpecies(const std::vector &matches, + int read1Length, + int read2Length, + vector &species, + unordered_map> & speciesMatchRange); + + TaxonScore scoreSpecies(const vector &matches, + size_t begin, + size_t end, + int queryLength); + + TaxonScore scoreSpecies(const vector &matches, + size_t begin, + size_t end, + int queryLength, + int queryLength2); + + TaxID lowerRankClassification(vector &matches, pair &matchRange, TaxID speciesID); + + void getSpeciesCladeCounts(const unordered_map & taxCnt, + unordered_map & cladeCnt, + TaxID spciesID); + + TaxID BFS(const unordered_map & cladeCnt, TaxID root); + + // Getters + unordered_map & getTaxCounts() { return taxCounts; } +}; + + +#endif //METABULI_TAXONOMER_H diff --git a/src/commons/common.h b/src/commons/common.h index 615b3af5..9b499da2 100644 --- a/src/commons/common.h +++ b/src/commons/common.h @@ -6,6 +6,7 @@ #define likely(x) __builtin_expect((x),1) #define unlikely(x) __builtin_expect((x),0) +#define kmerLength 8 struct SequenceBlock{ SequenceBlock(size_t start, size_t end, size_t length, size_t seqLength = 0) @@ -44,6 +45,31 @@ struct Query{ queryLength2(0), kmerCnt(0), isClassified(false), newSpecies(false) {} }; +template +struct Buffer { + T *buffer; + size_t startIndexOfReserve; + size_t bufferSize; + + explicit Buffer(size_t sizeOfBuffer=100) { + buffer = (T *) malloc(sizeof(T) * sizeOfBuffer); + bufferSize = sizeOfBuffer; + startIndexOfReserve = 0; + }; + + size_t reserveMemory(size_t numOfKmer) { + size_t offsetToWrite = __sync_fetch_and_add(&startIndexOfReserve, numOfKmer); + return offsetToWrite; + }; + + void reallocateMemory(size_t sizeOfBuffer) { + if (sizeOfBuffer > bufferSize) { + buffer = (T *) realloc(buffer, sizeof(T) * sizeOfBuffer); + bufferSize = sizeOfBuffer; + } + }; +}; + inline bool fileExist(const std::string& name) { if (FILE *file = 
fopen(name.c_str(), "r")) { fclose(file); diff --git a/src/metabuli.cpp b/src/metabuli.cpp index f41040d1..8918c64e 100644 --- a/src/metabuli.cpp +++ b/src/metabuli.cpp @@ -12,7 +12,7 @@ const char* tool_name = "metabuli"; const char* tool_introduction = "Metabuli is a taxonomical classifier that jointly analyzes amino acid and DNA sequences."; const char* main_author = "Jaebeom Kim "; const char* show_extended_help = "1"; -const char* show_bash_info = NULL; +const char* show_bash_info = nullptr; bool hide_base_commands = true; extern const char* MMSEQS_CURRENT_INDEX_VERSION; const char* index_version_compatible = MMSEQS_CURRENT_INDEX_VERSION; @@ -26,7 +26,7 @@ LocalParameters& localPar = LocalParameters::getLocalInstance(); std::vector commands = { {"databases", databases, &localPar.databases, COMMAND_DATABASE_CREATION, "List and download databases", - NULL, + nullptr, "Milot Mirdita ", " ", CITATION_SPACEPHARER, {{"selection", 0, DbType::ZERO_OR_ALL, &DbValidator::empty }, @@ -34,7 +34,7 @@ std::vector commands = { {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}}, {"build", build, &localPar.build, COMMAND_DATABASE_CREATION, "Build database based on the list of FASTA files.", - NULL, + nullptr, "Jaebeom Kim ", " ", CITATION_SPACEPHARER, @@ -43,14 +43,14 @@ std::vector commands = { {"Mapping file (accession to tax ID)", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}, {"database-report", databaseReport, &localPar.databaseReport, COMMAND_DATABASE_CREATION, "It generates a report of taxa in a database.", - NULL, + nullptr, "Jaebeom Kim ", " ", CITATION_SPACEPHARER, {{"Directory where the DB will be generated", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}}}, - {"updateDB", build, &localPar.build, COMMAND_DATABASE_CREATION, + {"updateDB", build, &localPar.build, COMMAND_DB, "Update database based on the list of FASTA files.", - NULL, + nullptr, "Jaebeom Kim ", " ", CITATION_SPACEPHARER, @@ -59,18 +59,25 @@ std::vector commands = { {"Mapping file (accession to tax ID)", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}, {"classify", classify, &localPar.classify, COMMAND_TAXONOMY, "Assigning taxonomy label to query reads", - NULL, + nullptr, "Jaebeom Kim ", - " ", + " ", CITATION_SPACEPHARER, {{"FASTA", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfile}, {"DB dir", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}, {"out dir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}, {"job ID", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}, - + {"filter", classify, &localPar.classify, COMMAND_TAXONOMY, + "Filtering reads based on the classification result", + nullptr, + "Jaebeom Kim ", + " ", + CITATION_SPACEPHARER, + {{"READ FILE", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfile}, + {"FILTER DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}}}, {"grade", grade, &localPar.grade, COMMAND_EXPERT, "Grade the classification result (only for benchmarking)", - NULL, + nullptr, "Jaebeom Kim ", " ", CITATION_SPACEPHARER, @@ -79,7 +86,7 @@ std::vector commands = { {"taxonomy dir", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::directory}}}, {"seqHeader2TaxId", seqHeader2TaxId, &localPar.seqHeader2TaxId, COMMAND_EXPERT, "It extracts k-mers from query sequences, and compares them to the target database", 
- NULL, + nullptr, "Jaebeom Kim ", " ", CITATION_SPACEPHARER, @@ -88,7 +95,7 @@ std::vector commands = { {"add-to-library", addToLibrary, &localPar.addToLibrary, COMMAND_DATABASE_CREATION, "It bins sequences into distinct files according to their species referring their accession number.\n " "It requires a mapping file (accession to tax ID) and NCBI style tax dump files in a taxonomy directory.", - NULL, + nullptr, "Jaebeom Kim ", " ", CITATION_SPACEPHARER, @@ -97,7 +104,7 @@ std::vector commands = { {"DB directory", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}}}, {"apply-threshold", applyThreshold, &localPar.applyThreshold, COMMAND_MAIN, "Assigning taxonomy label to query reads", - NULL, + nullptr, "Jaebeom Kim ", " ", CITATION_SPACEPHARER, @@ -107,7 +114,7 @@ std::vector commands = { {"TAXONOMY DIR", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}}}, {"binning2report", binning2report, &localPar.binning2report, COMMAND_FORMAT_CONVERSION, "It generates Kraken style report file from binning results", - NULL, + nullptr, "Jaebeom Kim ", " ", CITATION_SPACEPHARER, @@ -117,7 +124,7 @@ std::vector commands = { {"TAXONOMY DIR", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}}}, {"filter-by-genus", filterByGenus, &localPar.filterByGenus, COMMAND_EXPERT, "It filters out reads classified as a specific genus", - NULL, + nullptr, "Jaebeom Kim ", " ", CITATION_SPACEPHARER, diff --git a/src/workflow/CMakeLists.txt b/src/workflow/CMakeLists.txt index 6f4819ad..66d1fb0c 100644 --- a/src/workflow/CMakeLists.txt +++ b/src/workflow/CMakeLists.txt @@ -3,4 +3,5 @@ set(workflow_source_files workflow/updateDB.cpp workflow/add_to_library.cpp workflow/build.cpp + workflow/filter.cpp PARENT_SCOPE) \ No newline at end of file diff --git a/src/workflow/classify.cpp b/src/workflow/classify.cpp index e88a4aa4..21a45f17 100644 --- a/src/workflow/classify.cpp +++ b/src/workflow/classify.cpp @@ -1,13 +1,10 @@ #include "Classifier.h" -#include "ReducedClassifier.h" #include "Parameters.h" #include "LocalParameters.h" -#include "NcbiTaxonomy.h" #include "FileUtil.h" void setClassifyDefaults(LocalParameters & par){ par.seqMode = 2; - par.memoryMode = 1; par.reducedAA = 0; par.minScore = 0; par.minCoverage = 0; @@ -49,13 +46,7 @@ int classify(int argc, const char **argv, const Command& command) #endif cout << "Number of threads: " << par.threads << endl; - Classifier * classifier; - if(par.reducedAA == 1){ - classifier = new ReducedClassifier(par); - } else { - classifier = new Classifier(par); - } - + Classifier * classifier = new Classifier(par); classifier->startClassify(par); delete classifier; return 0; diff --git a/src/workflow/filter.cpp b/src/workflow/filter.cpp new file mode 100644 index 00000000..c8e3894a --- /dev/null +++ b/src/workflow/filter.cpp @@ -0,0 +1,57 @@ +#include "Classifier.h" +#include "Parameters.h" +#include "LocalParameters.h" +#include "FileUtil.h" +#include "QueryFilter.h" + +void setFilterDefaults(LocalParameters & par){ + par.seqMode = 2; + par.reducedAA = 0; + par.minScore = 0.7; + par.minCoverage = 0; + par.minSpScore = 0; + par.spaceMask = "11111111"; + par.hammingMargin = 0; + par.verbosity = 3; + par.ramUsage = 128; + par.minCoveredPos = 4; + par.printLog = 0; + par.maxGap = 0; + par.taxonomyPath = "DBDIR/taxonomy/" ; + par.minConsCnt = 4; + par.minConsCntEuk = 9; + par.eukaryotaTaxId = 2759; + par.maskMode = 0; + par.maskProb = 0.9; + par.matchPerKmer = 4; +} + +int filter(int argc, const char **argv, const Command& 
command) +{ + LocalParameters & par = LocalParameters::getLocalInstance(); + setFilterDefaults(par); + par.parseParameters(argc, argv, command, true, Parameters::PARSE_ALLOW_EMPTY, 0); + + if (par.seqMode == 2) { + if (!FileUtil::directoryExists(par.filenames[3].c_str())) { + FileUtil::makeDir(par.filenames[3].c_str()); + } + } else { + if (!FileUtil::directoryExists(par.filenames[2].c_str())) { + FileUtil::makeDir(par.filenames[2].c_str()); + } + } + +#ifdef OPENMP + omp_set_num_threads(par.threads); +#endif + + cout << "Number of threads: " << par.threads << endl; + + QueryFilter * queryFilter = new QueryFilter(par); + + queryFilter->startClassify(par); + + delete classifier; + return 0; +} \ No newline at end of file From 51f12bf980870829cd4b31bcfe9ad6cf2e4e0956 Mon Sep 17 00:00:00 2001 From: JaebeomKim0731 Date: Fri, 18 Aug 2023 11:33:21 +0900 Subject: [PATCH 07/65] fix compile errors --- src/commons/KmerBuffer.h | 2 +- src/commons/KmerExtractor.h | 2 +- src/commons/KmerMatcher.h | 3 ++- src/commons/LocalUtil.cpp | 2 +- src/commons/LocalUtil.h | 4 ++-- src/commons/Match.h | 4 ++-- src/commons/QueryFilter.cpp | 6 ------ 7 files changed, 9 insertions(+), 14 deletions(-) diff --git a/src/commons/KmerBuffer.h b/src/commons/KmerBuffer.h index fd43cd9a..d5fe0d66 100644 --- a/src/commons/KmerBuffer.h +++ b/src/commons/KmerBuffer.h @@ -68,7 +68,7 @@ class TargetKmerBuffer{ static size_t getTargetKmerBufferSize(){ size_t memLimit = Util::getTotalSystemMemory() * 0.5; size_t bufferSize = memLimit / sizeof(TargetKmer); - cout< 10000000000){ bufferSize = 10000000000; } diff --git a/src/commons/KmerExtractor.h b/src/commons/KmerExtractor.h index 260bc78c..fe862bdb 100644 --- a/src/commons/KmerExtractor.h +++ b/src/commons/KmerExtractor.h @@ -2,7 +2,7 @@ #define METABULI_KMEREXTRACTER_H #include "SeqIterator.h" #include "QueryIndexer.h" -#include "KseqWrapper.h" +#include "KSeqWrapper.h" class KmerExtractor { private: diff --git a/src/commons/KmerMatcher.h b/src/commons/KmerMatcher.h index 56379b7a..183f916f 100644 --- a/src/commons/KmerMatcher.h +++ b/src/commons/KmerMatcher.h @@ -9,6 +9,7 @@ #include "Mmap.h" #include "BitManipulateMacros.h" #include "NcbiTaxonomy.h" +#include "unordered_map" #define BufferSize 16'777'216 //16 * 1024 * 1024 // 16 M @@ -19,7 +20,7 @@ // Output // 1. 
Matched K-mers - +using namespace std; class KmerMatcher { protected: diff --git a/src/commons/LocalUtil.cpp b/src/commons/LocalUtil.cpp index 039c635a..8dd6d268 100644 --- a/src/commons/LocalUtil.cpp +++ b/src/commons/LocalUtil.cpp @@ -1,7 +1,7 @@ #include "LocalUtil.h" -std::string LocalUtil::getQueryBaseName(const std::string queryPath) { +std::string LocalUtil::getQueryBaseName(const std::string & queryPath) { std::vector splits = Util::split(queryPath, "."); std::string baseName; int extentionNum = 1; diff --git a/src/commons/LocalUtil.h b/src/commons/LocalUtil.h index 1d34a45c..ec2567cf 100644 --- a/src/commons/LocalUtil.h +++ b/src/commons/LocalUtil.h @@ -10,12 +10,12 @@ class LocalUtil : public Util { public: LocalUtil() = default; - static std::string getQueryBaseName(const std::string queryPath); + static std::string getQueryBaseName(const std::string & queryPath); template static T getQueryKmerNumber(T queryLength, int spaceNum); - static void splitQueryFile(vector & seqSegments, const string & queryPath); + static void splitQueryFile(std::vector & seqSegments, const string & queryPath); }; diff --git a/src/commons/Match.h b/src/commons/Match.h index 5d8cf503..da8dcdb4 100644 --- a/src/commons/Match.h +++ b/src/commons/Match.h @@ -33,8 +33,8 @@ struct Match { // 24 byte bool redundancy; // 1 void printMatch() const { - cout << qInfo.sequenceID << " " << qInfo.pos << " " << qInfo.frame << " " - << targetId << " " << genusId << " " << speciesId << " " << rightEndHamming << " " << (int)hamming << endl; + std::cout << qInfo.sequenceID << " " << qInfo.pos << " " << qInfo.frame << " " + << targetId << " " << genusId << " " << speciesId << " " << rightEndHamming << " " << (int)hamming << std::endl; } }; diff --git a/src/commons/QueryFilter.cpp b/src/commons/QueryFilter.cpp index 636f65d2..863baea3 100644 --- a/src/commons/QueryFilter.cpp +++ b/src/commons/QueryFilter.cpp @@ -1,11 +1,6 @@ #include "QueryFilter.h" QueryFilter::QueryFilter(LocalParameters & par) { - if (par.reducedAA == 1) { - classifier = new ReducedClassifier(par); - } else { - classifier = new Classifier(par); - } queryIndexer = new QueryIndexer(par); setInputAndOutputFiles(par); @@ -13,7 +8,6 @@ QueryFilter::QueryFilter(LocalParameters & par) { QueryFilter::~QueryFilter() { delete queryIndexer; - delete classifier; } void QueryFilter::setInputAndOutputFiles(const LocalParameters & par) { From 7fbfe711b81ab1336a009ba8b8b03187eb7eff26 Mon Sep 17 00:00:00 2001 From: JaebeomKim0731 Date: Fri, 18 Aug 2023 11:36:52 +0900 Subject: [PATCH 08/65] fix compile errors --- src/commons/KmerMatcher.cpp | 4 ++-- src/commons/LocalUtil.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp index 7117e3ce..3d15b6d2 100644 --- a/src/commons/KmerMatcher.cpp +++ b/src/commons/KmerMatcher.cpp @@ -149,7 +149,7 @@ int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer, Buffer * m while (completedSplitCnt < threads) { bool hasOverflow = false; #pragma omp parallel default(none), shared(completedSplitCnt, splitCheckList, hasOverflow, \ -querySplits, queryKmerList, matchBuffer, cout, par, targetDiffIdxFileName, numOfDiffIdx, targetInfoFileName, targetSplitIdxs) +querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffIdx, targetInfoFileName, targetSplitIdxs) { // FILE FILE * diffIdxFp = fopen(targetDiffIdxFileName.c_str(), "rb"); @@ -409,7 +409,7 @@ querySplits, queryKmerList, matchBuffer, cout, par, targetDiffIdxFileName, numOf queryKmerNum = 
0; #ifdef OPENMP - omp_set_num_threads(par.threads); + omp_set_num_threads(threads); #endif // Sort matches diff --git a/src/commons/LocalUtil.h b/src/commons/LocalUtil.h index ec2567cf..0f7ff0d2 100644 --- a/src/commons/LocalUtil.h +++ b/src/commons/LocalUtil.h @@ -15,7 +15,7 @@ class LocalUtil : public Util { template static T getQueryKmerNumber(T queryLength, int spaceNum); - static void splitQueryFile(std::vector & seqSegments, const string & queryPath); + static void splitQueryFile(std::vector & seqSegments, const std::string & queryPath); }; From 3f994f237427c896af55f9c138963569fcf497a3 Mon Sep 17 00:00:00 2001 From: JaebeomKim0731 Date: Fri, 18 Aug 2023 11:37:46 +0900 Subject: [PATCH 09/65] fix compile errors --- src/workflow/filter.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/workflow/filter.cpp b/src/workflow/filter.cpp index c8e3894a..fd02afcf 100644 --- a/src/workflow/filter.cpp +++ b/src/workflow/filter.cpp @@ -48,10 +48,6 @@ int filter(int argc, const char **argv, const Command& command) cout << "Number of threads: " << par.threads << endl; - QueryFilter * queryFilter = new QueryFilter(par); - queryFilter->startClassify(par); - - delete classifier; return 0; } \ No newline at end of file From 5e8b8806495c19b78846028137f8f3b32de97a43 Mon Sep 17 00:00:00 2001 From: JaebeomKim0731 Date: Fri, 18 Aug 2023 11:41:09 +0900 Subject: [PATCH 10/65] fix compile errors --- src/commons/Classifier.cpp | 2 -- src/commons/Classifier.h | 28 ---------------------------- 2 files changed, 30 deletions(-) diff --git a/src/commons/Classifier.cpp b/src/commons/Classifier.cpp index eebbc63f..09a53dfa 100644 --- a/src/commons/Classifier.cpp +++ b/src/commons/Classifier.cpp @@ -1,6 +1,4 @@ #include "Classifier.h" -#include "LocalParameters.h" -#include "taxonomyreport.cpp" Classifier::Classifier(LocalParameters & par) { // Load parameters diff --git a/src/commons/Classifier.h b/src/commons/Classifier.h index 5cf61ce9..81a0b027 100644 --- a/src/commons/Classifier.h +++ b/src/commons/Classifier.h @@ -49,9 +49,6 @@ class Classifier { Reporter * reporter; NcbiTaxonomy * taxonomy; - - - public: void startClassify(const LocalParameters &par); @@ -59,32 +56,7 @@ class Classifier { virtual ~Classifier(); - }; - - -//inline uint64_t -//Classifier::getNextTargetKmer(uint64_t lookingTarget, const uint16_t *targetDiffIdxList, size_t &diffIdxPos) { -// uint16_t fragment; -// uint16_t check = (0x1u << 15u); -// uint64_t diffIn64bit = 0; -// fragment = targetDiffIdxList[diffIdxPos]; -// diffIdxPos++; -// while (!(fragment & check)) { // 27 % -// diffIn64bit |= fragment; -// diffIn64bit <<= 15u; -// fragment = targetDiffIdxList[diffIdxPos]; -// diffIdxPos++; -// } -// fragment &= ~check; // not; 8.47 % -// diffIn64bit |= fragment; // or : 23.6% -// -// return diffIn64bit + lookingTarget; -//} - - - - #endif //ADKMER4_SEARCHER_H From ba571e9d803c8522f53ae592fcef5d1d81878fa0 Mon Sep 17 00:00:00 2001 From: JaebeomKim0731 Date: Fri, 18 Aug 2023 11:43:42 +0900 Subject: [PATCH 11/65] fix compile errors --- src/commons/LocalUtil.cpp | 4 ---- src/commons/LocalUtil.h | 6 ++++++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/commons/LocalUtil.cpp b/src/commons/LocalUtil.cpp index 8dd6d268..6498f75b 100644 --- a/src/commons/LocalUtil.cpp +++ b/src/commons/LocalUtil.cpp @@ -18,10 +18,6 @@ std::string LocalUtil::getQueryBaseName(const std::string & queryPath) { return baseName; } -template -T LocalUtil::getQueryKmerNumber(T queryLength, int spaceNum) { - return (getMaxCoveredLength(queryLength) / 
3 - kmerLength - spaceNum + 1) * 6; -} void LocalUtil::splitQueryFile(std::vector & sequences, const std::string &queryPath) { diff --git a/src/commons/LocalUtil.h b/src/commons/LocalUtil.h index 0f7ff0d2..550d34d2 100644 --- a/src/commons/LocalUtil.h +++ b/src/commons/LocalUtil.h @@ -20,4 +20,10 @@ class LocalUtil : public Util { }; +template +T LocalUtil::getQueryKmerNumber(T queryLength, int spaceNum) { + return (getMaxCoveredLength(queryLength) / 3 - kmerLength - spaceNum + 1) * 6; +} + + #endif //METABULI_LOCALUTIL_H From ec780c5eeef9ad8c557e4ec16a30b55539c5a481 Mon Sep 17 00:00:00 2001 From: JaebeomKim0731 Date: Fri, 18 Aug 2023 11:45:43 +0900 Subject: [PATCH 12/65] fix compile errors --- src/commons/KmerExtractor.cpp | 13 ++----------- src/commons/KmerExtractor.h | 2 +- src/commons/LocalUtil.cpp | 10 +++++++++- src/commons/LocalUtil.h | 1 + 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/commons/KmerExtractor.cpp b/src/commons/KmerExtractor.cpp index f8860ec9..a72521da 100644 --- a/src/commons/KmerExtractor.cpp +++ b/src/commons/KmerExtractor.cpp @@ -142,8 +142,8 @@ void KmerExtractor::fillQueryKmerBufferParallel_paired(KSeqWrapper *kseq1, int kmerCnt2 = LocalUtil::getQueryKmerNumber((int) e2.sequence.l, spaceNum); // Query Info - queryList[processedQueryNum].queryLength = getMaxCoveredLength((int) e1.sequence.l); - queryList[processedQueryNum].queryLength2 = getMaxCoveredLength((int) e2.sequence.l); + queryList[processedQueryNum].queryLength = LocalUtil::getMaxCoveredLength((int) e1.sequence.l); + queryList[processedQueryNum].queryLength2 = LocalUtil::getMaxCoveredLength((int) e2.sequence.l); queryList[processedQueryNum].name = string(e1.name.s); queryList[processedQueryNum].kmerCnt = (int) (kmerCnt + kmerCnt2); @@ -205,12 +205,3 @@ void KmerExtractor::fillQueryKmerBufferParallel_paired(KSeqWrapper *kseq1, } } -int KmerExtractor::getMaxCoveredLength(int queryLength) { - if (queryLength % 3 == 2) { - return queryLength - 2; // 2 - } else if (queryLength % 3 == 1) { - return queryLength - 4; // 4 - } else { - return queryLength - 3; // 3 - } -} \ No newline at end of file diff --git a/src/commons/KmerExtractor.h b/src/commons/KmerExtractor.h index fe862bdb..2e7f2977 100644 --- a/src/commons/KmerExtractor.h +++ b/src/commons/KmerExtractor.h @@ -29,7 +29,7 @@ class KmerExtractor { const QuerySplit & currentSplit, const LocalParameters &par); - static int getMaxCoveredLength(int queryLength) ; + public: explicit KmerExtractor(const LocalParameters & par); diff --git a/src/commons/LocalUtil.cpp b/src/commons/LocalUtil.cpp index 6498f75b..08e04508 100644 --- a/src/commons/LocalUtil.cpp +++ b/src/commons/LocalUtil.cpp @@ -33,4 +33,12 @@ void LocalUtil::splitQueryFile(std::vector & sequences, const std delete kseq; } - +int LocalUtil::getMaxCoveredLength(int queryLength) { + if (queryLength % 3 == 2) { + return queryLength - 2; // 2 + } else if (queryLength % 3 == 1) { + return queryLength - 4; // 4 + } else { + return queryLength - 3; // 3 + } +} \ No newline at end of file diff --git a/src/commons/LocalUtil.h b/src/commons/LocalUtil.h index 550d34d2..0fcbdf82 100644 --- a/src/commons/LocalUtil.h +++ b/src/commons/LocalUtil.h @@ -17,6 +17,7 @@ class LocalUtil : public Util { static void splitQueryFile(std::vector & seqSegments, const std::string & queryPath); + static int getMaxCoveredLength(int queryLength) ; }; From bf2633ef21740e524e83c5c321ae4d8d64e9fa43 Mon Sep 17 00:00:00 2001 From: JaebeomKim0731 Date: Fri, 18 Aug 2023 11:46:20 +0900 Subject: [PATCH 13/65] fix 
compile errors --- src/commons/KmerExtractor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/commons/KmerExtractor.cpp b/src/commons/KmerExtractor.cpp index a72521da..2488051f 100644 --- a/src/commons/KmerExtractor.cpp +++ b/src/commons/KmerExtractor.cpp @@ -65,7 +65,7 @@ void KmerExtractor::fillQueryKmerBufferParallel(KSeqWrapper *kseq1, int kmerCnt = LocalUtil::getQueryKmerNumber((int) e1.sequence.l, spaceNum); // Query Info - queryList[processedQueryNum].queryLength = getMaxCoveredLength((int) e1.sequence.l); + queryList[processedQueryNum].queryLength = LocalUtil::getMaxCoveredLength((int) e1.sequence.l); queryList[processedQueryNum].name = string(e1.name.s); queryList[processedQueryNum].kmerCnt = (int) (kmerCnt); From 5eac1d1036af8f4053007340baabb10fb30642d5 Mon Sep 17 00:00:00 2001 From: JaebeomKim0731 Date: Fri, 18 Aug 2023 12:23:12 +0900 Subject: [PATCH 14/65] Fix error --- src/commons/QueryIndexer.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/commons/QueryIndexer.cpp b/src/commons/QueryIndexer.cpp index f5a30ff9..ad0f7f86 100644 --- a/src/commons/QueryIndexer.cpp +++ b/src/commons/QueryIndexer.cpp @@ -17,6 +17,7 @@ QueryIndexer::QueryIndexer(const LocalParameters & par) { readNum_1 = 0; readNum_2 = 0; spaceNum = par.spaceMask.length() - kmerLength; + totalReadLength = 0; setAvailableRam(); } From 7a74d115649e6d9ef5743fd611ab698ab3883bf5 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Fri, 18 Aug 2023 16:53:22 +0900 Subject: [PATCH 15/65] Fix run time errors --- src/commons/Classifier.cpp | 5 ++ src/commons/KmerExtractor.cpp | 2 + src/commons/KmerMatcher.cpp | 54 ++++++++++++++------ src/commons/KmerMatcher.h | 35 ++++++------- src/commons/LocalParameters.h | 2 +- src/commons/QueryIndexer.cpp | 93 +++++++++++++++++++++-------------- src/commons/Taxonomer.cpp | 2 +- 7 files changed, 124 insertions(+), 69 deletions(-) diff --git a/src/commons/Classifier.cpp b/src/commons/Classifier.cpp index 09a53dfa..3f157fd3 100644 --- a/src/commons/Classifier.cpp +++ b/src/commons/Classifier.cpp @@ -43,6 +43,11 @@ void Classifier::startClassify(const LocalParameters &par) { cout << "Total number of sequences: " << numOfSeq << endl; cout << "Total read length: " << totalReadLength << "nt" << endl; + // Print queryReadSplit + for (size_t i = 0; i < queryReadSplit.size(); i++) { + cout << queryReadSplit[i].start << " " << queryReadSplit[i].end << " " << queryReadSplit[i].kmerCnt << endl; + } + QueryKmerBuffer kmerBuffer; Buffer matchBuffer; vector queryList; diff --git a/src/commons/KmerExtractor.cpp b/src/commons/KmerExtractor.cpp index 2488051f..91f5ee8f 100644 --- a/src/commons/KmerExtractor.cpp +++ b/src/commons/KmerExtractor.cpp @@ -4,6 +4,8 @@ KmerExtractor::KmerExtractor(const LocalParameters &par) { spaceNum = par.spaceMask.length() - 8; maskMode = par.maskMode; maskProb = par.maskProb; + subMat = new NucleotideMatrix(par.scoringMatrixFile.values.nucleotide().c_str(), 1.0, 0.0); + probMatrix = new ProbabilityMatrix(*(subMat)); } KmerExtractor::~KmerExtractor() { diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp index 3d15b6d2..8486538a 100644 --- a/src/commons/KmerMatcher.cpp +++ b/src/commons/KmerMatcher.cpp @@ -22,7 +22,6 @@ KmerMatcher::KmerMatcher(const LocalParameters & par, return; } char taxID[100]; - while(feof(taxIdFile) == 0) { fscanf(taxIdFile,"%s",taxID); TaxID taxId = atol(taxID); @@ -88,8 +87,7 @@ int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer, Buffer * m // Each split has start and end points of query 
list + proper offset point of target k-mer list std::vector querySplits; uint64_t queryAA; - std::vector targetSplitIdxs; - + if (threads == 1) { //Single thread querySplits.emplace_back(0, queryKmerNum - 1, queryKmerNum, diffIdxSplits.data[0]); } else if (threads == 2) { //Two threads @@ -121,7 +119,6 @@ int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer, Buffer * m querySplits.emplace_back(splitWidth * i, queryKmerNum - 1, queryKmerNum - splitWidth * i, diffIdxSplits.data[j]); } - targetSplitIdxs.emplace_back(j); needLastTargetBlock = false; break; } @@ -130,11 +127,9 @@ int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer, Buffer * m if (i != threads - 1) { // If it is not the last split querySplits.emplace_back(splitWidth * i, splitWidth * (i + 1) - 1, splitWidth, diffIdxSplits.data[numOfDiffIdxSplits_use - 2]); - targetSplitIdxs.emplace_back(numOfDiffIdxSplits_use - 2); } else { querySplits.emplace_back(splitWidth * i, queryKmerNum - 1, queryKmerNum - splitWidth * i, diffIdxSplits.data[numOfDiffIdxSplits_use - 2]); - targetSplitIdxs.emplace_back(numOfDiffIdxSplits_use - 2); } } } @@ -149,7 +144,7 @@ int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer, Buffer * m while (completedSplitCnt < threads) { bool hasOverflow = false; #pragma omp parallel default(none), shared(completedSplitCnt, splitCheckList, hasOverflow, \ -querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffIdx, targetInfoFileName, targetSplitIdxs) +querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffIdx, targetInfoFileName) { // FILE FILE * diffIdxFp = fopen(targetDiffIdxFileName.c_str(), "rb"); @@ -157,7 +152,7 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI // Target K-mer buffer uint16_t * diffIdxBuffer = (uint16_t *) malloc(sizeof(uint16_t) * (BufferSize + 1)); // size = 32 Mb - TargetKmerInfo * kmerInfoBuffer = (TargetKmerInfo *) malloc(sizeof(TargetKmerInfo) * (BufferSize+1)); // 64 Mb + TargetKmerInfo * kmerInfoBuffer = (TargetKmerInfo *) malloc(sizeof(TargetKmerInfo) * (BufferSize + 1)); // 64 Mb size_t kmerInfoBufferIdx = 0; size_t diffIdxBufferIdx = 0; @@ -177,9 +172,6 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI auto *matches = new Match[localBufferSize]; // 16 * 2'000'000 = 32 Mb int matchCnt = 0; - // For debug -// SeqIterator seqIterator(par); - //vectors for selected target k-mers std::vector selectedHammingSum; std::vector selectedMatches; @@ -236,6 +228,11 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI } for (int k = 0; k < currMatchNum; k++) { idx = selectedMatches[k]; + // Check if candidateKmerInfos[idx].sequenceID is valid + if (taxId2genusId.find(candidateKmerInfos[idx].sequenceID) == taxId2genusId.end() || + taxId2speciesId.find(candidateKmerInfos[idx].sequenceID) == taxId2speciesId.end()) { + cout << "Error: " << candidateKmerInfos[idx].sequenceID << " is not found in the taxonomy database." 
<< endl; + } matches[matchCnt] = {queryKmerList[j].info, candidateKmerInfos[idx].sequenceID, taxId2genusId[candidateKmerInfos[idx].sequenceID], @@ -273,6 +270,11 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI } for (int k = 0; k < currMatchNum; k++) { idx = selectedMatches[k]; + // Check if candidateKmerInfos[idx].sequenceID is valid + if (taxId2genusId.find(candidateKmerInfos[idx].sequenceID) == taxId2genusId.end() || + taxId2speciesId.find(candidateKmerInfos[idx].sequenceID) == taxId2speciesId.end()) { + cout << "Error: " << candidateKmerInfos[idx].sequenceID << " is not found in the taxonomy database." << endl; + } matches[matchCnt] = {queryKmerList[j].info, candidateKmerInfos[idx].sequenceID, taxId2genusId[candidateKmerInfos[idx].sequenceID], @@ -365,6 +367,11 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI for (int k = 0; k < currMatchNum; k++) { idx = selectedMatches[k]; + // Check if candidateKmerInfos[idx].sequenceID is valid + if (taxId2genusId.find(candidateKmerInfos[idx].sequenceID) == taxId2genusId.end() || + taxId2speciesId.find(candidateKmerInfos[idx].sequenceID) == taxId2speciesId.end()) { + cout << "Error: " << candidateKmerInfos[idx].sequenceID << " is not found in the taxonomy database." << endl; + } matches[matchCnt] = {queryKmerList[j].info, candidateKmerInfos[idx].sequenceID, taxId2genusId[candidateKmerInfos[idx].sequenceID], @@ -404,7 +411,6 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI } } // end of while(completeSplitCnt < threadNum) std::cout << "Time spent for the comparison: " << double(time(nullptr) - beforeSearch) << std::endl; - munmap(diffIdxSplits.data, diffIdxSplits.fileSize + 1); free(splitCheckList); queryKmerNum = 0; @@ -416,8 +422,9 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI time_t beforeSortMatches = time(nullptr); totalMatchCnt += matchBuffer->startIndexOfReserve; std::cout << "Sorting matches ..." 
<< std::endl; - SORT_PARALLEL(matchBuffer->buffer, matchBuffer->buffer + matchBuffer->startIndexOfReserve, - sortMatch()); + SORT_PARALLEL(matchBuffer->buffer, + matchBuffer->buffer + matchBuffer->startIndexOfReserve, + compareMatches); std::cout << "Time spent for sorting matches: " << double(time(nullptr) - beforeSortMatches) << std::endl; return 1; @@ -463,4 +470,23 @@ void KmerMatcher::compareDna(uint64_t query, } } delete[] hammingSums; +} + +bool KmerMatcher::compareMatches(const Match& a, const Match& b) { + if (a.qInfo.sequenceID != b.qInfo.sequenceID) + return a.qInfo.sequenceID < b.qInfo.sequenceID; + + if (a.genusId != b.genusId) + return a.genusId < b.genusId; + + if (a.speciesId != b.speciesId) + return a.speciesId < b.speciesId; + + if (a.qInfo.frame != b.qInfo.frame) + return a.qInfo.frame < b.qInfo.frame; + + if (a.qInfo.pos != b.qInfo.pos) + return a.qInfo.pos < b.qInfo.pos; + + return a.hamming < b.hamming; } \ No newline at end of file diff --git a/src/commons/KmerMatcher.h b/src/commons/KmerMatcher.h index 183f916f..c4b98643 100644 --- a/src/commons/KmerMatcher.h +++ b/src/commons/KmerMatcher.h @@ -48,7 +48,6 @@ class KmerMatcher { struct QueryKmerSplit { QueryKmerSplit(size_t start, size_t end, size_t length, const DiffIdxSplit& diffIdxSplit) : start(start), end(end), length(length), diffIdxSplit(diffIdxSplit) {} - size_t start; // start idx in query k-mer list size_t end; // end idx in query k-mer list size_t length; @@ -98,6 +97,8 @@ class KmerMatcher { virtual uint16_t getHammings_reverse(uint64_t kmer1, uint64_t kmer2); + static bool compareMatches(const Match& a, const Match& b); + public: KmerMatcher(const LocalParameters & par, NcbiTaxonomy * taxonomy); @@ -133,7 +134,7 @@ inline TargetKmerInfo KmerMatcher::getKmerInfo(size_t bufferSize, FILE * kmerInfoFp, TargetKmerInfo * infoBuffer, - size_t & infoBufferIdx){ + size_t & infoBufferIdx){ if (unlikely(infoBufferIdx >= bufferSize)) { loadBuffer(kmerInfoFp, infoBuffer, infoBufferIdx, bufferSize, (int) (infoBufferIdx - bufferSize)); } @@ -173,25 +174,25 @@ inline uint16_t KmerMatcher::getHammings_reverse(uint64_t kmer1, uint64_t kmer2) return hammings; } -struct sortMatch { - bool operator() (const Match& a, const Match& b) const { - if (a.qInfo.sequenceID != b.qInfo.sequenceID) - return a.qInfo.sequenceID < b.qInfo.sequenceID; +// struct sortMatch { +// bool operator() (const Match& a, const Match& b) const { +// if (a.qInfo.sequenceID != b.qInfo.sequenceID) +// return a.qInfo.sequenceID < b.qInfo.sequenceID; - if (a.genusId != b.genusId) - return a.genusId < b.genusId; +// if (a.genusId != b.genusId) +// return a.genusId < b.genusId; - if (a.speciesId != b.speciesId) - return a.speciesId < b.speciesId; +// if (a.speciesId != b.speciesId) +// return a.speciesId < b.speciesId; - if (a.qInfo.frame != b.qInfo.frame) - return a.qInfo.frame < b.qInfo.frame; +// if (a.qInfo.frame != b.qInfo.frame) +// return a.qInfo.frame < b.qInfo.frame; - if (a.qInfo.pos != b.qInfo.pos) - return a.qInfo.pos < b.qInfo.pos; +// if (a.qInfo.pos != b.qInfo.pos) +// return a.qInfo.pos < b.qInfo.pos; - return a.hamming < b.hamming; - } -}; +// return a.hamming < b.hamming; +// } +// }; #endif //METABULI_KMERMATCHER_H diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index 4169ab25..ebc4ab2f 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -1,7 +1,7 @@ #ifndef ADCLASSIFIER2_LOCALPARAMETERS_H #define ADCLASSIFIER2_LOCALPARAMETERS_H -#include +#include "Parameters.h" const int 
CITATION_SPACEPHARER = CITATION_END; diff --git a/src/commons/QueryIndexer.cpp b/src/commons/QueryIndexer.cpp index ad0f7f86..07a7251f 100644 --- a/src/commons/QueryIndexer.cpp +++ b/src/commons/QueryIndexer.cpp @@ -29,56 +29,77 @@ void QueryIndexer::setAvailableRam() { void QueryIndexer::indexQueryFile() { // Read 1 - KSeqWrapper* kseq; - kseq = KSeqFactory(queryPath_1.c_str()); - size_t kmerCnt = 0; - size_t seqCnt = 0; - size_t start = 0; - while (kseq->ReadEntry()) { - readNum_1++; - const KSeqWrapper::KSeqEntry &e = kseq->entry; - totalReadLength += e.sequence.l; - size_t currentKmerCnt = LocalUtil::getQueryKmerNumber(e.sequence.l, spaceNum); - kmerCnt += currentKmerCnt; - seqCnt++; - if (bytesPerKmer * kmerCnt + ((size_t) 200 * seqCnt) > availableRam) { - querySplits.emplace_back(start, readNum_1, kmerCnt - currentKmerCnt); - kmerCnt = currentKmerCnt; - start = readNum_1; - seqCnt = 1; - } - } - querySplits.emplace_back(start, readNum_1, kmerCnt); - delete kseq; - - // Read 2 - if (seqMode == 2) { - kseq = KSeqFactory(queryPath_2.c_str()); - kmerCnt = 0; - seqCnt = 0; - start = 0; + if (seqMode == 1 || seqMode == 3) { + KSeqWrapper* kseq; + kseq = KSeqFactory(queryPath_1.c_str()); + size_t kmerCnt = 0; + size_t seqCnt = 0; + size_t start = 0; while (kseq->ReadEntry()) { - readNum_2++; + readNum_1++; const KSeqWrapper::KSeqEntry &e = kseq->entry; totalReadLength += e.sequence.l; size_t currentKmerCnt = LocalUtil::getQueryKmerNumber(e.sequence.l, spaceNum); kmerCnt += currentKmerCnt; seqCnt++; if (bytesPerKmer * kmerCnt + ((size_t) 200 * seqCnt) > availableRam) { - querySplits.emplace_back(start, readNum_2, kmerCnt - currentKmerCnt); + querySplits.emplace_back(start, readNum_1, kmerCnt - currentKmerCnt); kmerCnt = currentKmerCnt; - start = readNum_2; + start = readNum_1; seqCnt = 1; } } - querySplits.emplace_back(start, readNum_2, kmerCnt); + querySplits.emplace_back(start, readNum_1, kmerCnt); delete kseq; + } else { + KSeqWrapper* kseq_1 = KSeqFactory(queryPath_1.c_str()); + KSeqWrapper* kseq_2 = KSeqFactory(queryPath_2.c_str()); + size_t kmerCnt = 0; + size_t seqCnt_1 = 0; + size_t seqCnt_2 = 0; + size_t start = 0; + size_t currentKmerCnt; + bool end = false; + while(true) { + if (kseq_1->ReadEntry()) { + readNum_1++; + seqCnt_1++; + totalReadLength += kseq_1->entry.sequence.l; + currentKmerCnt = LocalUtil::getQueryKmerNumber(kseq_1->entry.sequence.l, spaceNum); + kmerCnt += currentKmerCnt; + } else { + end = true; + } + + if (kseq_2->ReadEntry()) { + readNum_2++; + seqCnt_2++; + totalReadLength += kseq_2->entry.sequence.l; + currentKmerCnt += LocalUtil::getQueryKmerNumber(kseq_2->entry.sequence.l, spaceNum); + kmerCnt += currentKmerCnt; + } else { + end = true; + } + + if (seqCnt_1 != seqCnt_2) { + Debug(Debug::ERROR) << "The number of reads in the two files are not equal." << "\n"; + EXIT(EXIT_FAILURE); + } - // Check if the number of reads in the two files are equal - if (readNum_1 != readNum_2) { - Debug(Debug::ERROR) << "The number of reads in the two files are not equal." 
<< "\n"; - EXIT(EXIT_FAILURE); + if (bytesPerKmer * kmerCnt + ((size_t) 200 * seqCnt_1) > availableRam) { + querySplits.emplace_back(start, seqCnt_1, kmerCnt - currentKmerCnt); + kmerCnt = currentKmerCnt; + start = seqCnt_1; + seqCnt_1 = 1; + } + + if (end) { + querySplits.emplace_back(start, seqCnt_1, kmerCnt); + break; + } } + delete kseq_1; + delete kseq_2; } } diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index c41ddea9..85ef5339 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -7,7 +7,7 @@ Taxonomer::Taxonomer(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxon for(size_t i = 0, j = 0; i < par.spaceMask.length(); i++){ mask[i] = par.spaceMask[i] - 48; spaceNum += (mask[i] == 0); - if(par.spaceMask[i]==1){ + if(mask[i] == 1){ unmaskedPos[j] = (int) i; j++; } From 7ee4d6700a95f4fbf40f05764b1d5156b87a1a45 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Mon, 21 Aug 2023 16:38:05 +0900 Subject: [PATCH 16/65] first implementation of a filtering module --- src/commons/Classifier.cpp | 5 -- src/commons/KmerMatcher.cpp | 2 +- src/commons/LocalParameters.cpp | 29 +++++++ src/commons/LocalParameters.h | 7 ++ src/commons/QueryFilter.cpp | 135 +++++++++++++++++++++++++++++++- src/commons/QueryFilter.h | 26 +++++- src/commons/QueryIndexer.cpp | 10 +-- src/commons/Reporter.cpp | 5 +- src/commons/Reporter.h | 10 +-- src/metabuli.cpp | 2 +- src/workflow/filter.cpp | 23 ++---- 11 files changed, 216 insertions(+), 38 deletions(-) diff --git a/src/commons/Classifier.cpp b/src/commons/Classifier.cpp index 3f157fd3..09a53dfa 100644 --- a/src/commons/Classifier.cpp +++ b/src/commons/Classifier.cpp @@ -43,11 +43,6 @@ void Classifier::startClassify(const LocalParameters &par) { cout << "Total number of sequences: " << numOfSeq << endl; cout << "Total read length: " << totalReadLength << "nt" << endl; - // Print queryReadSplit - for (size_t i = 0; i < queryReadSplit.size(); i++) { - cout << queryReadSplit[i].start << " " << queryReadSplit[i].end << " " << queryReadSplit[i].kmerCnt << endl; - } - QueryKmerBuffer kmerBuffer; Buffer matchBuffer; vector queryList; diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp index 8486538a..204187d4 100644 --- a/src/commons/KmerMatcher.cpp +++ b/src/commons/KmerMatcher.cpp @@ -160,7 +160,7 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI uint64_t currentQuery = UINT64_MAX; uint64_t currentQueryAA = UINT64_MAX; QueryKmerInfo currentQueryInfo; - + //target variables size_t diffIdxPos = 0; std::vector candidateTargetKmers; //vector for candidate target k-mer, some of which are selected after based on hamming distance diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index 47b2b689..b6d21b7b 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -223,6 +223,13 @@ LocalParameters::LocalParameters() : typeid(std::string), (void *) &printColumns, "^.*$") + PRINT_MODE(PRINT_MODE_ID, + "--print-mode", + "[1] Only filtered reads [2] Both filtered and removed reads", + "[1] Only filtered reads [2] Both filtered and removed reads", + typeid(int), + (void *) &printMode, + "[1-2]") { //add_to_library @@ -258,6 +265,28 @@ LocalParameters::LocalParameters() : classify.push_back(&PARAM_MASK_PROBABILTY); classify.push_back(&MATCH_PER_KMER); + // filter + filter.push_back(&PARAM_THREADS); + filter.push_back(&SEQ_MODE); + filter.push_back(&VIRUS_TAX_ID); + filter.push_back(&REDUCED_AA); + filter.push_back(&MIN_SCORE); + 
filter.push_back(&MIN_COVERAGE); + filter.push_back(&SPACED); + filter.push_back(&HAMMING_MARGIN); + filter.push_back(&MIN_SP_SCORE); + filter.push_back(&PARAM_V); + filter.push_back(&RAM_USAGE); + filter.push_back(&MIN_COVERED_POS); + filter.push_back(&PRINT_LOG); + filter.push_back(&MAX_GAP); + filter.push_back(&TAXONOMY_PATH); + filter.push_back(&MIN_CONS_CNT); + filter.push_back(&MIN_CONS_CNT_EUK); + filter.push_back(&PARAM_MASK_RESIDUES); + filter.push_back(&PARAM_MASK_PROBABILTY); + filter.push_back(&MATCH_PER_KMER); + filter.push_back(&PRINT_MODE); //updateTargetDB exclusiontest_hiv.push_back(&TEST_RANK); diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index ebc4ab2f..1a173deb 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -21,6 +21,7 @@ class LocalParameters : public Parameters { } std::vector classify; + std::vector filter; std::vector exclusiontest_hiv; std::vector seqHeader2TaxId; std::vector grade; @@ -71,6 +72,9 @@ class LocalParameters : public Parameters { PARAMETER(COVERAGE_COL) PARAMETER(PRINT_COLUMNS) + // Filter + PARAMETER(PRINT_MODE) + // Superkingdom taxonomy id int virusTaxId; int bacteriaTaxId; @@ -113,6 +117,9 @@ class LocalParameters : public Parameters { // Add to library bool assembly; + // Filter + int printMode; + private: LocalParameters(); diff --git a/src/commons/QueryFilter.cpp b/src/commons/QueryFilter.cpp index 863baea3..57f97fd0 100644 --- a/src/commons/QueryFilter.cpp +++ b/src/commons/QueryFilter.cpp @@ -1,13 +1,58 @@ #include "QueryFilter.h" QueryFilter::QueryFilter(LocalParameters & par) { + // Load parameters + dbDir = par.filenames[1 + (par.seqMode == 2)]; + matchPerKmer = par.matchPerKmer; + printMode = par.printMode; + + // Taxonomy + if (par.taxonomyPath == "DBDIR/taxonomy/") par.taxonomyPath = dbDir + "/taxonomy/"; + taxonomy = new NcbiTaxonomy(par.taxonomyPath + "/names.dmp", + par.taxonomyPath + "/nodes.dmp", + par.taxonomyPath + "/merged.dmp"); + + // Agents queryIndexer = new QueryIndexer(par); + kmerExtractor = new KmerExtractor(par); + if (par.reducedAA) { kmerMatcher = new ReducedKmerMatcher(par, taxonomy);} + else { kmerMatcher = new KmerMatcher(par, taxonomy);} + taxonomer = new Taxonomer(par, taxonomy); + reporter = new Reporter(par, taxonomy); setInputAndOutputFiles(par); + filter_kseq1 = KSeqFactory(in1.c_str()); + if (par.seqMode == 2) { filter_kseq2 = KSeqFactory(in2.c_str()); } + + isFiltered = new bool[queryIndexer->getReadNum_1()]; + memset(isFiltered, 0, sizeof(bool) * queryIndexer->getReadNum_1()); + readCounter = 0; + + // Open output files + f1_fp = fopen(f1.c_str(), "w"); + if (par.seqMode == 2) { f2_fp = fopen(f2.c_str(), "w"); } + if (printMode == 2) { + rm1_fp = fopen(rm1.c_str(), "w"); + if (par.seqMode == 2) { rm2_fp = fopen(rm2.c_str(), "w"); } + } } QueryFilter::~QueryFilter() { + delete taxonomy; delete queryIndexer; + delete kmerExtractor; + delete kmerMatcher; + delete taxonomer; + delete reporter; + delete filter_kseq1; + delete filter_kseq2; + delete[] isFiltered; + fclose(f1_fp); + if (par.seqMode == 2) { fclose(f2_fp); } + if (printMode == 2) { + fclose(rm1_fp); + if (par.seqMode == 2) { fclose(rm2_fp); } + } } void QueryFilter::setInputAndOutputFiles(const LocalParameters & par) { @@ -16,13 +61,37 @@ void QueryFilter::setInputAndOutputFiles(const LocalParameters & par) { string baseName = LocalUtil::getQueryBaseName(in1); // Set the output file names - out1 = baseName + "_filtered.fna.gz"; - reportFileName = baseName + "_filter_report.tsv"; + f1 
= baseName + "_filtered.fna.gz"; + rm1 = baseName + "_removed.fna.gz"; // For paired-end reads if (par.seqMode == 2) { in2 = par.filenames[1]; - out2 = LocalUtil::getQueryBaseName(in2) + "_filtered.fna.gz"; + f2 = LocalUtil::getQueryBaseName(in2) + "_filtered.fna.gz"; + rm2 = LocalUtil::getQueryBaseName(in2) + "_removed.fna.gz"; + } +} + +void QueryFilter::recordFilteredReads(const vectore & queryList) { + for (query:queryList){ + isFiltered[readCounter++] = query.isClassified; + } +} + +void QueryFilter::printFilteredReads() { + for (size_t i = 0; i < readCounter; i ++) { + // Read query reads + filter_kseq1->ReadEntry(); + if (par.seqMode == 2) { filter_kseq2->ReadEntry(); } + + // Print reads + if (isFiltered[i]) { // Print filtered reads + fprintf(f1_fp, ">%s\n%s\n", filter_kseq1->entry.name.s, filter_kseq1->entry.sequence.s); + if (par.seqMode == 2) { fprintf(f2_fp, ">%s\n%s\n", filter_kseq2->entry.name.s, filter_kseq2->entry.sequence.s); } + } else if (printMode == 2) { // Print removed reads + fprintf(rm1_fp, ">%s\n%s\n", filter_kseq1->entry.name.s, filter_kseq1->entry.sequence.s); + if (par.seqMode == 2) { fprintf(rm2_fp, ">%s\n%s\n", filter_kseq2->entry.name.s, filter_kseq2->entry.sequence.s); } + } } } @@ -41,6 +110,66 @@ void QueryFilter::filterReads(LocalParameters & par) { Buffer matchBuffer; vector queryList; + size_t numOfTatalQueryKmerCnt = 0; + size_t totalMatchCnt = 0; + size_t processedSeqCnt = 0; + reporter->openReadClassificationFile(); + +#ifdef OPENMP + omp_set_num_threads(par.threads); +#endif + + KSeqWrapper* kseq1 = KSeqFactory(in1.c_str()); + KSeqWrapper* kseq2 = nullptr; + if (par.seqMode == 2) { kseq2 = KSeqFactory(in2.c_str()); } + + for (size_t splitIdx = 0; splitIdx < queryReadSplit.size(); splitIdx++) { + // Allocate memory for query list + queryList.clear(); + queryList.resize(queryReadSplit[splitIdx].end - queryReadSplit[splitIdx].start); + + // Allocate memory for query k-mer list and match list + kmerBuffer.reallocateMemory(queryReadSplit[splitIdx].kmerCnt); + if (queryReadSplit.size() == 1) { + size_t remain = queryIndexer->getAvailableRam() - queryReadSplit[splitIdx].kmerCnt * sizeof(QueryKmer) - numOfSeq * 200; + matchBuffer.reallocateMemory(remain / sizeof(Match)); + } else { + matchBuffer.reallocateMemory(queryReadSplit[splitIdx].kmerCnt * matchPerKmer); + } + // Initialize query k-mer buffer and match buffer + kmerBuffer.startIndexOfReserve = 0; + matchBuffer.startIndexOfReserve = 0; + + // Extract query k-mer + kmerExtractor->extractQueryKmers(kmerBuffer, + queryList, + queryReadSplit[splitIdx], + par, + kseq1, + kseq2); + numOfTatalQueryKmerCnt += kmerBuffer.startIndexOfReserve; + + // Search matches between query and target k-mers + kmerMatcher->matchKmers(&kmerBuffer, &matchBuffer); + + // Classify queries based on the matches + taxonomer->assignTaxonomy(matchBuffer.buffer, matchBuffer.startIndexOfReserve, queryList, par); + processedSeqCnt += queryReadSplit[splitIdx].end - queryReadSplit[splitIdx].start; + cout << "The number of processed sequences: " << processedSeqCnt << " (" << (double) processedSeqCnt / (double) numOfSeq << ")" << endl; + + // Write classification results + reporter->writeReadClassification(queryList, true); + + recordFilteredReads(queryList); + } + printFilteredReads(); + reporter->writeReportFile(numOfSeq, taxonomer->getTaxCounts()); + reporter->closeReadClassificationFile(); + + // Memory deallocation + free(matchBuffer.buffer); + delete kseq1; + delete kseq2; } diff --git a/src/commons/QueryFilter.h 
b/src/commons/QueryFilter.h index 33fa7de1..962a3c02 100644 --- a/src/commons/QueryFilter.h +++ b/src/commons/QueryFilter.h @@ -4,15 +4,39 @@ #include "LocalUtil.h" #include "QueryIndexer.h" #include "ReducedKmerMatcher.h" +#include "KmerExtractor.h" +#include "Taxonomer.h" +#include "Reporter.h" + class QueryFilter { private: + // Parameters + std::string dbDir; + size_t matchPerKmer; + int printMode; + + // Agents QueryIndexer * queryIndexer; + KmerExtractor * kmerExtractor; KmerMatcher * kmerMatcher; + Taxonomer * taxonomer; + Reporter * reporter; - std::string in1, in2, out1, out2, reportFileName; // input and output file names + // Kseq + KSeqWrapper* filter_kseq1; + KSeqWrapper* filter_kseq2; + + std::string in1, in2, f1, f2, rm1, rm2; // input and output file names + bool * isFiltered; + size_t readCounter; + FILE * f1_fp, * f2_fp, * rm1_fp, * rm2_fp; void setInputAndOutputFiles(const LocalParameters & par); + void recordFilteredReads(const vector & queryList); + + void printFilteredReads(); + public: void filterReads(LocalParameters & par); explicit QueryFilter(LocalParameters & par); diff --git a/src/commons/QueryIndexer.cpp b/src/commons/QueryIndexer.cpp index 07a7251f..0651b479 100644 --- a/src/commons/QueryIndexer.cpp +++ b/src/commons/QueryIndexer.cpp @@ -55,7 +55,7 @@ void QueryIndexer::indexQueryFile() { KSeqWrapper* kseq_1 = KSeqFactory(queryPath_1.c_str()); KSeqWrapper* kseq_2 = KSeqFactory(queryPath_2.c_str()); size_t kmerCnt = 0; - size_t seqCnt_1 = 0; + size_t seqCnt_1 = 0; size_t seqCnt_2 = 0; size_t start = 0; size_t currentKmerCnt; @@ -81,20 +81,20 @@ void QueryIndexer::indexQueryFile() { end = true; } - if (seqCnt_1 != seqCnt_2) { + if (readNum_1 != readNum_2) { Debug(Debug::ERROR) << "The number of reads in the two files are not equal." 
<< "\n"; EXIT(EXIT_FAILURE); } if (bytesPerKmer * kmerCnt + ((size_t) 200 * seqCnt_1) > availableRam) { - querySplits.emplace_back(start, seqCnt_1, kmerCnt - currentKmerCnt); + querySplits.emplace_back(start, readNum_1, kmerCnt - currentKmerCnt); kmerCnt = currentKmerCnt; - start = seqCnt_1; + start = readNum_1; seqCnt_1 = 1; } if (end) { - querySplits.emplace_back(start, seqCnt_1, kmerCnt); + querySplits.emplace_back(start, readNum_1, kmerCnt); break; } } diff --git a/src/commons/Reporter.cpp b/src/commons/Reporter.cpp index 288aecdb..566e8c66 100644 --- a/src/commons/Reporter.cpp +++ b/src/commons/Reporter.cpp @@ -15,8 +15,11 @@ void Reporter::openReadClassificationFile() { readClassificationFile.open(outDir + "/" + jobId + "_classifications.tsv"); } -void Reporter::writeReadClassification(const vector & queryList) { +void Reporter::writeReadClassification(const vector & queryList, bool classifiedOnly) { for (size_t i = 0; i < queryList.size(); i++) { + if (classifiedOnly && !queryList[i].isClassified) { + continue; + } readClassificationFile << queryList[i].isClassified << "\t" << queryList[i].name << "\t" << queryList[i].classification << "\t" << queryList[i].queryLength + queryList[i].queryLength2 << "\t" diff --git a/src/commons/Reporter.h b/src/commons/Reporter.h index 4de0f32c..d64e567c 100644 --- a/src/commons/Reporter.h +++ b/src/commons/Reporter.h @@ -23,16 +23,16 @@ class Reporter { public: Reporter(const LocalParameters &par, NcbiTaxonomy *taxonomy); // Write report + void writeReportFile(int numOfQuery, unordered_map &taxCnt); + void writeReport(FILE *FP, const std::unordered_map &cladeCounts, + unsigned long totalReads, TaxID taxID = 0, int depth = 0); // Read by read classification results void openReadClassificationFile(); - void writeReadClassification(const vector & queryList); + void writeReadClassification(const vector & queryList, bool classifiedOnly = false); void closeReadClassificationFile(); - void writeReportFile(int numOfQuery, unordered_map &taxCnt); - - void writeReport(FILE *FP, const std::unordered_map &cladeCounts, - unsigned long totalReads, TaxID taxID = 0, int depth = 0); + unsigned int cladeCountVal(const std::unordered_map &map, TaxID key); diff --git a/src/metabuli.cpp b/src/metabuli.cpp index 8918c64e..60f09cf9 100644 --- a/src/metabuli.cpp +++ b/src/metabuli.cpp @@ -67,7 +67,7 @@ std::vector commands = { {"DB dir", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}, {"out dir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}, {"job ID", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}, - {"filter", classify, &localPar.classify, COMMAND_TAXONOMY, + {"filter", classify, &localPar.filter, COMMAND_TAXONOMY, "Filtering reads based on the classification result", nullptr, "Jaebeom Kim ", diff --git a/src/workflow/filter.cpp b/src/workflow/filter.cpp index fd02afcf..40bfc797 100644 --- a/src/workflow/filter.cpp +++ b/src/workflow/filter.cpp @@ -1,7 +1,4 @@ -#include "Classifier.h" -#include "Parameters.h" #include "LocalParameters.h" -#include "FileUtil.h" #include "QueryFilter.h" void setFilterDefaults(LocalParameters & par){ @@ -24,6 +21,7 @@ void setFilterDefaults(LocalParameters & par){ par.maskMode = 0; par.maskProb = 0.9; par.matchPerKmer = 4; + par.printMode = 1; } int filter(int argc, const char **argv, const Command& command) @@ -32,22 +30,15 @@ int filter(int argc, const char **argv, const Command& command) setFilterDefaults(par); par.parseParameters(argc, argv, command, true, 
Parameters::PARSE_ALLOW_EMPTY, 0);
 
-    if (par.seqMode == 2) {
-        if (!FileUtil::directoryExists(par.filenames[3].c_str())) {
-            FileUtil::makeDir(par.filenames[3].c_str());
-        }
-    } else {
-        if (!FileUtil::directoryExists(par.filenames[2].c_str())) {
-            FileUtil::makeDir(par.filenames[2].c_str());
-        }
-    }
-
 #ifdef OPENMP
     omp_set_num_threads(par.threads);
 #endif
 
-    cout << "Number of threads: " << par.threads << endl;
-
-
+    QueryFilter * queryFilter = new QueryFilter(par);
+    
+    queryFilter->filterReads(par);
+    
+    delete queryFilter;
+    
     return 0;
 }
\ No newline at end of file

From 1988259083dd7602b2bda6123c6dd5b4ba4e8659 Mon Sep 17 00:00:00 2001
From: Jaebeom Kim 
Date: Tue, 22 Aug 2023 14:52:44 +0900
Subject: [PATCH 17/65] filter against multiple DBs

---
 src/commons/Classifier.cpp      |   1 +
 src/commons/IndexCreator.h      |   5 -
 src/commons/KmerMatcher.cpp     |  47 ++++--
 src/commons/KmerMatcher.h       | 283 ++++++++++++++++----------------
 src/commons/LocalParameters.cpp |  12 +-
 src/commons/LocalParameters.h   |   2 +
 src/commons/Match.h             |  10 +-
 src/commons/QueryFilter.cpp     |  27 +--
 src/commons/QueryFilter.h       |   3 +
 src/metabuli.cpp                |  31 ++--
 src/workflow/add_to_library.cpp |   1 +
 src/workflow/filter.cpp         |   1 +
 12 files changed, 223 insertions(+), 200 deletions(-)

diff --git a/src/commons/Classifier.cpp b/src/commons/Classifier.cpp
index 09a53dfa..5b95e8f6 100644
--- a/src/commons/Classifier.cpp
+++ b/src/commons/Classifier.cpp
@@ -106,6 +106,7 @@ void Classifier::startClassify(const LocalParameters &par) {
 
         // Search matches between query and target k-mers
         kmerMatcher->matchKmers(&kmerBuffer, &matchBuffer);
+        kmerMatcher->sortMatches(&matchBuffer);
 
 
 //#ifdef OPENMP
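A note on this patch: the Classifier hunk above shows the new two-step flow, in which matchKmers() only collects matches and sorting happens in a separate sortMatches() call. Combined with the per-call db argument introduced in the KmerMatcher changes below, a caller can pool matches from several contaminant databases into one buffer and sort the pooled result once, which is how QueryFilter uses the API later in this patch. A hedged sketch of that call pattern (the helper name is hypothetical, and it assumes the match buffer is a Buffer<Match> as elsewhere in this codebase):

// Usage sketch, not project code: mirrors QueryFilter::filterReads() below.
#include <string>
#include <vector>

void matchAgainstContaminants(KmerMatcher * kmerMatcher,
                              QueryKmerBuffer * kmerBuffer,
                              Buffer<Match> * matchBuffer,
                              const std::vector<std::string> & contams) {
    // Pool matches from every contaminant database into one buffer.
    for (const auto & db : contams) {
        kmerMatcher->matchKmers(kmerBuffer, matchBuffer, db);
    }
    // Sort the pooled matches once, after all databases were searched.
    kmerMatcher->sortMatches(matchBuffer);
}

diff --git a/src/commons/IndexCreator.h b/src/commons/IndexCreator.h
index 087cc3b3..8635cdae 100644
--- a/src/commons/IndexCreator.h
+++ b/src/commons/IndexCreator.h
@@ -14,10 +14,7 @@
 #include "common.h"
 #include "NcbiTaxonomy.h"
 #include "FastSort.h"
-#include "Classifier.h"
 #include "LocalParameters.h"
-
-// For masking
 #include "NucleotideMatrix.h"
 #include "SubstitutionMatrix.h"
 #include "tantan.h"
@@ -28,8 +25,6 @@
 #endif
 
 
-
-
 struct TaxId2Fasta{
     TaxID species;
     TaxID taxid;
diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp
index 204187d4..3f9a1495 100644
--- a/src/commons/KmerMatcher.cpp
+++ b/src/commons/KmerMatcher.cpp
@@ -2,17 +2,13 @@
 
 KmerMatcher::KmerMatcher(const LocalParameters & par,
                          NcbiTaxonomy * taxonomy) {
+    // Parameters
     threads = par.threads;
-    std::string dbDir = par.filenames[1 + (par.seqMode == 2)];
-    targetDiffIdxFileName = dbDir + "/diffIdx";
-    targetInfoFileName = dbDir + "/info";
-    diffIdxSplitFileName = dbDir + "/split";
-
-    diffIdxSplits = mmapData(diffIdxSplitFileName.c_str(), 3);
-
+    dbDir = par.filenames[1 + (par.seqMode == 2)];
+    hammingMargin = par.hammingMargin;
+    
     MARKER = 16777215;
     MARKER = ~ MARKER;
-    hammingMargin = par.hammingMargin;
     totalMatchCnt = 0;
 
     // Load the taxonomy ID list
@@ -53,16 +49,33 @@ KmerMatcher::KmerMatcher(const LocalParameters & par,
     fclose(taxIdFile);
 }
 
+
 KmerMatcher::~KmerMatcher() {
-    munmap(diffIdxSplits.data, diffIdxSplits.fileSize + 1);
 }
 
-int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer, Buffer * matchBuffer) {
-    size_t queryKmerNum = queryKmerBuffer->startIndexOfReserve;
-    QueryKmer *queryKmerList = queryKmerBuffer->buffer;
+int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer,
+                            Buffer * matchBuffer,
+                            const string & db){
+    // Set database files
+    string targetDiffIdxFileName;
+    string targetInfoFileName;
+    string diffIdxSplitFileName;
+    if (db.empty()) {
+        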
targetDiffIdxFileName = dbDir + "/diffIdx"; + targetInfoFileName = dbDir + "/info"; + diffIdxSplitFileName = dbDir + "/split"; + } else { + targetDiffIdxFileName = dbDir + "/" + db + "/diffIdx"; + targetInfoFileName = dbDir + "/" + db + "/info"; + diffIdxSplitFileName = dbDir + "/" + db + "/split"; + } + MmapedData diffIdxSplits = mmapData(diffIdxSplitFileName.c_str(), 3); size_t numOfDiffIdx = FileUtil::getFileSize(targetDiffIdxFileName) / sizeof(uint16_t); + size_t queryKmerNum = queryKmerBuffer->startIndexOfReserve; + QueryKmer *queryKmerList = queryKmerBuffer->buffer; + std::cout << "Comparing query and reference metamers..." << std::endl; // Find the first index of garbage query k-mer (UINT64_MAX) and discard from there @@ -418,16 +431,17 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI omp_set_num_threads(threads); #endif - // Sort matches - time_t beforeSortMatches = time(nullptr); totalMatchCnt += matchBuffer->startIndexOfReserve; + return 1; +} + +void KmerMatcher::sortMatches(Buffer * matchBuffer) { + time_t beforeSortMatches = time(nullptr); std::cout << "Sorting matches ..." << std::endl; SORT_PARALLEL(matchBuffer->buffer, matchBuffer->buffer + matchBuffer->startIndexOfReserve, compareMatches); std::cout << "Time spent for sorting matches: " << double(time(nullptr) - beforeSortMatches) << std::endl; - - return 1; } void KmerMatcher::moveMatches(Match *dest, Match *src, int &matchNum) { @@ -472,6 +486,7 @@ void KmerMatcher::compareDna(uint64_t query, delete[] hammingSums; } + bool KmerMatcher::compareMatches(const Match& a, const Match& b) { if (a.qInfo.sequenceID != b.qInfo.sequenceID) return a.qInfo.sequenceID < b.qInfo.sequenceID; diff --git a/src/commons/KmerMatcher.h b/src/commons/KmerMatcher.h index c4b98643..05b61dc7 100644 --- a/src/commons/KmerMatcher.h +++ b/src/commons/KmerMatcher.h @@ -1,17 +1,17 @@ #ifndef METABULI_KMERMATCHER_H #define METABULI_KMERMATCHER_H +#include "BitManipulateMacros.h" +#include "FileUtil.h" #include "KmerBuffer.h" -#include "Match.h" -#include "common.h" #include "LocalParameters.h" -#include -#include "FileUtil.h" +#include "Match.h" #include "Mmap.h" -#include "BitManipulateMacros.h" #include "NcbiTaxonomy.h" +#include "common.h" #include "unordered_map" +#include -#define BufferSize 16'777'216 //16 * 1024 * 1024 // 16 M +#define BufferSize 16'777'216 // 16 * 1024 * 1024 // 16 M // Input // 1. Query K-mers @@ -24,154 +24,155 @@ using namespace std; class KmerMatcher { protected: - NcbiTaxonomy * taxonomy; - size_t threads; - std::string targetDiffIdxFileName, targetInfoFileName, diffIdxSplitFileName; - MmapedData diffIdxSplits; - uint64_t MARKER; - int bitsForCodon = 3; - uint8_t hammingMargin; - size_t totalMatchCnt; - uint8_t hammingLookup[8][8] = { - {0, 1, 1, 1, 2, 1, 3, 3}, - {1, 0, 1, 1, 2, 2, 3, 2}, - {1, 1, 0, 1, 2, 2, 2, 3}, - {1, 1, 1, 0, 1, 2, 3, 3}, - {2, 2, 2, 1, 0, 1, 4, 4}, - {1, 2, 2, 2, 1, 0, 4, 4}, - {3, 3, 2, 3, 4, 4, 0, 1}, - {3, 2, 3, 3, 4, 4, 1, 0}}; - unordered_map taxId2speciesId; - unordered_map taxId2genusId; - - - struct QueryKmerSplit { - QueryKmerSplit(size_t start, size_t end, size_t length, const DiffIdxSplit& diffIdxSplit) - : start(start), end(end), length(length), diffIdxSplit(diffIdxSplit) {} - size_t start; // start idx in query k-mer list - size_t end; // end idx in query k-mer list - size_t length; - DiffIdxSplit diffIdxSplit; // index in target k-mer list from where the search begins. 
- }; - - size_t AminoAcidPart(size_t kmer) const { return (kmer) & MARKER; } - - template - static void loadBuffer(FILE * fp, T * buffer, size_t & bufferIdx, size_t size){ - fread(buffer, sizeof(T), size, fp); - bufferIdx = 0; - } - - template - static void loadBuffer(FILE * fp, T * buffer, size_t & bufferIdx, size_t size, int cnt){ - fseek(fp, cnt * sizeof(T), SEEK_CUR); - fread(buffer, sizeof(T), size, fp); - bufferIdx = 0; - } - - static uint64_t getNextTargetKmer(uint64_t lookingTarget, - const uint16_t * diffIdxBuffer, - size_t & diffBufferIdx, - size_t & totalPos); - - - static TargetKmerInfo getKmerInfo(size_t bufferSize, - FILE *kmerInfoFp, - TargetKmerInfo *infoBuffer, - size_t &infoBufferIdx); - - void moveMatches(Match *dest, - Match *src, - int &matchNum); - - void compareDna(uint64_t query, - std::vector &targetKmersToCompare, - std::vector &selectedMatches, - std::vector &selectedHammingSum, - std::vector &rightEndHammings, - uint8_t frame); - - virtual uint8_t getHammingDistanceSum(uint64_t kmer1, uint64_t kmer2); - - virtual uint16_t getHammings(uint64_t kmer1, uint64_t kmer2); - - virtual uint16_t getHammings_reverse(uint64_t kmer1, uint64_t kmer2); - - static bool compareMatches(const Match& a, const Match& b); + NcbiTaxonomy *taxonomy; + size_t threads; + std::string dbDir; + // string targetDiffIdxFileName, targetInfoFileName, diffIdxSplitFileName; + // MmapedData diffIdxSplits; + uint64_t MARKER; + int bitsForCodon = 3; + uint8_t hammingMargin; + size_t totalMatchCnt; + uint8_t hammingLookup[8][8] = { + {0, 1, 1, 1, 2, 1, 3, 3}, {1, 0, 1, 1, 2, 2, 3, 2}, + {1, 1, 0, 1, 2, 2, 2, 3}, {1, 1, 1, 0, 1, 2, 3, 3}, + {2, 2, 2, 1, 0, 1, 4, 4}, {1, 2, 2, 2, 1, 0, 4, 4}, + {3, 3, 2, 3, 4, 4, 0, 1}, {3, 2, 3, 3, 4, 4, 1, 0}}; + unordered_map taxId2speciesId; + unordered_map taxId2genusId; + + struct QueryKmerSplit { + QueryKmerSplit(size_t start, size_t end, size_t length, + const DiffIdxSplit &diffIdxSplit) + : start(start), end(end), length(length), diffIdxSplit(diffIdxSplit) {} + size_t start; // start idx in query k-mer list + size_t end; // end idx in query k-mer list + size_t length; + DiffIdxSplit diffIdxSplit; // index in target k-mer list from where the + // search begins. 
+ }; + + size_t AminoAcidPart(size_t kmer) const { return (kmer)&MARKER; } + + template + static void loadBuffer(FILE *fp, T *buffer, size_t &bufferIdx, size_t size) { + fread(buffer, sizeof(T), size, fp); + bufferIdx = 0; + } + + template + static void loadBuffer(FILE *fp, T *buffer, size_t &bufferIdx, size_t size, + int cnt) { + fseek(fp, cnt * sizeof(T), SEEK_CUR); + fread(buffer, sizeof(T), size, fp); + bufferIdx = 0; + } + + static uint64_t getNextTargetKmer(uint64_t lookingTarget, + const uint16_t *diffIdxBuffer, + size_t &diffBufferIdx, size_t &totalPos); + + static TargetKmerInfo getKmerInfo(size_t bufferSize, FILE *kmerInfoFp, + TargetKmerInfo *infoBuffer, + size_t &infoBufferIdx); + + void moveMatches(Match *dest, Match *src, int &matchNum); + + void compareDna(uint64_t query, std::vector &targetKmersToCompare, + std::vector &selectedMatches, + std::vector &selectedHammingSum, + std::vector &rightEndHammings, uint8_t frame); + + virtual uint8_t getHammingDistanceSum(uint64_t kmer1, uint64_t kmer2); + + virtual uint16_t getHammings(uint64_t kmer1, uint64_t kmer2); + + virtual uint16_t getHammings_reverse(uint64_t kmer1, uint64_t kmer2); + + static bool compareMatches(const Match &a, const Match &b); public: - KmerMatcher(const LocalParameters & par, - NcbiTaxonomy * taxonomy); - - virtual ~KmerMatcher(); - - int matchKmers(QueryKmerBuffer * queryKmerBuffer, Buffer * matchBuffer); - + KmerMatcher(const LocalParameters &par, NcbiTaxonomy *taxonomy); + + virtual ~KmerMatcher(); + + int matchKmers(QueryKmerBuffer *queryKmerBuffer, Buffer *matchBuffer, + const string &db = string()); + + void sortMatches(Buffer *matchBuffer); }; -inline -uint64_t KmerMatcher::getNextTargetKmer(uint64_t lookingTarget, - const uint16_t *diffIdxBuffer, - size_t &diffBufferIdx, - size_t &totalPos) { - uint16_t fragment; - uint16_t check = 32768; // 2^15 - uint64_t diffIn64bit = 0; +inline uint64_t KmerMatcher::getNextTargetKmer(uint64_t lookingTarget, + const uint16_t *diffIdxBuffer, + size_t &diffBufferIdx, + size_t &totalPos) { + uint16_t fragment; + uint16_t check = 32768; // 2^15 + uint64_t diffIn64bit = 0; + fragment = diffIdxBuffer[diffBufferIdx++]; + totalPos++; + while (!(fragment & check)) { // 27 % + diffIn64bit |= fragment; + diffIn64bit <<= 15u; fragment = diffIdxBuffer[diffBufferIdx++]; totalPos++; - while (!(fragment & check)) { // 27 % - diffIn64bit |= fragment; - diffIn64bit <<= 15u; - fragment = diffIdxBuffer[diffBufferIdx++]; - totalPos++; - } - fragment &= ~check; // not; 8.47 % - diffIn64bit |= fragment; // or : 23.6% - return diffIn64bit + lookingTarget; + } + fragment &= ~check; // not; 8.47 % + diffIn64bit |= fragment; // or : 23.6% + return diffIn64bit + lookingTarget; } -inline -TargetKmerInfo KmerMatcher::getKmerInfo(size_t bufferSize, - FILE * kmerInfoFp, - TargetKmerInfo * infoBuffer, - size_t & infoBufferIdx){ - if (unlikely(infoBufferIdx >= bufferSize)) { - loadBuffer(kmerInfoFp, infoBuffer, infoBufferIdx, bufferSize, (int) (infoBufferIdx - bufferSize)); - } - return infoBuffer[infoBufferIdx]; +inline TargetKmerInfo KmerMatcher::getKmerInfo(size_t bufferSize, + FILE *kmerInfoFp, + TargetKmerInfo *infoBuffer, + size_t &infoBufferIdx) { + if (unlikely(infoBufferIdx >= bufferSize)) { + loadBuffer(kmerInfoFp, infoBuffer, infoBufferIdx, bufferSize, + (int)(infoBufferIdx - bufferSize)); + } + return infoBuffer[infoBufferIdx]; } -inline uint8_t KmerMatcher::getHammingDistanceSum(uint64_t kmer1, uint64_t kmer2) {//12345678 - uint8_t hammingSum = 0; - hammingSum += 
hammingLookup[GET_3_BITS(kmer1)][GET_3_BITS(kmer2)]; - hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 3U)][GET_3_BITS(kmer2 >> 3U)]; - hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 6U)][GET_3_BITS(kmer2 >> 6U)]; - hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 9U)][GET_3_BITS(kmer2 >> 9U)]; - hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 12U)][GET_3_BITS(kmer2 >> 12U)]; - hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 15U)][GET_3_BITS(kmer2 >> 15U)]; - hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 18U)][GET_3_BITS(kmer2 >> 18U)]; - hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 21U)][GET_3_BITS(kmer2 >> 21U)]; - return hammingSum; +inline uint8_t KmerMatcher::getHammingDistanceSum(uint64_t kmer1, + uint64_t kmer2) { // 12345678 + uint8_t hammingSum = 0; + hammingSum += hammingLookup[GET_3_BITS(kmer1)][GET_3_BITS(kmer2)]; + hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 3U)][GET_3_BITS(kmer2 >> 3U)]; + hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 6U)][GET_3_BITS(kmer2 >> 6U)]; + hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 9U)][GET_3_BITS(kmer2 >> 9U)]; + hammingSum += + hammingLookup[GET_3_BITS(kmer1 >> 12U)][GET_3_BITS(kmer2 >> 12U)]; + hammingSum += + hammingLookup[GET_3_BITS(kmer1 >> 15U)][GET_3_BITS(kmer2 >> 15U)]; + hammingSum += + hammingLookup[GET_3_BITS(kmer1 >> 18U)][GET_3_BITS(kmer2 >> 18U)]; + hammingSum += + hammingLookup[GET_3_BITS(kmer1 >> 21U)][GET_3_BITS(kmer2 >> 21U)]; + return hammingSum; } -inline uint16_t KmerMatcher::getHammings(uint64_t kmer1, uint64_t kmer2) { //hammings 87654321 - uint16_t hammings = 0; - for (int i = 0; i < 8; i++) { - hammings |= hammingLookup[GET_3_BITS(kmer1)][GET_3_BITS(kmer2)] << 2U * i; - kmer1 >>= bitsForCodon; - kmer2 >>= bitsForCodon; - } - return hammings; +inline uint16_t KmerMatcher::getHammings(uint64_t kmer1, + uint64_t kmer2) { // hammings 87654321 + uint16_t hammings = 0; + for (int i = 0; i < 8; i++) { + hammings |= hammingLookup[GET_3_BITS(kmer1)][GET_3_BITS(kmer2)] << 2U * i; + kmer1 >>= bitsForCodon; + kmer2 >>= bitsForCodon; + } + return hammings; } -inline uint16_t KmerMatcher::getHammings_reverse(uint64_t kmer1, uint64_t kmer2) { //hammings 87654321 - uint16_t hammings = 0; - for (int i = 0; i < 8; i++) { - hammings |= hammingLookup[GET_3_BITS(kmer1)][GET_3_BITS(kmer2)] << 2U * (7-i); - kmer1 >>= bitsForCodon; - kmer2 >>= bitsForCodon; - } - return hammings; +inline uint16_t +KmerMatcher::getHammings_reverse(uint64_t kmer1, + uint64_t kmer2) { // hammings 87654321 + uint16_t hammings = 0; + for (int i = 0; i < 8; i++) { + hammings |= hammingLookup[GET_3_BITS(kmer1)][GET_3_BITS(kmer2)] + << 2U * (7 - i); + kmer1 >>= bitsForCodon; + kmer2 >>= bitsForCodon; + } + return hammings; } // struct sortMatch { @@ -195,4 +196,4 @@ inline uint16_t KmerMatcher::getHammings_reverse(uint64_t kmer1, uint64_t kmer2) // } // }; -#endif //METABULI_KMERMATCHER_H +#endif // METABULI_KMERMATCHER_H diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index b6d21b7b..c5985ccf 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -222,14 +222,21 @@ LocalParameters::LocalParameters() : "CSV of column numbers to be printed", typeid(std::string), (void *) &printColumns, - "^.*$") + "^.*$"), PRINT_MODE(PRINT_MODE_ID, "--print-mode", "[1] Only filtered reads [2] Both filtered and removed reads", "[1] Only filtered reads [2] Both filtered and removed reads", typeid(int), (void *) &printMode, - "[1-2]") + "[1-2]"), + CONTAM_LIST(CONTAM_LIST_ID, + "--contam-list", + "List of 
contaminants to be filtered", + "List of taxids to be filtered", + typeid(std::string), + (void *) &contamList, + "^.*$") { //add_to_library @@ -287,6 +294,7 @@ LocalParameters::LocalParameters() : filter.push_back(&PARAM_MASK_PROBABILTY); filter.push_back(&MATCH_PER_KMER); filter.push_back(&PRINT_MODE); + filter.push_back(&CONTAM_LIST); //updateTargetDB exclusiontest_hiv.push_back(&TEST_RANK); diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index 1a173deb..2a92115a 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -74,6 +74,7 @@ class LocalParameters : public Parameters { // Filter PARAMETER(PRINT_MODE) + PARAMETER(CONTAM_LIST) // Superkingdom taxonomy id int virusTaxId; @@ -119,6 +120,7 @@ class LocalParameters : public Parameters { // Filter int printMode; + std::string contamList; private: LocalParameters(); diff --git a/src/commons/Match.h b/src/commons/Match.h index da8dcdb4..436eb0bb 100644 --- a/src/commons/Match.h +++ b/src/commons/Match.h @@ -4,14 +4,6 @@ #include "Kmer.h" #include -//struct Match_qInfo { -// explicit Match_qInfo(uint32_t position = 0, uint32_t queryId = 0, uint8_t frame = 0) -// : position(position), queryId(queryId), frame(frame) {} -// uint64_t position : 32; -// uint64_t queryId : 29; -// uint64_t frame : 3; // 0-5 -//}; - struct Match { // 24 byte Match(){} Match(QueryKmerInfo qInfo, @@ -25,7 +17,7 @@ struct Match { // 24 byte rightEndHamming(eachHamming), hamming(hamming), redundancy(redundancy) { } QueryKmerInfo qInfo; // 8 - TaxID targetId; // 4 + TaxID targetId; // 4 taxonomy id infact TaxID genusId; // 4 TaxID speciesId; // 4 uint16_t rightEndHamming; // 2 diff --git a/src/commons/QueryFilter.cpp b/src/commons/QueryFilter.cpp index 57f97fd0..c3ed99b3 100644 --- a/src/commons/QueryFilter.cpp +++ b/src/commons/QueryFilter.cpp @@ -5,7 +5,9 @@ QueryFilter::QueryFilter(LocalParameters & par) { dbDir = par.filenames[1 + (par.seqMode == 2)]; matchPerKmer = par.matchPerKmer; printMode = par.printMode; - + seqMode = par.seqMode; + contams = Util::split(par.contamList, ","); + // Taxonomy if (par.taxonomyPath == "DBDIR/taxonomy/") par.taxonomyPath = dbDir + "/taxonomy/"; taxonomy = new NcbiTaxonomy(par.taxonomyPath + "/names.dmp", @@ -48,10 +50,10 @@ QueryFilter::~QueryFilter() { delete filter_kseq2; delete[] isFiltered; fclose(f1_fp); - if (par.seqMode == 2) { fclose(f2_fp); } + if (seqMode == 2) { fclose(f2_fp); } if (printMode == 2) { fclose(rm1_fp); - if (par.seqMode == 2) { fclose(rm2_fp); } + if (seqMode == 2) { fclose(rm2_fp); } } } @@ -65,15 +67,15 @@ void QueryFilter::setInputAndOutputFiles(const LocalParameters & par) { rm1 = baseName + "_removed.fna.gz"; // For paired-end reads - if (par.seqMode == 2) { + if (seqMode == 2) { in2 = par.filenames[1]; f2 = LocalUtil::getQueryBaseName(in2) + "_filtered.fna.gz"; rm2 = LocalUtil::getQueryBaseName(in2) + "_removed.fna.gz"; } } -void QueryFilter::recordFilteredReads(const vectore & queryList) { - for (query:queryList){ +void QueryFilter::recordFilteredReads(const vector & queryList) { + for (auto query : queryList) { isFiltered[readCounter++] = query.isClassified; } } @@ -82,15 +84,15 @@ void QueryFilter::printFilteredReads() { for (size_t i = 0; i < readCounter; i ++) { // Read query reads filter_kseq1->ReadEntry(); - if (par.seqMode == 2) { filter_kseq2->ReadEntry(); } + if (seqMode == 2) { filter_kseq2->ReadEntry(); } // Print reads if (isFiltered[i]) { // Print filtered reads fprintf(f1_fp, ">%s\n%s\n", filter_kseq1->entry.name.s, 
filter_kseq1->entry.sequence.s); - if (par.seqMode == 2) { fprintf(f2_fp, ">%s\n%s\n", filter_kseq2->entry.name.s, filter_kseq2->entry.sequence.s); } + if (seqMode == 2) { fprintf(f2_fp, ">%s\n%s\n", filter_kseq2->entry.name.s, filter_kseq2->entry.sequence.s); } } else if (printMode == 2) { // Print removed reads fprintf(rm1_fp, ">%s\n%s\n", filter_kseq1->entry.name.s, filter_kseq1->entry.sequence.s); - if (par.seqMode == 2) { fprintf(rm2_fp, ">%s\n%s\n", filter_kseq2->entry.name.s, filter_kseq2->entry.sequence.s); } + if (seqMode == 2) { fprintf(rm2_fp, ">%s\n%s\n", filter_kseq2->entry.name.s, filter_kseq2->entry.sequence.s); } } } } @@ -151,7 +153,10 @@ void QueryFilter::filterReads(LocalParameters & par) { numOfTatalQueryKmerCnt += kmerBuffer.startIndexOfReserve; // Search matches between query and target k-mers - kmerMatcher->matchKmers(&kmerBuffer, &matchBuffer); + for (auto db : contams) { + kmerMatcher->matchKmers(&kmerBuffer, &matchBuffer, db); + } + kmerMatcher->sortMatches(&matchBuffer); // Classify queries based on the matches taxonomer->assignTaxonomy(matchBuffer.buffer, matchBuffer.startIndexOfReserve, queryList, par); @@ -166,7 +171,7 @@ void QueryFilter::filterReads(LocalParameters & par) { printFilteredReads(); reporter->writeReportFile(numOfSeq, taxonomer->getTaxCounts()); reporter->closeReadClassificationFile(); - + // Memory deallocation free(matchBuffer.buffer); delete kseq1; diff --git a/src/commons/QueryFilter.h b/src/commons/QueryFilter.h index 962a3c02..3dccb7e6 100644 --- a/src/commons/QueryFilter.h +++ b/src/commons/QueryFilter.h @@ -14,6 +14,8 @@ class QueryFilter { std::string dbDir; size_t matchPerKmer; int printMode; + int seqMode; + std::vector contams; // Agents QueryIndexer * queryIndexer; @@ -21,6 +23,7 @@ class QueryFilter { KmerMatcher * kmerMatcher; Taxonomer * taxonomer; Reporter * reporter; + NcbiTaxonomy * taxonomy; // Kseq KSeqWrapper* filter_kseq1; diff --git a/src/metabuli.cpp b/src/metabuli.cpp index 60f09cf9..54c447b5 100644 --- a/src/metabuli.cpp +++ b/src/metabuli.cpp @@ -41,7 +41,7 @@ std::vector commands = { {{"Directory where the DB will be generated", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::empty}, {"A list of FASTA files", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}, {"Mapping file (accession to tax ID)", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}, - {"database-report", databaseReport, &localPar.databaseReport, COMMAND_DATABASE_CREATION, + {"database-report", databaseReport, &localPar.databaseReport, COMMAND_DB, "It generates a report of taxa in a database.", nullptr, "Jaebeom Kim ", @@ -57,24 +57,24 @@ std::vector commands = { {{"DB directory to be updated", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::empty}, {"A list of FASTA files", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}, {"Mapping file (accession to tax ID)", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}, - {"classify", classify, &localPar.classify, COMMAND_TAXONOMY, - "Assigning taxonomy label to query reads", + {"classify", classify, &localPar.classify, COMMAND_MAIN, + "Assigning taxonomy label to query reads", nullptr, - "Jaebeom Kim ", - " ", - CITATION_SPACEPHARER, - {{"FASTA", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfile}, - {"DB dir", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}, - {"out dir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, 
&DbValidator::directory}, - {"job ID", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}, - {"filter", classify, &localPar.filter, COMMAND_TAXONOMY, + "Jaebeom Kim ", + " ", + CITATION_SPACEPHARER, + {{"FASTA", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfile}, + {"DB dir", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}, + {"out dir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}, + {"job ID", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}, + {"filter", classify, &localPar.filter, COMMAND_MAIN, "Filtering reads based on the classification result", nullptr, "Jaebeom Kim ", " ", CITATION_SPACEPHARER, {{"READ FILE", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfile}, - {"FILTER DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}}}, + {"FILTER DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}}}, {"grade", grade, &localPar.grade, COMMAND_EXPERT, "Grade the classification result (only for benchmarking)", nullptr, @@ -93,16 +93,15 @@ std::vector commands = { {{"read-classification", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}, {"Mapping file (accession to tax ID)", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}, {"add-to-library", addToLibrary, &localPar.addToLibrary, COMMAND_DATABASE_CREATION, - "It bins sequences into distinct files according to their species referring their accession number.\n " - "It requires a mapping file (accession to tax ID) and NCBI style tax dump files in a taxonomy directory.", - nullptr, + "It bins sequences into files according to their species.", + nullptr, "Jaebeom Kim ", " ", CITATION_SPACEPHARER, {{"List of absolute paths of files to be added. One path per line.", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}, {"NCBI style accession2taxid file. 
It should be consistent to tax dump files.", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}, {"DB directory", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}}}, - {"apply-threshold", applyThreshold, &localPar.applyThreshold, COMMAND_MAIN, + {"apply-threshold", applyThreshold, &localPar.applyThreshold, COMMAND_EXPERT, "Assigning taxonomy label to query reads", nullptr, "Jaebeom Kim ", diff --git a/src/workflow/add_to_library.cpp b/src/workflow/add_to_library.cpp index 0d533eec..4f8bfacc 100644 --- a/src/workflow/add_to_library.cpp +++ b/src/workflow/add_to_library.cpp @@ -6,6 +6,7 @@ #include #include "IndexCreator.h" #include "FileUtil.h" +#include using namespace std; diff --git a/src/workflow/filter.cpp b/src/workflow/filter.cpp index 40bfc797..e4328cb6 100644 --- a/src/workflow/filter.cpp +++ b/src/workflow/filter.cpp @@ -22,6 +22,7 @@ void setFilterDefaults(LocalParameters & par){ par.maskProb = 0.9; par.matchPerKmer = 4; par.printMode = 1; + par.contamList = ""; // TODO: set default } int filter(int argc, const char **argv, const Command& command) From 6c95cd8bffc9a1f12898b7139e8d2aaf7d314a4f Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Tue, 22 Aug 2023 17:22:04 +0900 Subject: [PATCH 18/65] binary taxonomy DB writing and reading --- lib/mmseqs/src/taxonomy/NcbiTaxonomy.h | 4 + src/commons/Classifier.cpp | 12 +-- src/commons/IndexCreator.cpp | 118 ++++++------------------- src/commons/IndexCreator.h | 26 +++--- src/commons/KmerMatcher.h | 3 + src/commons/QueryFilter.cpp | 9 +- src/commons/common.cpp | 85 ++++++++++++++---- src/commons/common.h | 2 + src/workflow/build.cpp | 23 ++--- src/workflow/classify.cpp | 2 +- src/workflow/filter.cpp | 2 +- 11 files changed, 140 insertions(+), 146 deletions(-) diff --git a/lib/mmseqs/src/taxonomy/NcbiTaxonomy.h b/lib/mmseqs/src/taxonomy/NcbiTaxonomy.h index 6e69dc81..204055c9 100644 --- a/lib/mmseqs/src/taxonomy/NcbiTaxonomy.h +++ b/lib/mmseqs/src/taxonomy/NcbiTaxonomy.h @@ -132,6 +132,10 @@ class NcbiTaxonomy { TaxID getTaxIdAtRank(int taxId, const std::string & rank); void createTaxIdListAtRank(std::vector & taxIdList, std::vector & taxIdListAtRank, const std::string & rank); + void setMmapData(char* data, size_t size) { + mmapData = data; + mmapSize = size; + } private: size_t loadNodes(std::vector &tmpNodes, const std::string &nodesFile); diff --git a/src/commons/Classifier.cpp b/src/commons/Classifier.cpp index 5b95e8f6..c426f889 100644 --- a/src/commons/Classifier.cpp +++ b/src/commons/Classifier.cpp @@ -6,10 +6,11 @@ Classifier::Classifier(LocalParameters & par) { matchPerKmer = par.matchPerKmer; // Taxonomy - if (par.taxonomyPath == "DBDIR/taxonomy/") par.taxonomyPath = dbDir + "/taxonomy/"; - taxonomy = new NcbiTaxonomy(par.taxonomyPath + "/names.dmp", - par.taxonomyPath + "/nodes.dmp", - par.taxonomyPath + "/merged.dmp"); + taxonomy = loadTaxonomy(dbDir, par.taxonomyPath); + // if (par.taxonomyPath == "DBDIR/taxonomy/") par.taxonomyPath = dbDir + "/taxonomy/"; + // taxonomy = new NcbiTaxonomy(par.taxonomyPath + "/names.dmp", + // par.taxonomyPath + "/nodes.dmp", + // par.taxonomyPath + "/merged.dmp"); // Agents queryIndexer = new QueryIndexer(par); @@ -48,7 +49,6 @@ void Classifier::startClassify(const LocalParameters &par) { vector queryList; size_t numOfTatalQueryKmerCnt = 0; - size_t totalMatchCnt = 0; size_t processedSeqCnt = 0; reporter->openReadClassificationFile(); @@ -134,7 +134,7 @@ void Classifier::startClassify(const LocalParameters &par) { } cout << "Number of query k-mers: " 
diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp
index be7c0f77..834f60a1 100644
--- a/src/commons/IndexCreator.cpp
+++ b/src/commons/IndexCreator.cpp
@@ -2,22 +2,33 @@
 #include
 
-IndexCreator::IndexCreator(const LocalParameters & par)
-{
-    dbDir = par.filenames[0];
-    fnaListFileName = par.filenames[1];
-    taxonomyDir = par.filenames[0] + "/taxonomy";
+IndexCreator::IndexCreator(const LocalParameters & par) {
+    // Parameters
     threadNum = par.threads;
     bufferSize = par.bufferSize;
+
+    // Input files
+    dbDir = par.filenames[0];
+    if (par.taxonomyPath.empty()) {
+        taxonomyDir = dbDir + "/taxonomy/";
+    } else {
+        taxonomyDir = par.taxonomyPath + "/";
+    }
+    cout << "Taxonomy path: " << par.taxonomyPath << endl;
+    fnaListFileName = par.filenames[1];
+    acc2taxidFileName = par.filenames[2];
+
+    // Output files
+    taxidListFileName = dbDir + "/taxID_list";
+    taxonomyBinaryFileName = dbDir + "/taxonomyDB";
+    versionFileName = dbDir + "/db.version";
 
     // Load taxonomy
     taxonomy = new NcbiTaxonomy(taxonomyDir + "/names.dmp",
                                 taxonomyDir + "/nodes.dmp",
                                 taxonomyDir + "/merged.dmp");
 
-    // ======================================================= //
-
     if (par.reducedAA == 1){
         MARKER = 0Xffffffff;
         MARKER = ~ MARKER;
@@ -30,27 +41,6 @@
     subMat = new NucleotideMatrix(par.scoringMatrixFile.values.nucleotide().c_str(), 1.0, 0.0);
 }
 
-IndexCreator::IndexCreator(const LocalParameters &par, string dbDir, string fnaListFileName, string acc2taxidFile)
-        : dbDir(std::move(dbDir)), fnaListFileName(std::move(fnaListFileName)),
-          taxonomyDir(par.taxonomyPath), acc2taxidFileName(std::move(acc2taxidFile))
-{
-    // Load taxonomy
-    taxonomy = new NcbiTaxonomy(this->taxonomyDir + "/names.dmp",
-                                this->taxonomyDir + "/nodes.dmp",
-                                this->taxonomyDir + "/merged.dmp");
-
-    if (par.reducedAA == 1){
-        MARKER = 0Xffffffff;
-        MARKER = ~ MARKER;
-    } else {
-        MARKER = 16777215;
-        MARKER = ~ MARKER;
-    }
-    tinfo_path = par.tinfoPath;
-
-    // For masking low complexity regions
-    subMat = new NucleotideMatrix(par.scoringMatrixFile.values.nucleotide().c_str(), 1.0, 0.0);
-}
 
 IndexCreator::~IndexCreator() {
     delete taxonomy;
@@ -64,7 +54,6 @@ void IndexCreator::createIndex(const LocalParameters &par) {
     cout << "Made blocks for each thread" << endl;
 
     // Write taxonomy id list
-    string taxidListFileName = dbDir + "/taxID_list";
     FILE * taxidListFile = fopen(taxidListFileName.c_str(), "w");
     for (auto & taxid : taxIdList) {
         fprintf(taxidListFile, "%d\n", taxid);
@@ -108,6 +97,7 @@ void IndexCreator::createIndex(const LocalParameters &par) {
         delete[] uniqKmerIdx;
     }
     delete[] splitChecker;
+    writeTaxonomyDB();
 }
 
 void IndexCreator::updateIndex(const LocalParameters &par) {
@@ -117,7 +107,7 @@ void IndexCreator::updateIndex(const LocalParameters &par) {
 
     // Train Prodigal for each species
     time_t prodigalStart = time(nullptr);
-    trainProdigal();
+    // trainProdigal();
     time_t prodigalEnd = time(nullptr);
     cout << "Prodigal training time: " << prodigalEnd - prodigalStart << " seconds" << endl;
 
@@ -826,64 +816,14 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer,
     return 0;
 }
 
-
-void IndexCreator::trainProdigal() {
-    // Train prodigal for each FASTA.
-#pragma omp parallel default(none), shared(cerr, fastaList, tinfo_path)
-    {
-        ProdigalWrapper prodigal;
-        kseq_buffer_t buffer;
-        kseq_t *seq;
-        size_t lengthOfTrainingSeq;
-#pragma omp for schedule(dynamic, 1)
-        for (size_t i = 0; i < fastaList.size(); i++) {
-            FASTA &currentFasta = fastaList[i];
-            TaxID currentSpecies = currentFasta.speciesID;
-            string fileName = tinfo_path + to_string(currentSpecies) + ".tinfo";
-
-            // Skip if the training file for current species already exists.
-            if (fileExist(fileName)) {
-                cerr << "Training file for " << currentSpecies << " already exists. Skip." << endl;
-                continue;
-            }
-
-            // Load sequence for training.
-            struct MmapedData fastaFile = mmapData(currentFasta.path.c_str());
-            buffer = {const_cast(&fastaFile.data[currentFasta.sequences[currentFasta.trainingSeqIdx].start]),
-                      static_cast(currentFasta.sequences[currentFasta.trainingSeqIdx].length)};
-            seq = kseq_init(&buffer);
-            kseq_read(seq);
-
-            // Train prodigal.
-            prodigal.is_meta = 0;
-            lengthOfTrainingSeq = seq->seq.l;
-            if (lengthOfTrainingSeq < 100'000) {
-                prodigal.is_meta = 1;
-                prodigal.trainMeta(seq->seq.s);
-            } else {
-                prodigal.trainASpecies(seq->seq.s);
-            }
-
-            // Write training result into a file for later use.
-            _training *tinfo = prodigal.getTrainingInfo();
-            write_training_file(const_cast(fileName.c_str()), tinfo);
-
-            kseq_destroy(seq);
-            munmap(fastaFile.data, fastaFile.fileSize + 1);
-        }
+void IndexCreator::writeTaxonomyDB() {
+    std::pair serialized = NcbiTaxonomy::serialize(*taxonomy);
+    FILE *handle = fopen(taxonomyBinaryFileName.c_str(), "w");
+    if (handle == NULL) {
+        Debug(Debug::ERROR) << "Could not open " << taxonomyBinaryFileName << " for writing\n";
+        EXIT(EXIT_FAILURE);
     }
-// // TODO: Write species ID of newly trained species into a file.
-// // Write trained species into a file.
-// for (int i = 0; i < threadNum; i++) { -// for (auto &species : newSpeciesList[i]) { -// trainedSpecies.push_back(species); -// } -// } -// FILE *fp = fopen((tinfo_path + "/species-list.txt").c_str(), "w"); -// for (int trainedSpecie: trainedSpecies) { -// fprintf(fp, "%d\n", trainedSpecie); -// } -// fclose(fp); + fwrite(serialized.first, serialized.second, sizeof(char), handle); + fclose(handle); + free(serialized.first); } - - diff --git a/src/commons/IndexCreator.h b/src/commons/IndexCreator.h index 8635cdae..39336e9a 100644 --- a/src/commons/IndexCreator.h +++ b/src/commons/IndexCreator.h @@ -37,20 +37,24 @@ using namespace std; class IndexCreator{ private: uint64_t MARKER; - string tinfo_path; - string tinfo_list; - vector trainedSpecies; - unordered_map trainingInfo; - int threadNum; BaseMatrix *subMat; - // parameters + // Parameters + int threadNum; + size_t bufferSize; + + // Inputs NcbiTaxonomy * taxonomy; string dbDir; string fnaListFileName; string taxonomyDir; string acc2taxidFileName; - size_t bufferSize; + + + // Outputs + string taxidListFileName; + string taxonomyBinaryFileName; + string versionFileName; struct FASTA { string path; @@ -93,13 +97,14 @@ class IndexCreator{ size_t numOfFlush=0; - void trainProdigal(); - -// void writeTargetFiles(TargetKmer * kmerBuffer, size_t & kmerNum, const char * outputFileName,const vector & taxIdList); void writeTargetFiles(TargetKmer * kmerBuffer, size_t & kmerNum, const LocalParameters & par, const size_t * uniqeKmerIdx, size_t & uniqKmerCnt); void writeTargetFilesAndSplits(TargetKmer * kmerBuffer, size_t & kmerNum, const LocalParameters & par, const size_t * uniqeKmerIdx, size_t & uniqKmerCnt); + void writeDiffIdx(uint16_t *buffer, FILE* handleKmerTable, uint16_t *toWrite, size_t size, size_t & localBufIdx ); + + void writeTaxonomyDB(); + static bool compareForDiffIdx(const TargetKmer & a, const TargetKmer & b); // void maskLowComplexityRegions(char * seq, char * maskedSeq, ProbabilityMatrix & probMat, @@ -146,7 +151,6 @@ class IndexCreator{ unordered_map & foundAcc2taxid); static void getSeqSegmentsWithHead(vector & seqSegments, const char * seqFileName); IndexCreator(const LocalParameters & par); - IndexCreator(const LocalParameters & par, string dbDir, string fnaListFileName, string acc2taxidFile); IndexCreator() {taxonomy = nullptr;} ~IndexCreator(); int getNumOfFlush(); diff --git a/src/commons/KmerMatcher.h b/src/commons/KmerMatcher.h index 05b61dc7..36b5d8b3 100644 --- a/src/commons/KmerMatcher.h +++ b/src/commons/KmerMatcher.h @@ -100,6 +100,9 @@ class KmerMatcher { const string &db = string()); void sortMatches(Buffer *matchBuffer); + + // Getters + size_t getTotalMatchCnt() const { return totalMatchCnt; } }; inline uint64_t KmerMatcher::getNextTargetKmer(uint64_t lookingTarget, diff --git a/src/commons/QueryFilter.cpp b/src/commons/QueryFilter.cpp index c3ed99b3..b22bc3fe 100644 --- a/src/commons/QueryFilter.cpp +++ b/src/commons/QueryFilter.cpp @@ -9,10 +9,11 @@ QueryFilter::QueryFilter(LocalParameters & par) { contams = Util::split(par.contamList, ","); // Taxonomy - if (par.taxonomyPath == "DBDIR/taxonomy/") par.taxonomyPath = dbDir + "/taxonomy/"; - taxonomy = new NcbiTaxonomy(par.taxonomyPath + "/names.dmp", - par.taxonomyPath + "/nodes.dmp", - par.taxonomyPath + "/merged.dmp"); + taxonomy = loadTaxonomy(dbDir, par.taxonomyPath); + // if (par.taxonomyPath == "DBDIR/taxonomy/") par.taxonomyPath = dbDir + "/taxonomy/"; + // taxonomy = new NcbiTaxonomy(par.taxonomyPath + "/names.dmp", + // par.taxonomyPath + 
"/nodes.dmp", + // par.taxonomyPath + "/merged.dmp"); // Agents queryIndexer = new QueryIndexer(par); diff --git a/src/commons/common.cpp b/src/commons/common.cpp index 6fbc84f5..57b5bc79 100644 --- a/src/commons/common.cpp +++ b/src/commons/common.cpp @@ -1,25 +1,74 @@ #include "common.h" -#include +#include "FileUtil.h" +#include "NcbiTaxonomy.h" #include +#include #include +// #include "MathUtil.h" +#include "Debug.h" +#include "Util.h" +#include "sys/mman.h" -void process_mem_usage(double& vm_usage, double& resident_set) -{ - vm_usage = 0.0; - resident_set = 0.0; +// #include +// #include +// #include - // the two fields we want - unsigned long vsize; - long rss; - { - std::string ignore; - std::ifstream ifs("/proc/self/stat", std::ios_base::in); - ifs >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore - >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore - >> ignore >> ignore >> vsize >> rss; - } +void process_mem_usage(double &vm_usage, double &resident_set) { + vm_usage = 0.0; + resident_set = 0.0; - long page_size_kb = sysconf(_SC_PAGE_SIZE) / 1024; // in case x86-64 is configured to use 2MB pages - vm_usage = vsize / 1024.0; - resident_set = rss * page_size_kb; + // the two fields we want + unsigned long vsize; + long rss; + { + std::string ignore; + std::ifstream ifs("/proc/self/stat", std::ios_base::in); + ifs >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> + ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> + ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> + ignore >> vsize >> rss; + } + + long page_size_kb = sysconf(_SC_PAGE_SIZE) / + 1024; // in case x86-64 is configured to use 2MB pages + vm_usage = vsize / 1024.0; + resident_set = rss * page_size_kb; } + +// Mostly copied from lib/mmseqs/src/taxonomy/NcbiTaxonomy.cpp +NcbiTaxonomy *loadTaxonomy(const std::string &dbDir, + const std::string &taxonomyDir) { + std::string binFile = dbDir + "/taxonomyDB"; + if (fileExist(binFile)) { + FILE *handle = fopen(binFile.c_str(), "r"); + struct stat sb; + if (fstat(fileno(handle), &sb) < 0) { + Debug(Debug::ERROR) << "Failed to fstat file " << binFile << "\n"; + EXIT(EXIT_FAILURE); + } + char *data = (char *)mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, + fileno(handle), 0); + if (data == MAP_FAILED) { + Debug(Debug::ERROR) << "Failed to mmap file " << binFile << " with error " + << errno << "\n"; + EXIT(EXIT_FAILURE); + } + fclose(handle); + NcbiTaxonomy *t = NcbiTaxonomy::unserialize(data); + if (t != NULL) { + t->setMmapData(data, sb.st_size); + return t; + } else { + Debug(Debug::WARNING) << "Outdated taxonomy information, please recreate " + "with createtaxdb.\n"; + } + } else if (taxonomyDir != "") { + return new NcbiTaxonomy(taxonomyDir + "/names.dmp", + taxonomyDir + "/nodes.dmp", + taxonomyDir + "/merged.dmp"); + } + + return new NcbiTaxonomy(dbDir + "/taxonomy/names.dmp", + dbDir + "/taxonomy/nodes.dmp", + dbDir + "/taxonomy/merged.dmp"); +} \ No newline at end of file diff --git a/src/commons/common.h b/src/commons/common.h index 9b499da2..4e2fcbcd 100644 --- a/src/commons/common.h +++ b/src/commons/common.h @@ -81,4 +81,6 @@ inline bool fileExist(const std::string& name) { void process_mem_usage(double& vm_usage, double& resident_set); +NcbiTaxonomy * loadTaxonomy(const std::string & dbDir, const std::string & taxonomyDir = ""); + #endif //ADCLASSIFIER2_COMMON_H diff --git a/src/workflow/build.cpp 
b/src/workflow/build.cpp index 80addbb0..f78311cc 100644 --- a/src/workflow/build.cpp +++ b/src/workflow/build.cpp @@ -9,7 +9,7 @@ void setDefaults_build(LocalParameters & par){ par.spaceMask = "11111111"; par.taxonomyPath = "" ; par.splitNum = 4096; - par.maskProb = 0.5; + par.maskProb = 0.9; par.maskMode = 0; par.bufferSize = 1'000'000'000; } @@ -19,23 +19,14 @@ int build(int argc, const char **argv, const Command &command){ LocalParameters &par = LocalParameters::getLocalInstance(); setDefaults_build(par); par.parseParameters(argc, argv, command, true, Parameters::PARSE_ALLOW_EMPTY, 0); - string dbDirectory = par.filenames[0]; - string fastaListPath = par.filenames[1]; - string mappingFile = par.filenames[2]; - if (par.taxonomyPath.empty()) { - par.taxonomyPath = dbDirectory + "/taxonomy/"; - } else { - par.taxonomyPath = par.taxonomyPath + "/"; - } - + // If dbDirectory does not exist, create it - if (!FileUtil::directoryExists(dbDirectory.c_str())) { - FileUtil::makeDir(dbDirectory.c_str()); + if (!FileUtil::directoryExists(par.filenames[0].c_str())) { + FileUtil::makeDir(par.filenames[0].c_str()); } - cout << "Taxonomy path: " << par.taxonomyPath << endl; - - IndexCreator idxCre(par, dbDirectory, fastaListPath, mappingFile); + // Create index + IndexCreator idxCre(par); idxCre.createIndex(par); if(idxCre.getNumOfFlush() == 1) { @@ -43,7 +34,7 @@ int build(int argc, const char **argv, const Command &command){ return 0; } - //Merge files + // Merge index files cout << "Merge reference DB files ... " << endl; int numOfSplits = idxCre.getNumOfFlush(); FileMerger merger(par); diff --git a/src/workflow/classify.cpp b/src/workflow/classify.cpp index 21a45f17..36062932 100644 --- a/src/workflow/classify.cpp +++ b/src/workflow/classify.cpp @@ -16,7 +16,7 @@ void setClassifyDefaults(LocalParameters & par){ par.minCoveredPos = 4; par.printLog = 0; par.maxGap = 0; - par.taxonomyPath = "DBDIR/taxonomy/" ; + par.taxonomyPath = "" ; par.minConsCnt = 4; par.minConsCntEuk = 9; par.eukaryotaTaxId = 2759; diff --git a/src/workflow/filter.cpp b/src/workflow/filter.cpp index e4328cb6..d3d3a08e 100644 --- a/src/workflow/filter.cpp +++ b/src/workflow/filter.cpp @@ -14,7 +14,7 @@ void setFilterDefaults(LocalParameters & par){ par.minCoveredPos = 4; par.printLog = 0; par.maxGap = 0; - par.taxonomyPath = "DBDIR/taxonomy/" ; + par.taxonomyPath = "" ; par.minConsCnt = 4; par.minConsCntEuk = 9; par.eukaryotaTaxId = 2759; From df1316b6a2cce95b9a62628a550611e7f34c433d Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Tue, 22 Aug 2023 17:22:26 +0900 Subject: [PATCH 19/65] binary taxonomy DB writing and reading --- src/workflow/add_to_library.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/workflow/add_to_library.cpp b/src/workflow/add_to_library.cpp index 4f8bfacc..de51bd8f 100644 --- a/src/workflow/add_to_library.cpp +++ b/src/workflow/add_to_library.cpp @@ -16,7 +16,6 @@ void setDefaults_addToLibrary(LocalParameters & par){ } // Group sequences by species -// int addToLibrary(int argc, const char **argv, const Command &command){ LocalParameters &par = LocalParameters::getLocalInstance(); setDefaults_addToLibrary(par); From f72f541ca8ceb0ab5adff54889ff55451e6e1ed2 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Thu, 24 Aug 2023 10:44:43 +0900 Subject: [PATCH 20/65] first running code for filtering module --- lib/prodigal/bitmap.cpp | 18 ++++ lib/prodigal/bitmap.h | 17 +--- src/commons/Classifier.cpp | 9 +- src/commons/IndexCreator.cpp | 43 ++++++-- src/commons/IndexCreator.h | 6 +- 
src/commons/KmerExtractor.h | 2 - src/commons/KmerMatcher.cpp | 168 +++++++++++++++++++++++++------- src/commons/KmerMatcher.h | 2 + src/commons/QueryFilter.cpp | 47 ++++++--- src/commons/QueryFilter.h | 6 +- src/commons/Reporter.cpp | 42 +++++--- src/commons/Reporter.h | 15 ++- src/commons/common.cpp | 23 +++++ src/commons/common.h | 3 + src/metabuli.cpp | 2 +- src/workflow/add_to_library.cpp | 11 +-- src/workflow/build.cpp | 2 +- src/workflow/classify.cpp | 5 +- src/workflow/filter.cpp | 16 +-- 19 files changed, 323 insertions(+), 114 deletions(-) diff --git a/lib/prodigal/bitmap.cpp b/lib/prodigal/bitmap.cpp index 76abc9d8..b000cb6e 100644 --- a/lib/prodigal/bitmap.cpp +++ b/lib/prodigal/bitmap.cpp @@ -21,4 +21,22 @@ #include "bitmap.h" /* Test a bit, 0 = not set, 1 = set */ + unsigned char test(unsigned char *bm, int ndx) { + return ( bm[ndx>>3] & (1 << (ndx&0x07))?1:0 ); + } + +/* Clear a bit (set it to 0) */ + void clear(unsigned char *bm, int ndx) { + bm[ndx>>3] &= ~(1 << (ndx&0x07)); + } + +/* Set a bit to 1 */ + void set(unsigned char *bm, int ndx) { + bm[ndx>>3] |= (1 << (ndx&0x07)); + } + +/* Flip a bit's value 0->1 or 1->0 */ + void toggle(unsigned char *bm, int ndx) { + bm[ndx>>3] ^= (1 << (ndx&0x07)); + } diff --git a/lib/prodigal/bitmap.h b/lib/prodigal/bitmap.h index 74eb4370..4253f071 100644 --- a/lib/prodigal/bitmap.h +++ b/lib/prodigal/bitmap.h @@ -21,23 +21,16 @@ #ifndef BITMAP_H_ #define BITMAP_H_ - unsigned char static test(unsigned char *bm, int ndx) { - return ( bm[ndx>>3] & (1 << (ndx&0x07))?1:0 ); - } +/* Test a bit, 0 = not set, 1 = set */ +unsigned char test(unsigned char *bm, int ndx); /* Clear a bit (set it to 0) */ - void static clear(unsigned char *bm, int ndx) { - bm[ndx>>3] &= ~(1 << (ndx&0x07)); - } +void clear(unsigned char *bm, int ndx); /* Set a bit to 1 */ - void static set(unsigned char *bm, int ndx) { - bm[ndx>>3] |= (1 << (ndx&0x07)); - } +void set(unsigned char *bm, int ndx); /* Flip a bit's value 0->1 or 1->0 */ - void static toggle(unsigned char *bm, int ndx) { - bm[ndx>>3] ^= (1 << (ndx&0x07)); - } +void toggle(unsigned char *bm, int ndx); #endif diff --git a/src/commons/Classifier.cpp b/src/commons/Classifier.cpp index c426f889..9258a194 100644 --- a/src/commons/Classifier.cpp +++ b/src/commons/Classifier.cpp @@ -1,16 +1,15 @@ #include "Classifier.h" +#include "FileUtil.h" +#include "common.h" Classifier::Classifier(LocalParameters & par) { // Load parameters dbDir = par.filenames[1 + (par.seqMode == 2)]; matchPerKmer = par.matchPerKmer; - + loadDbParameters(par); + // Taxonomy taxonomy = loadTaxonomy(dbDir, par.taxonomyPath); - // if (par.taxonomyPath == "DBDIR/taxonomy/") par.taxonomyPath = dbDir + "/taxonomy/"; - // taxonomy = new NcbiTaxonomy(par.taxonomyPath + "/names.dmp", - // par.taxonomyPath + "/nodes.dmp", - // par.taxonomyPath + "/merged.dmp"); // Agents queryIndexer = new QueryIndexer(par); diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp index 834f60a1..7f611b93 100644 --- a/src/commons/IndexCreator.cpp +++ b/src/commons/IndexCreator.cpp @@ -1,11 +1,13 @@ #include "IndexCreator.h" - +#include #include IndexCreator::IndexCreator(const LocalParameters & par) { // Parameters threadNum = par.threads; bufferSize = par.bufferSize; + reducedAA = par.reducedAA; + spaceMask = par.spaceMask; // Input files dbDir = par.filenames[0]; @@ -18,11 +20,11 @@ IndexCreator::IndexCreator(const LocalParameters & par) { fnaListFileName = par.filenames[1]; acc2taxidFileName = par.filenames[2]; - // Output files taxidListFileName 
= dbDir + "/taxID_list"; taxonomyBinaryFileName = dbDir + "/taxonomyDB"; versionFileName = dbDir + "/db.version"; + paramterFileName = dbDir + "/db.parameters"; // Load taxonomy taxonomy = new NcbiTaxonomy(taxonomyDir + "/names.dmp", @@ -53,6 +55,11 @@ void IndexCreator::createIndex(const LocalParameters &par) { makeBlocksForParallelProcessing(); cout << "Made blocks for each thread" << endl; + // Print fnaSplits + for (auto & fnaSplit : fnaSplits) { + cout << fnaSplit.offset << " " << fnaSplit.cnt << " " << fnaSplit.speciesID << " " << fnaSplit.file_idx << " " << fnaSplit.training << endl; + } + // Write taxonomy id list FILE * taxidListFile = fopen(taxidListFileName.c_str(), "w"); for (auto & taxid : taxIdList) { @@ -98,6 +105,7 @@ void IndexCreator::createIndex(const LocalParameters &par) { } delete[] splitChecker; writeTaxonomyDB(); + writeDbParameters(); } void IndexCreator::updateIndex(const LocalParameters &par) { @@ -207,22 +215,34 @@ void IndexCreator::splitFastaForProdigalTraining(int file_idx, TaxID speciesID) bool stored = false; while(seqIdx < fastaList[file_idx].sequences.size()){ stored = false; + + // Skip if(speciesID == 0) { seqIdx++; continue;} + // Length currLength = fastaList[file_idx].sequences[seqIdx].length; if (currLength > maxLength){ maxLength = currLength; seqForTraining = seqIdx; } lengthSum += currLength; + cnt ++; + // Check the size of current split if(lengthSum > 100'000'000 || cnt > 300 || (cnt > 100 && lengthSum > 50'000'000)){ - tempSplits.emplace_back(0, offset, cnt - 1, speciesID, file_idx); - offset += cnt - 1; + tempSplits.emplace_back(0, offset, cnt, speciesID, file_idx); + offset += cnt; lengthSum = 0; - cnt = 1; + cnt = 0; stored = true; } + // if(lengthSum > 100'000'000 || cnt > 300 || (cnt > 100 && lengthSum > 50'000'000)){ + // tempSplits.emplace_back(0, offset, cnt - 1, speciesID, file_idx); + // offset += cnt - 1; + // lengthSum = 0; + // cnt = 1; + // stored = true; + // } seqIdx ++; } if(!stored){ @@ -801,7 +821,7 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer, cout << omp_get_thread_num() << " Processed " << i << "th splits (" << processedSplitCnt << ")" << endl; #endif munmap(fastaFile.data, fastaFile.fileSize + 1); - }else { + } else { // Withdraw the reservation if the buffer is full. cout << "Buffer is full. Withdraw the reservation." 
<< endl; checker[i] = false; @@ -827,3 +847,14 @@ void IndexCreator::writeTaxonomyDB() { fclose(handle); free(serialized.first); } + +void IndexCreator::writeDbParameters() { + FILE *handle = fopen(paramterFileName.c_str(), "w"); + if (handle == NULL) { + Debug(Debug::ERROR) << "Could not open " << paramterFileName << " for writing\n"; + EXIT(EXIT_FAILURE); + } + fprintf(handle, "Reduced_alphabet\t%d\n", reducedAA); + fprintf(handle, "Spaced_kmer_mask\t%s\n", spaceMask.c_str()); + fclose(handle); +} diff --git a/src/commons/IndexCreator.h b/src/commons/IndexCreator.h index 39336e9a..8f343717 100644 --- a/src/commons/IndexCreator.h +++ b/src/commons/IndexCreator.h @@ -42,6 +42,8 @@ class IndexCreator{ // Parameters int threadNum; size_t bufferSize; + int reducedAA; + string spaceMask; // Inputs NcbiTaxonomy * taxonomy; @@ -50,11 +52,11 @@ class IndexCreator{ string taxonomyDir; string acc2taxidFileName; - // Outputs string taxidListFileName; string taxonomyBinaryFileName; string versionFileName; + string paramterFileName; struct FASTA { string path; @@ -105,6 +107,8 @@ class IndexCreator{ void writeTaxonomyDB(); + void writeDbParameters(); + static bool compareForDiffIdx(const TargetKmer & a, const TargetKmer & b); // void maskLowComplexityRegions(char * seq, char * maskedSeq, ProbabilityMatrix & probMat, diff --git a/src/commons/KmerExtractor.h b/src/commons/KmerExtractor.h index 2e7f2977..14226262 100644 --- a/src/commons/KmerExtractor.h +++ b/src/commons/KmerExtractor.h @@ -29,8 +29,6 @@ class KmerExtractor { const QuerySplit & currentSplit, const LocalParameters &par); - - public: explicit KmerExtractor(const LocalParameters & par); ~KmerExtractor(); diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp index 3f9a1495..5b352c5f 100644 --- a/src/commons/KmerMatcher.cpp +++ b/src/commons/KmerMatcher.cpp @@ -1,4 +1,7 @@ #include "KmerMatcher.h" +#include "Kmer.h" +#include "Mmap.h" +#include KmerMatcher::KmerMatcher(const LocalParameters & par, NcbiTaxonomy * taxonomy) { @@ -11,48 +14,132 @@ KmerMatcher::KmerMatcher(const LocalParameters & par, MARKER = ~ MARKER; totalMatchCnt = 0; - // Load the taxonomy ID list - FILE * taxIdFile; - if((taxIdFile = fopen((dbDir + "/taxID_list").c_str(),"r")) == NULL){ - std::cout<<"Cannot open the taxID list file."<taxonNode(taxId); - if (taxId == taxon->taxId) { - TaxID speciesTaxID = taxonomy->getTaxIdAtRank(taxId, "species"); - TaxID genusTaxID = taxonomy->getTaxIdAtRank(taxId, "genus"); - while (taxon->taxId != speciesTaxID) { - taxId2speciesId[taxon->taxId] = speciesTaxID; - taxId2genusId[taxon->taxId] = genusTaxID; - taxon = taxonomy->taxonNode(taxon->parentTaxId); - } - taxId2speciesId[speciesTaxID] = speciesTaxID; - taxId2genusId[speciesTaxID] = genusTaxID; - } else { - TaxID speciesTaxID = taxonomy->getTaxIdAtRank(taxId, "species"); - TaxID genusTaxID = taxonomy->getTaxIdAtRank(taxId, "genus"); - while (taxon->taxId != speciesTaxID) { - taxId2speciesId[taxon->taxId] = speciesTaxID; - taxId2genusId[taxon->taxId] = genusTaxID; - taxon = taxonomy->taxonNode(taxon->parentTaxId); - } - taxId2speciesId[speciesTaxID] = speciesTaxID; - taxId2genusId[speciesTaxID] = genusTaxID; - taxId2speciesId[taxId] = speciesTaxID; - taxId2genusId[taxId] = genusTaxID; - } - } - fclose(taxIdFile); + this->taxonomy = taxonomy; + loadTaxIdList(par); + + // // Load the taxonomy ID list + // FILE * taxIdFile; + // if((taxIdFile = fopen((dbDir + "/taxID_list").c_str(),"r")) == NULL){ + // std::cout<<"Cannot open the taxID list file."<taxonNode(taxId); + // 
if (taxId == taxon->taxId) { + // TaxID speciesTaxID = taxonomy->getTaxIdAtRank(taxId, "species"); + // TaxID genusTaxID = taxonomy->getTaxIdAtRank(taxId, "genus"); + // while (taxon->taxId != speciesTaxID) { + // taxId2speciesId[taxon->taxId] = speciesTaxID; + // taxId2genusId[taxon->taxId] = genusTaxID; + // taxon = taxonomy->taxonNode(taxon->parentTaxId); + // } + // taxId2speciesId[speciesTaxID] = speciesTaxID; + // taxId2genusId[speciesTaxID] = genusTaxID; + // } else { + // TaxID speciesTaxID = taxonomy->getTaxIdAtRank(taxId, "species"); + // TaxID genusTaxID = taxonomy->getTaxIdAtRank(taxId, "genus"); + // while (taxon->taxId != speciesTaxID) { + // taxId2speciesId[taxon->taxId] = speciesTaxID; + // taxId2genusId[taxon->taxId] = genusTaxID; + // taxon = taxonomy->taxonNode(taxon->parentTaxId); + // } + // taxId2speciesId[speciesTaxID] = speciesTaxID; + // taxId2genusId[speciesTaxID] = genusTaxID; + // taxId2speciesId[taxId] = speciesTaxID; + // taxId2genusId[taxId] = genusTaxID; + // } + // } + // fclose(taxIdFile); } KmerMatcher::~KmerMatcher() { } +void KmerMatcher::loadTaxIdList(const LocalParameters & par) { + if (par.contamList != "") { + vector contams = Util::split(par.contamList, ","); + for (auto &contam : contams) { + FILE *taxIdFile; + cout << dbDir + "/" + contam + "/taxID_list" << endl; + if ((taxIdFile = fopen((dbDir + "/" + contam + "/taxID_list").c_str(), "r")) == NULL) { + std::cout << "Cannot open the taxID list file." << std::endl; + return; + } + char taxID[100]; + while (feof(taxIdFile) == 0) { + fscanf(taxIdFile, "%s", taxID); + TaxID taxId = atol(taxID); + TaxonNode const *taxon = taxonomy->taxonNode(taxId); + if (taxId == taxon->taxId) { + TaxID speciesTaxID = taxonomy->getTaxIdAtRank(taxId, "species"); + TaxID genusTaxID = taxonomy->getTaxIdAtRank(taxId, "genus"); + while (taxon->taxId != speciesTaxID) { + taxId2speciesId[taxon->taxId] = speciesTaxID; + taxId2genusId[taxon->taxId] = genusTaxID; + taxon = taxonomy->taxonNode(taxon->parentTaxId); + } + taxId2speciesId[speciesTaxID] = speciesTaxID; + taxId2genusId[speciesTaxID] = genusTaxID; + } else { + TaxID speciesTaxID = taxonomy->getTaxIdAtRank(taxId, "species"); + TaxID genusTaxID = taxonomy->getTaxIdAtRank(taxId, "genus"); + while (taxon->taxId != speciesTaxID) { + taxId2speciesId[taxon->taxId] = speciesTaxID; + taxId2genusId[taxon->taxId] = genusTaxID; + taxon = taxonomy->taxonNode(taxon->parentTaxId); + } + taxId2speciesId[speciesTaxID] = speciesTaxID; + taxId2genusId[speciesTaxID] = genusTaxID; + taxId2speciesId[taxId] = speciesTaxID; + taxId2genusId[taxId] = genusTaxID; + } + } + fclose(taxIdFile); + } + } else { + FILE *taxIdFile; + if ((taxIdFile = fopen((dbDir + "/taxID_list").c_str(), "r")) == NULL) { + std::cout << "Cannot open the taxID list file." 
<< std::endl; + return; + } + char taxID[100]; + while (feof(taxIdFile) == 0) { + fscanf(taxIdFile, "%s", taxID); + TaxID taxId = atol(taxID); + TaxonNode const *taxon = taxonomy->taxonNode(taxId); + if (taxId == taxon->taxId) { + TaxID speciesTaxID = taxonomy->getTaxIdAtRank(taxId, "species"); + TaxID genusTaxID = taxonomy->getTaxIdAtRank(taxId, "genus"); + while (taxon->taxId != speciesTaxID) { + taxId2speciesId[taxon->taxId] = speciesTaxID; + taxId2genusId[taxon->taxId] = genusTaxID; + taxon = taxonomy->taxonNode(taxon->parentTaxId); + } + taxId2speciesId[speciesTaxID] = speciesTaxID; + taxId2genusId[speciesTaxID] = genusTaxID; + } else { + TaxID speciesTaxID = taxonomy->getTaxIdAtRank(taxId, "species"); + TaxID genusTaxID = taxonomy->getTaxIdAtRank(taxId, "genus"); + while (taxon->taxId != speciesTaxID) { + taxId2speciesId[taxon->taxId] = speciesTaxID; + taxId2genusId[taxon->taxId] = genusTaxID; + taxon = taxonomy->taxonNode(taxon->parentTaxId); + } + taxId2speciesId[speciesTaxID] = speciesTaxID; + taxId2genusId[speciesTaxID] = genusTaxID; + taxId2speciesId[taxId] = speciesTaxID; + taxId2genusId[taxId] = genusTaxID; + } + } + fclose(taxIdFile); + } + cout << "Taxonomy ID list is loaded." << endl; +} + int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer, Buffer * matchBuffer, @@ -73,6 +160,15 @@ int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer, MmapedData diffIdxSplits = mmapData(diffIdxSplitFileName.c_str(), 3); size_t numOfDiffIdx = FileUtil::getFileSize(targetDiffIdxFileName) / sizeof(uint16_t); + MmapedData tempInfos = mmapData(targetInfoFileName.c_str(), 3); + size_t numOfInfos = tempInfos.fileSize / sizeof(TargetKmerInfo); + + // Print kmer infos + for (size_t i = 0; i < numOfInfos; i++) { + cout << (int) tempInfos.data[i].sequenceID << " " << (int) tempInfos.data[i].redundancy << endl; + } + + size_t queryKmerNum = queryKmerBuffer->startIndexOfReserve; QueryKmer *queryKmerList = queryKmerBuffer->buffer; diff --git a/src/commons/KmerMatcher.h b/src/commons/KmerMatcher.h index 36b5d8b3..cdf23e30 100644 --- a/src/commons/KmerMatcher.h +++ b/src/commons/KmerMatcher.h @@ -91,6 +91,8 @@ class KmerMatcher { static bool compareMatches(const Match &a, const Match &b); + void loadTaxIdList(const LocalParameters & par); + public: KmerMatcher(const LocalParameters &par, NcbiTaxonomy *taxonomy); diff --git a/src/commons/QueryFilter.cpp b/src/commons/QueryFilter.cpp index b22bc3fe..100cd7f2 100644 --- a/src/commons/QueryFilter.cpp +++ b/src/commons/QueryFilter.cpp @@ -1,4 +1,5 @@ #include "QueryFilter.h" +#include "common.h" QueryFilter::QueryFilter(LocalParameters & par) { // Load parameters @@ -7,14 +8,11 @@ QueryFilter::QueryFilter(LocalParameters & par) { printMode = par.printMode; seqMode = par.seqMode; contams = Util::split(par.contamList, ","); + loadDbParameters(par); // Taxonomy taxonomy = loadTaxonomy(dbDir, par.taxonomyPath); - // if (par.taxonomyPath == "DBDIR/taxonomy/") par.taxonomyPath = dbDir + "/taxonomy/"; - // taxonomy = new NcbiTaxonomy(par.taxonomyPath + "/names.dmp", - // par.taxonomyPath + "/nodes.dmp", - // par.taxonomyPath + "/merged.dmp"); - + // Agents queryIndexer = new QueryIndexer(par); kmerExtractor = new KmerExtractor(par); @@ -22,13 +20,19 @@ QueryFilter::QueryFilter(LocalParameters & par) { else { kmerMatcher = new KmerMatcher(par, taxonomy);} taxonomer = new Taxonomer(par, taxonomy); reporter = new Reporter(par, taxonomy); - setInputAndOutputFiles(par); + reporter->setReadClassificationFileName(readClassificationFileName); + 
reporter->setReportFileName(reportFileName); + cout << "Filtered reads: " << f1 << endl; + if (par.seqMode == 2) { cout << "Filtered reads: " << f2 << endl; } + if (printMode == 2) { + cout << "Removed reads: " << rm1 << endl; + if (par.seqMode == 2) { cout << "Removed reads: " << rm2 << endl; } + } + filter_kseq1 = KSeqFactory(in1.c_str()); if (par.seqMode == 2) { filter_kseq2 = KSeqFactory(in2.c_str()); } - isFiltered = new bool[queryIndexer->getReadNum_1()]; - memset(isFiltered, 0, sizeof(bool) * queryIndexer->getReadNum_1()); readCounter = 0; // Open output files @@ -38,6 +42,8 @@ QueryFilter::QueryFilter(LocalParameters & par) { rm1_fp = fopen(rm1.c_str(), "w"); if (par.seqMode == 2) { rm2_fp = fopen(rm2.c_str(), "w"); } } + + } QueryFilter::~QueryFilter() { @@ -59,19 +65,22 @@ QueryFilter::~QueryFilter() { } void QueryFilter::setInputAndOutputFiles(const LocalParameters & par) { + cout << "Setting output file names" << endl; // Get the base name of in1 in1 = par.filenames[0]; string baseName = LocalUtil::getQueryBaseName(in1); // Set the output file names - f1 = baseName + "_filtered.fna.gz"; - rm1 = baseName + "_removed.fna.gz"; + f1 = baseName + "_filtered.fna"; + rm1 = baseName + "_removed.fna"; + reportFileName = baseName + "_report.tsv"; + readClassificationFileName = baseName + "_classifications.tsv"; // For paired-end reads if (seqMode == 2) { in2 = par.filenames[1]; - f2 = LocalUtil::getQueryBaseName(in2) + "_filtered.fna.gz"; - rm2 = LocalUtil::getQueryBaseName(in2) + "_removed.fna.gz"; + f2 = LocalUtil::getQueryBaseName(in2) + "_filtered.fna"; + rm2 = LocalUtil::getQueryBaseName(in2) + "_removed.fna"; } } @@ -88,7 +97,7 @@ void QueryFilter::printFilteredReads() { if (seqMode == 2) { filter_kseq2->ReadEntry(); } // Print reads - if (isFiltered[i]) { // Print filtered reads + if (!isFiltered[i]) { // Print filtered reads fprintf(f1_fp, ">%s\n%s\n", filter_kseq1->entry.name.s, filter_kseq1->entry.sequence.s); if (seqMode == 2) { fprintf(f2_fp, ">%s\n%s\n", filter_kseq2->entry.name.s, filter_kseq2->entry.sequence.s); } } else if (printMode == 2) { // Print removed reads @@ -105,16 +114,21 @@ void QueryFilter::filterReads(LocalParameters & par) { size_t numOfSeq = queryIndexer->getReadNum_1(); size_t totalReadLength = queryIndexer->getTotalReadLength(); const vector & queryReadSplit = queryIndexer->getQuerySplits(); + // print queryReadSplit + // for (size_t i = 0; i < queryReadSplit.size(); i++) { + // cout << queryReadSplit[i].start << " " << queryReadSplit[i].end << " " << queryReadSplit[i].kmerCnt << endl; + // } cout << "Done" << endl; cout << "Total number of sequences: " << numOfSeq << endl; cout << "Total read length: " << totalReadLength << "nt" << endl; + isFiltered = new bool[queryIndexer->getReadNum_1()]; + memset(isFiltered, 0, sizeof(bool) * queryIndexer->getReadNum_1()); QueryKmerBuffer kmerBuffer; Buffer matchBuffer; vector queryList; size_t numOfTatalQueryKmerCnt = 0; - size_t totalMatchCnt = 0; size_t processedSeqCnt = 0; reporter->openReadClassificationFile(); @@ -169,8 +183,11 @@ void QueryFilter::filterReads(LocalParameters & par) { recordFilteredReads(queryList); } + + cout << "Number of query k-mers: " << numOfTatalQueryKmerCnt << endl; + cout << "The number of matches: " << kmerMatcher->getTotalMatchCnt() << endl; printFilteredReads(); - reporter->writeReportFile(numOfSeq, taxonomer->getTaxCounts()); + reporter->writeReportFile(numOfSeq, taxonomer->getTaxCounts(), false); reporter->closeReadClassificationFile(); // Memory deallocation diff --git 
a/src/commons/QueryFilter.h b/src/commons/QueryFilter.h index 3dccb7e6..a53455c4 100644 --- a/src/commons/QueryFilter.h +++ b/src/commons/QueryFilter.h @@ -29,7 +29,11 @@ class QueryFilter { KSeqWrapper* filter_kseq1; KSeqWrapper* filter_kseq2; - std::string in1, in2, f1, f2, rm1, rm2; // input and output file names + std::string in1, in2; + std::string f1, f2, rm1, rm2; // input and output file names + std::string readClassificationFileName; + std::string reportFileName; + bool * isFiltered; size_t readCounter; FILE * f1_fp, * f2_fp, * rm1_fp, * rm2_fp; diff --git a/src/commons/Reporter.cpp b/src/commons/Reporter.cpp index 566e8c66..0c6b9cd1 100644 --- a/src/commons/Reporter.cpp +++ b/src/commons/Reporter.cpp @@ -1,18 +1,26 @@ #include "Reporter.h" #include "taxonomyreport.cpp" -Reporter::Reporter(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxonomy(taxonomy){ - if (par.seqMode == 2) { - outDir = par.filenames[3]; - jobId = par.filenames[4]; - } else { - outDir = par.filenames[2]; - jobId = par.filenames[3]; +Reporter::Reporter(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxonomy(taxonomy) { + if (par.contamList == "") { // classify module + if (par.seqMode == 2) { + outDir = par.filenames[3]; + jobId = par.filenames[4]; + } else { + outDir = par.filenames[2]; + jobId = par.filenames[3]; + } + // Output file names + reportFileName = outDir + + "/" + jobId + "_report.tsv"; + readClassificationFileName = outDir + "/" + jobId + "_classifications.tsv"; } + + + } void Reporter::openReadClassificationFile() { - readClassificationFile.open(outDir + "/" + jobId + "_classifications.tsv"); + readClassificationFile.open(readClassificationFileName); } void Reporter::writeReadClassification(const vector & queryList, bool classifiedOnly) { @@ -38,20 +46,22 @@ void Reporter::closeReadClassificationFile() { readClassificationFile.close(); } -void Reporter::writeReportFile(int numOfQuery, unordered_map &taxCnt) { +void Reporter::writeReportFile(int numOfQuery, unordered_map &taxCnt, bool krona) { unordered_map cladeCounts = taxonomy->getCladeCounts(taxCnt); FILE *fp; - fp = fopen((outDir + + "/" + jobId + "_report.tsv").c_str(), "w"); + fp = fopen((reportFileName).c_str(), "w"); writeReport(fp, cladeCounts, numOfQuery); fclose(fp); // Write Krona chart - FILE *kronaFile = fopen((outDir + "/" + jobId + "_krona.html").c_str(), "w"); - fwrite(krona_prelude_html, krona_prelude_html_len, sizeof(char), kronaFile); - fprintf(kronaFile, "%zu", numOfQuery); - kronaReport(kronaFile, *taxonomy, cladeCounts, numOfQuery); - fprintf(kronaFile, ""); - + if (krona) { + FILE *kronaFile = fopen((outDir + "/" + jobId + "_krona.html").c_str(), "w"); + fwrite(krona_prelude_html, krona_prelude_html_len, sizeof(char), kronaFile); + fprintf(kronaFile, "%zu", numOfQuery); + kronaReport(kronaFile, *taxonomy, cladeCounts, numOfQuery); + fprintf(kronaFile, ""); + fclose(kronaFile); + } } void Reporter::writeReport(FILE *FP, const std::unordered_map &cladeCounts, diff --git a/src/commons/Reporter.h b/src/commons/Reporter.h index d64e567c..4745bf26 100644 --- a/src/commons/Reporter.h +++ b/src/commons/Reporter.h @@ -17,13 +17,14 @@ class Reporter { NcbiTaxonomy * taxonomy; // Output + string reportFileName; + string readClassificationFileName; ofstream readClassificationFile; - public: Reporter(const LocalParameters &par, NcbiTaxonomy *taxonomy); // Write report - void writeReportFile(int numOfQuery, unordered_map &taxCnt); + void writeReportFile(int numOfQuery, unordered_map &taxCnt, bool krona = true); void 
writeReport(FILE *FP, const std::unordered_map &cladeCounts, unsigned long totalReads, TaxID taxID = 0, int depth = 0); @@ -32,10 +33,16 @@ class Reporter { void writeReadClassification(const vector & queryList, bool classifiedOnly = false); void closeReadClassificationFile(); - - unsigned int cladeCountVal(const std::unordered_map &map, TaxID key); + // Setter + void setReportFileName(const string &reportFileName) { + Reporter::reportFileName = reportFileName; + } + + void setReadClassificationFileName(const string &readClassificationFileName) { + Reporter::readClassificationFileName = readClassificationFileName; + } }; diff --git a/src/commons/common.cpp b/src/commons/common.cpp index 57b5bc79..92d7c735 100644 --- a/src/commons/common.cpp +++ b/src/commons/common.cpp @@ -6,6 +6,7 @@ #include // #include "MathUtil.h" #include "Debug.h" +#include "Reporter.h" #include "Util.h" #include "sys/mman.h" @@ -71,4 +72,26 @@ NcbiTaxonomy *loadTaxonomy(const std::string &dbDir, return new NcbiTaxonomy(dbDir + "/taxonomy/names.dmp", dbDir + "/taxonomy/nodes.dmp", dbDir + "/taxonomy/merged.dmp"); +} + +int loadDbParameters(LocalParameters &par) { + std::string dbDir = par.filenames[1 + (par.seqMode == 2)]; + if (fileExist(dbDir + "/db.parameters")) { + // open db.parameters + std::ifstream dbParametersFile; + dbParametersFile.open(dbDir + "/db.parameters"); + std::string eachLine; + if (dbParametersFile.is_open()) { + while (getline(dbParametersFile, eachLine)) { + std::vector tokens = Util::split(eachLine, "\t"); + if (tokens[0] == "Reduced_alphabet") { + par.reducedAA = stoi(tokens[1]); + } else if (tokens[0] == "Spaced_kmer_mask") { + par.spaceMask = tokens[1]; + } + } + return 1; + } + } + return 0; } \ No newline at end of file diff --git a/src/commons/common.h b/src/commons/common.h index 4e2fcbcd..7749c39a 100644 --- a/src/commons/common.h +++ b/src/commons/common.h @@ -1,6 +1,7 @@ #ifndef ADCLASSIFIER2_COMMON_H #define ADCLASSIFIER2_COMMON_H #include +#include "LocalParameters.h" #include "NcbiTaxonomy.h" #include @@ -83,4 +84,6 @@ void process_mem_usage(double& vm_usage, double& resident_set); NcbiTaxonomy * loadTaxonomy(const std::string & dbDir, const std::string & taxonomyDir = ""); +int loadDbParameters(LocalParameters & par); + #endif //ADCLASSIFIER2_COMMON_H diff --git a/src/metabuli.cpp b/src/metabuli.cpp index 54c447b5..7783d91f 100644 --- a/src/metabuli.cpp +++ b/src/metabuli.cpp @@ -67,7 +67,7 @@ std::vector commands = { {"DB dir", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}, {"out dir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}, {"job ID", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}, - {"filter", classify, &localPar.filter, COMMAND_MAIN, + {"filter", filter, &localPar.filter, COMMAND_MAIN, "Filtering reads based on the classification result", nullptr, "Jaebeom Kim ", diff --git a/src/workflow/add_to_library.cpp b/src/workflow/add_to_library.cpp index de51bd8f..e5f97eba 100644 --- a/src/workflow/add_to_library.cpp +++ b/src/workflow/add_to_library.cpp @@ -6,6 +6,7 @@ #include #include "IndexCreator.h" #include "FileUtil.h" +#include "common.h" #include using namespace std; @@ -33,10 +34,7 @@ int addToLibrary(int argc, const char **argv, const Command &command){ } // Load taxonomy - string names = par.taxonomyPath + "/names.dmp"; - string nodes = par.taxonomyPath + "/nodes.dmp"; - string merged = par.taxonomyPath + "/merged.dmp"; - NcbiTaxonomy ncbiTaxonomy(names, nodes, merged); + NcbiTaxonomy * 
taxonomy = loadTaxonomy(dbDir); // Load file names ifstream fileListFile; @@ -91,7 +89,7 @@ int addToLibrary(int argc, const char **argv, const Command &command){ } // Get species taxID - int speciesTaxID = ncbiTaxonomy.getTaxIdAtRank(acc2taxid[accession], "species"); + int speciesTaxID = taxonomy->getTaxIdAtRank(acc2taxid[accession], "species"); // Skip if species taxID is not found if (speciesTaxID == 0) { @@ -157,7 +155,7 @@ int addToLibrary(int argc, const char **argv, const Command &command){ } // Get species taxID - int speciesTaxID = ncbiTaxonomy.getTaxIdAtRank(assembly2taxid[assemblyID], "species"); + int speciesTaxID = taxonomy->getTaxIdAtRank(assembly2taxid[assemblyID], "species"); if (speciesTaxID == 0) { cout << "During processing " << fileNames[i] << ", accession " << assemblyID << " is not matched to any species. It is skipped." << endl; @@ -196,5 +194,6 @@ int addToLibrary(int argc, const char **argv, const Command &command){ } fclose(file); } + delete taxonomy; return EXIT_SUCCESS; } \ No newline at end of file diff --git a/src/workflow/build.cpp b/src/workflow/build.cpp index f78311cc..b5f9e571 100644 --- a/src/workflow/build.cpp +++ b/src/workflow/build.cpp @@ -10,7 +10,7 @@ void setDefaults_build(LocalParameters & par){ par.taxonomyPath = "" ; par.splitNum = 4096; par.maskProb = 0.9; - par.maskMode = 0; + par.maskMode = 1; par.bufferSize = 1'000'000'000; } diff --git a/src/workflow/classify.cpp b/src/workflow/classify.cpp index 36062932..514f6ae8 100644 --- a/src/workflow/classify.cpp +++ b/src/workflow/classify.cpp @@ -2,14 +2,15 @@ #include "Parameters.h" #include "LocalParameters.h" #include "FileUtil.h" +#include "common.h" void setClassifyDefaults(LocalParameters & par){ - par.seqMode = 2; par.reducedAA = 0; + par.spaceMask = "11111111"; + par.seqMode = 2; par.minScore = 0; par.minCoverage = 0; par.minSpScore = 0; - par.spaceMask = "11111111"; par.hammingMargin = 0; par.verbosity = 3; par.ramUsage = 128; diff --git a/src/workflow/filter.cpp b/src/workflow/filter.cpp index d3d3a08e..3d0b0448 100644 --- a/src/workflow/filter.cpp +++ b/src/workflow/filter.cpp @@ -1,13 +1,13 @@ #include "LocalParameters.h" #include "QueryFilter.h" -void setFilterDefaults(LocalParameters & par){ - par.seqMode = 2; +void setFilterDefaults(LocalParameters & par) { par.reducedAA = 0; - par.minScore = 0.7; + par.spaceMask = "11111111"; + par.seqMode = 2; + par.minScore = 0.5; par.minCoverage = 0; par.minSpScore = 0; - par.spaceMask = "11111111"; par.hammingMargin = 0; par.verbosity = 3; par.ramUsage = 128; @@ -25,8 +25,7 @@ void setFilterDefaults(LocalParameters & par){ par.contamList = ""; // TODO: set default } -int filter(int argc, const char **argv, const Command& command) -{ +int filter(int argc, const char **argv, const Command& command) { LocalParameters & par = LocalParameters::getLocalInstance(); setFilterDefaults(par); par.parseParameters(argc, argv, command, true, Parameters::PARSE_ALLOW_EMPTY, 0); @@ -35,6 +34,11 @@ int filter(int argc, const char **argv, const Command& command) omp_set_num_threads(par.threads); #endif + if (par.contamList == "") { + cerr << "Error: Contamination list is not specified." 
<< endl; + return 1; + } + QueryFilter * queryFilter = new QueryFilter(par); queryFilter->filterReads(par); From bde8c6b26da19a54a6398d53b122ed5814bd67de Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Thu, 24 Aug 2023 15:35:42 +0900 Subject: [PATCH 21/65] first running version --- src/commons/IndexCreator.cpp | 130 +++++++++++++++++++++++++++-------- src/commons/IndexCreator.h | 13 +++- src/commons/KmerMatcher.cpp | 9 --- src/commons/LocalUtil.cpp | 23 ++++++- src/commons/LocalUtil.h | 4 ++ src/commons/SeqIterator.cpp | 4 -- src/commons/Taxonomer.cpp | 1 - 7 files changed, 138 insertions(+), 46 deletions(-) diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp index 7f611b93..318d3040 100644 --- a/src/commons/IndexCreator.cpp +++ b/src/commons/IndexCreator.cpp @@ -1,4 +1,7 @@ #include "IndexCreator.h" +#include "FileUtil.h" +#include "LocalUtil.h" +#include #include #include @@ -27,9 +30,9 @@ IndexCreator::IndexCreator(const LocalParameters & par) { paramterFileName = dbDir + "/db.parameters"; // Load taxonomy - taxonomy = new NcbiTaxonomy(taxonomyDir + "/names.dmp", - taxonomyDir + "/nodes.dmp", - taxonomyDir + "/merged.dmp"); + // taxonomy = new NcbiTaxonomy(taxonomyDir + "/names.dmp", + // taxonomyDir + "/nodes.dmp", + // taxonomyDir + "/merged.dmp"); if (par.reducedAA == 1){ MARKER = 0Xffffffff; @@ -55,11 +58,6 @@ void IndexCreator::createIndex(const LocalParameters &par) { makeBlocksForParallelProcessing(); cout << "Made blocks for each thread" << endl; - // Print fnaSplits - for (auto & fnaSplit : fnaSplits) { - cout << fnaSplit.offset << " " << fnaSplit.cnt << " " << fnaSplit.speciesID << " " << fnaSplit.file_idx << " " << fnaSplit.training << endl; - } - // Write taxonomy id list FILE * taxidListFile = fopen(taxidListFileName.c_str(), "w"); for (auto & taxid : taxIdList) { @@ -162,7 +160,10 @@ void IndexCreator::updateIndex(const LocalParameters &par) { void IndexCreator::makeBlocksForParallelProcessing(){ unordered_map acc2taxid; - load_accession2taxid(acc2taxidFileName, acc2taxid); + TaxID maxTaxID = load_accession2taxid(acc2taxidFileName, acc2taxid); + newTaxID = maxTaxID + 1; + + vector>> newAcc2taxid; // accession.version -> (parent, newTaxID) // Make blocks of sequences that can be processed in parallel int fileNum = getNumberOfLines(fnaListFileName); @@ -176,18 +177,42 @@ void IndexCreator::makeBlocksForParallelProcessing(){ } string eachFile; string seqHeader; + string accession_version; + string accession; + vector tempTaxIDList; - unordered_map foundAcc2taxid; for (int i = 0; i < fileNum; ++i) { // Get start and end position of each sequence in the file getline(fnaListFile, eachFile); fastaList[i].path = eachFile; processedSeqCnt.push_back(taxIdList.size()); - seqHeader = getSeqSegmentsWithHead(fastaList[i].sequences, eachFile, acc2taxid, foundAcc2taxid); - seqHeader = seqHeader.substr(1, seqHeader.find('.') - 1); - TaxID speciesTaxid = taxonomy->getTaxIdAtRank(acc2taxid[seqHeader], "species"); - // Split current file into blocks for parallel processing + + seqHeader = getSeqSegmentsWithHead(fastaList[i].sequences, eachFile, acc2taxid, newAcc2taxid); + // accession_version = seqHeader.substr(1, seqHeader.find('.') - 1); + accession = seqHeader.substr(1, seqHeader.find('.') - 1); + accession_version = seqHeader.substr(1, LocalUtil::getFirstWhiteSpacePos(seqHeader) - 1); + // newAcc2taxid.emplace_back(accession_version, make_pair(acc2taxid[accession], newTaxID)); + tempTaxIDList.push_back(acc2taxid[accession]); + + // TaxID speciesTaxid = 
taxonomy->getTaxIdAtRank(acc2taxid[accession], "species"); + + // // Split current file into blocks for parallel processing + // splitFastaForProdigalTraining(i, speciesTaxid); + // fastaList[i].speciesID = speciesTaxid; + } + + // Edit taxonomy dump files + editTaxonomyDumpFiles(newAcc2taxid); + + // Load taxonomy + taxonomy = new NcbiTaxonomy(taxonomyDir + "/names.dmp.new", + taxonomyDir + "/nodes.dmp.new", + taxonomyDir + "/merged.dmp"); + + + for (int i = 0; i < fileNum; ++i) { + TaxID speciesTaxid = taxonomy->getTaxIdAtRank(tempTaxIDList[i], "species"); splitFastaForProdigalTraining(i, speciesTaxid); fastaList[i].speciesID = speciesTaxid; } @@ -196,8 +221,8 @@ void IndexCreator::makeBlocksForParallelProcessing(){ // Write accession to taxid map to file string acc2taxidFileName2 = dbDir + "/acc2taxid.map"; FILE * acc2taxidFile = fopen(acc2taxidFileName2.c_str(), "w"); - for (auto it = foundAcc2taxid.begin(); it != foundAcc2taxid.end(); ++it) { - fprintf(acc2taxidFile, "%s\t%d\n", it->first.c_str(), it->second); + for (auto it : newAcc2taxid) { + fprintf(acc2taxidFile, "%s\t%d\t%d\n", it.first.c_str(), it.second.first, it.second.second); } fclose(acc2taxidFile); @@ -256,7 +281,8 @@ void IndexCreator::splitFastaForProdigalTraining(int file_idx, TaxID speciesID) fastaList[file_idx].trainingSeqIdx = seqForTraining; } -void IndexCreator::load_accession2taxid(const string & mappingFileName, unordered_map & acc2taxid) { +TaxID IndexCreator::load_accession2taxid(const string & mappingFileName, unordered_map & acc2taxid) { + TaxID maxTaxID = 0; cerr << "Load mapping from accession ID to taxonomy ID ... " << flush; string eachLine; string eachItem; @@ -266,11 +292,15 @@ void IndexCreator::load_accession2taxid(const string & mappingFileName, unordere fscanf(mappingFile, "%*s\t%*s\t%*s\t%*s"); while (fscanf(mappingFile, "%s\t%*s\t%d\t%*d", buffer, &taxID) == 2 ){ acc2taxid[string(buffer)] = taxID; + if (taxID > maxTaxID) { + maxTaxID = taxID; + } } } else { cerr << "Cannot open file for mapping from accession to tax ID" << endl; } cerr << "Done" << endl; + return maxTaxID; } // This function sort the TargetKmerBuffer, do redundancy reducing task, write the differential index of them @@ -444,7 +474,8 @@ void IndexCreator::reduceRedundancy(TargetKmerBuffer & kmerBuffer, size_t * uniq break; } taxIds.push_back(taxIdList[kmerBuffer.buffer[i].info.sequenceID]); - hasSeenOtherStrains += (taxIdList[lookingKmer->info.sequenceID] != taxIdList[kmerBuffer.buffer[i].info.sequenceID]); + hasSeenOtherStrains += (taxonomy->taxonNode(taxIdList[lookingKmer->info.sequenceID])->parentTaxId + != taxonomy->taxonNode(taxIdList[kmerBuffer.buffer[i].info.sequenceID]) -> parentTaxId); i++; if(i == splits[split].end + 1){ endFlag = 1; @@ -569,7 +600,7 @@ void IndexCreator::splitSequenceFile(vector & seqSegments, Mmaped string IndexCreator::getSeqSegmentsWithHead(vector & seqSegments, const string & seqFileName, const unordered_map & acc2taxid, - unordered_map & foundAcc2taxid) { + vector>> & newAcc2taxid) { struct stat stat1{}; stat(seqFileName.c_str(), &stat1); size_t numOfChar = stat1.st_size; @@ -580,24 +611,33 @@ string IndexCreator::getSeqSegmentsWithHead(vector & seqSegments, size_t start = 0; size_t pos; vector seqSegmentsTmp; - vector headers; - size_t seqCnt = taxIdList.size(); + string accession; + string accession_version; + if (seqFile.is_open()) { getline(seqFile, firstLine, '\n'); + accession = firstLine.substr(1, firstLine.find('.') - 1); + accession_version = firstLine.substr(1, 
LocalUtil::getFirstWhiteSpacePos(firstLine) - 1); + newAcc2taxid.emplace_back(accession_version, make_pair(acc2taxid.at(accession), newTaxID)); + taxIdList.push_back(newTaxID++); // cout << firstLine << endl; - taxIdList.push_back(acc2taxid.at(firstLine.substr(1, firstLine.find('.') - 1))); - foundAcc2taxid[firstLine.substr(1, firstLine.find(' ') - 1)] = taxIdList.back(); + // taxIdList.push_back(acc2taxid.at(firstLine.substr(1, firstLine.find('.') - 1))); + // foundAcc2taxid[firstLine.substr(1, firstLine.find(' ') - 1)] = taxIdList.back(); while (getline(seqFile, eachLine, '\n')) { if (eachLine[0] == '>') { + accession = eachLine.substr(1, eachLine.find('.') - 1); + accession_version = eachLine.substr(1, LocalUtil::getFirstWhiteSpacePos(eachLine) - 1); + newAcc2taxid.emplace_back(accession_version, make_pair(acc2taxid.at(accession), newTaxID)); + taxIdList.push_back(newTaxID++); // cout << eachLine << endl; - taxIdList.push_back(acc2taxid.at(eachLine.substr(1, eachLine.find('.') - 1))); - foundAcc2taxid[eachLine.substr(1, eachLine.find(' ') - 1)] = taxIdList.back(); + // taxIdList.push_back(acc2taxid.at(eachLine.substr(1, eachLine.find('.') - 1))); + // foundAcc2taxid[eachLine.substr(1, eachLine.find(' ') - 1)] = taxIdList.back(); pos = (size_t) seqFile.tellg(); seqSegmentsTmp.emplace_back(start, pos - eachLine.length() - 3,pos - eachLine.length() - start - 2); start = pos - eachLine.length() - 1; } } - seqSegmentsTmp.emplace_back(start, numOfChar - 2, numOfChar - start - 1, seqCnt); + seqSegmentsTmp.emplace_back(start, numOfChar - 2, numOfChar - start - 1); } else { cerr << "Unable to open file: " << seqFileName << endl; } @@ -608,7 +648,7 @@ string IndexCreator::getSeqSegmentsWithHead(vector & seqSegments, void IndexCreator::getSeqSegmentsWithHead(vector & seqSegments, const char * seqFileName) { struct stat stat1{}; - int a = stat(seqFileName, &stat1); + stat(seqFileName, &stat1); size_t numOfChar = stat1.st_size; ifstream seqFile; @@ -858,3 +898,39 @@ void IndexCreator::writeDbParameters() { fprintf(handle, "Spaced_kmer_mask\t%s\n", spaceMask.c_str()); fclose(handle); } + +void IndexCreator::editTaxonomyDumpFiles(const vector>> & newAcc2taxid) { + // Edit names.dmp + string nameFileName = taxonomyDir + "/names.dmp"; + string newNameFileName = taxonomyDir + "/names.dmp.new"; + FileUtil::copyFile(nameFileName.c_str(), newNameFileName.c_str()); + FILE *nameFile = fopen(newNameFileName.c_str(), "a"); + if (nameFile == NULL) { + Debug(Debug::ERROR) << "Could not open " << newNameFileName << " for writing\n"; + EXIT(EXIT_FAILURE); + } + + for (size_t i = 0; i < newAcc2taxid.size() - 1; i++) { + fprintf(nameFile, "%d\t|\t%s\t|\t\t|\tscientific name\t|\n", newAcc2taxid[i].second.second, newAcc2taxid[i].first.c_str()); + } + fprintf(nameFile, "%d\t|\t%s\t|\t\t|\tscientific name\t|", newAcc2taxid.back().second.second, newAcc2taxid.back().first.c_str()); + fclose(nameFile); + + // Edit nodes.dmp + string nodeFileName = taxonomyDir + "/nodes.dmp"; + string newNodeFileName = taxonomyDir + "/nodes.dmp.new"; + FileUtil::copyFile(nodeFileName.c_str(), newNodeFileName.c_str()); + FILE *nodeFile = fopen(newNodeFileName.c_str(), "a"); + if (nodeFile == NULL) { + Debug(Debug::ERROR) << "Could not open " << newNodeFileName << " for writing\n"; + EXIT(EXIT_FAILURE); + } + + for (size_t i = 0; i < newAcc2taxid.size() - 1; i++) { + fprintf(nodeFile, "%d\t|\t%d\t|\t\t|\tscientific name\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\n", newAcc2taxid[i].second.second, newAcc2taxid[i].second.first); + } + 
fprintf(nodeFile, "%d\t|\t%d\t|\t\t|\tscientific name\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|", newAcc2taxid.back().second.second, newAcc2taxid.back().second.first); + fclose(nodeFile); + + // Edit node.dmp +} \ No newline at end of file diff --git a/src/commons/IndexCreator.h b/src/commons/IndexCreator.h index 8f343717..de31e394 100644 --- a/src/commons/IndexCreator.h +++ b/src/commons/IndexCreator.h @@ -18,6 +18,7 @@ #include "NucleotideMatrix.h" #include "SubstitutionMatrix.h" #include "tantan.h" +#include "LocalUtil.h" #ifdef OPENMP @@ -65,6 +66,7 @@ class IndexCreator{ vector sequences; }; + TaxID newTaxID; vector fastaList; vector taxIdList; vector processedSeqCnt; // Index of this vector is the same as the index of fnaList @@ -128,7 +130,10 @@ class IndexCreator{ } void load_assacc2taxid(const string & mappingFile, unordered_map & assacc2taxid); - static void load_accession2taxid(const string & mappingFile, unordered_map & assacc2taxid); + + static TaxID load_accession2taxid(const string & mappingFile, unordered_map & assacc2taxid); + + void editTaxonomyDumpFiles(const vector>> & newAcc2taxid); void reduceRedundancy(TargetKmerBuffer & kmerBuffer, size_t * uniqeKmerIdx, size_t & uniqKmerCnt, const LocalParameters & par); @@ -150,9 +155,11 @@ class IndexCreator{ public: static void splitSequenceFile(vector & seqSegments, MmapedData seqFile); - string getSeqSegmentsWithHead(vector & seqSegments, const string & seqFileName, + string getSeqSegmentsWithHead(vector & seqSegments, + const string & seqFileName, const unordered_map & acc2taxid, - unordered_map & foundAcc2taxid); + vector>> & newAcc2taxid); + static void getSeqSegmentsWithHead(vector & seqSegments, const char * seqFileName); IndexCreator(const LocalParameters & par); IndexCreator() {taxonomy = nullptr;} diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp index 5b352c5f..39b9a13d 100644 --- a/src/commons/KmerMatcher.cpp +++ b/src/commons/KmerMatcher.cpp @@ -160,15 +160,6 @@ int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer, MmapedData diffIdxSplits = mmapData(diffIdxSplitFileName.c_str(), 3); size_t numOfDiffIdx = FileUtil::getFileSize(targetDiffIdxFileName) / sizeof(uint16_t); - MmapedData tempInfos = mmapData(targetInfoFileName.c_str(), 3); - size_t numOfInfos = tempInfos.fileSize / sizeof(TargetKmerInfo); - - // Print kmer infos - for (size_t i = 0; i < numOfInfos; i++) { - cout << (int) tempInfos.data[i].sequenceID << " " << (int) tempInfos.data[i].redundancy << endl; - } - - size_t queryKmerNum = queryKmerBuffer->startIndexOfReserve; QueryKmer *queryKmerList = queryKmerBuffer->buffer; diff --git a/src/commons/LocalUtil.cpp b/src/commons/LocalUtil.cpp index 08e04508..d981f238 100644 --- a/src/commons/LocalUtil.cpp +++ b/src/commons/LocalUtil.cpp @@ -1,4 +1,5 @@ #include "LocalUtil.h" +#include std::string LocalUtil::getQueryBaseName(const std::string & queryPath) { @@ -19,7 +20,6 @@ std::string LocalUtil::getQueryBaseName(const std::string & queryPath) { } - void LocalUtil::splitQueryFile(std::vector & sequences, const std::string &queryPath) { KSeqWrapper* kseq = nullptr; kseq = KSeqFactory(queryPath.c_str()); @@ -41,4 +41,23 @@ int LocalUtil::getMaxCoveredLength(int queryLength) { } else { return queryLength - 3; // 3 } -} \ No newline at end of file +} + +int LocalUtil::getFirstWhiteSpacePos(const std::string &str) { + for (size_t i = 0; i < str.size(); ++i) { + if (isspace(int(str[i]))) { + return i; + } + } + return str.size(); +} + +// std::string LocalUtil::getAccessionFromHeader(const 
std::string &header) { +// int pos = getFirstWhiteSpacePos(header); +// std::string accession = header.substr(0, pos); +// std::vector splits = Util::split(accession, "."); +// if (splits.size() > 1) { +// accession = splits[0]; +// } +// return std::stoi(accession.substr(3)); +// } \ No newline at end of file diff --git a/src/commons/LocalUtil.h b/src/commons/LocalUtil.h index 0fcbdf82..7f511239 100644 --- a/src/commons/LocalUtil.h +++ b/src/commons/LocalUtil.h @@ -18,6 +18,10 @@ class LocalUtil : public Util { static void splitQueryFile(std::vector & seqSegments, const std::string & queryPath); static int getMaxCoveredLength(int queryLength) ; + + static int getFirstWhiteSpacePos(const std::string & str); + + // static std::string getAccessionFromHeader(const std::string & header); }; diff --git a/src/commons/SeqIterator.cpp b/src/commons/SeqIterator.cpp index 262d1258..525ac651 100644 --- a/src/commons/SeqIterator.cpp +++ b/src/commons/SeqIterator.cpp @@ -1,7 +1,3 @@ -// -// Created by KJB on 01/09/2020. -// - #include "SeqIterator.h" const string SeqIterator::atcg = "................................................................" diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index 85ef5339..58ee0722 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -250,7 +250,6 @@ TaxID Taxonomer::lowerRankClassification(vector &matches, pair uint8_t minHamming = matches[i].hamming; Match * minHammingMatch = & matches[i]; TaxID minHammingTaxId = minHammingMatch->targetId; - bool first = true; i --; while ( (i >= matchRange.first) && (currQuotient == matches[i].qInfo.pos / 3) ) { if (matches[i].hamming < minHamming) { From 585b2d8ce8b49cc07be1037829dd1d08faa66b65 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Tue, 29 Aug 2023 10:15:47 +0900 Subject: [PATCH 22/65] fix DB reproducibility problem --- src/commons/IndexCreator.cpp | 195 ++++++++++++++++++++++++++------ src/commons/IndexCreator.h | 10 ++ src/commons/LocalParameters.cpp | 12 +- src/commons/LocalParameters.h | 2 + src/commons/ProdigalWrapper.cpp | 65 +++++------ src/commons/Taxonomer.cpp | 1 - src/commons/common.cpp | 5 + 7 files changed, 221 insertions(+), 69 deletions(-) diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp index 318d3040..c8f8df7a 100644 --- a/src/commons/IndexCreator.cpp +++ b/src/commons/IndexCreator.cpp @@ -1,7 +1,9 @@ #include "IndexCreator.h" #include "FileUtil.h" #include "LocalUtil.h" +#include "ProdigalWrapper.h" #include +#include #include #include @@ -11,7 +13,8 @@ IndexCreator::IndexCreator(const LocalParameters & par) { bufferSize = par.bufferSize; reducedAA = par.reducedAA; spaceMask = par.spaceMask; - + accessionLevel = par.accessionLevel; + // Input files dbDir = par.filenames[0]; if (par.taxonomyPath.empty()) { @@ -29,10 +32,11 @@ IndexCreator::IndexCreator(const LocalParameters & par) { versionFileName = dbDir + "/db.version"; paramterFileName = dbDir + "/db.parameters"; - // Load taxonomy - // taxonomy = new NcbiTaxonomy(taxonomyDir + "/names.dmp", - // taxonomyDir + "/nodes.dmp", - // taxonomyDir + "/merged.dmp"); + if (!par.accessionLevel){ + taxonomy = new NcbiTaxonomy(taxonomyDir + "/names.dmp", + taxonomyDir + "/nodes.dmp", + taxonomyDir + "/merged.dmp"); + } if (par.reducedAA == 1){ MARKER = 0Xffffffff; @@ -55,7 +59,12 @@ IndexCreator::~IndexCreator() { void IndexCreator::createIndex(const LocalParameters &par) { // Read through FASTA files and make blocks of sequences to be processed by each thread - 
makeBlocksForParallelProcessing(); + if (par.accessionLevel) { + makeBlocksForParallelProcessing_accession_level(); + } else { + makeBlocksForParallelProcessing(); + } + cout << "Made blocks for each thread" << endl; // Write taxonomy id list @@ -81,16 +90,22 @@ void IndexCreator::createIndex(const LocalParameters &par) { // Extract Target k-mers fillTargetKmerBuffer(kmerBuffer, splitChecker, processedSplitCnt, par); - time_t start = time(nullptr); // Sort the k-mers + time_t start = time(nullptr); SORT_PARALLEL(kmerBuffer.buffer, kmerBuffer.buffer + kmerBuffer.startIndexOfReserve, - IndexCreator::compareForDiffIdx); + IndexCreator::compareForDiffIdx2); time_t sort = time(nullptr); cout << "Sort time: " << sort - start << endl; auto * uniqKmerIdx = new size_t[kmerBuffer.startIndexOfReserve + 1]; size_t uniqKmerCnt = 0; + // Print out the k-mers + string tmpFileName = dbDir + "/tmp"; + FILE * tmpFile = fopen(tmpFileName.c_str(), "wb"); + fwrite(kmerBuffer.buffer, sizeof(uint16_t), kmerBuffer.startIndexOfReserve, tmpFile); + fclose(tmpFile); + reduceRedundancy(kmerBuffer, uniqKmerIdx, uniqKmerCnt, par); time_t reduction = time(nullptr); cout<<"Time spent for reducing redundancy: "<<(double) (reduction - sort) << endl; @@ -157,7 +172,51 @@ void IndexCreator::updateIndex(const LocalParameters &par) { delete[] splitChecker; } -void IndexCreator::makeBlocksForParallelProcessing(){ + +void IndexCreator::makeBlocksForParallelProcessing() { + unordered_map acc2taxid; + load_accession2taxid(acc2taxidFileName, acc2taxid); + + // Make blocks of sequences that can be processed in parallel + int fileNum = getNumberOfLines(fnaListFileName); + fastaList.resize(fileNum); + + ifstream fnaListFile; + fnaListFile.open(fnaListFileName); + if (!fnaListFile.is_open()) { + Debug(Debug::ERROR) << "Cannot open file for file list" << "\n"; + EXIT(EXIT_FAILURE); + } + string eachFile; + string seqHeader; + + unordered_map foundAcc2taxid; + for (int i = 0; i < fileNum; ++i) { + // Get start and end position of each sequence in the file + getline(fnaListFile, eachFile); + fastaList[i].path = eachFile; + processedSeqCnt.push_back(taxIdList.size()); + seqHeader = getSeqSegmentsWithHead(fastaList[i].sequences, eachFile, acc2taxid, foundAcc2taxid); + seqHeader = seqHeader.substr(1, seqHeader.find('.') - 1); + TaxID speciesTaxid = taxonomy->getTaxIdAtRank(acc2taxid[seqHeader], "species"); + + // Split current file into blocks for parallel processing + splitFastaForProdigalTraining(i, speciesTaxid); + fastaList[i].speciesID = speciesTaxid; + } + fnaListFile.close(); + + // Write accession to taxid map to file + string acc2taxidFileName2 = dbDir + "/acc2taxid.map"; + FILE * acc2taxidFile = fopen(acc2taxidFileName2.c_str(), "w"); + for (auto it = foundAcc2taxid.begin(); it != foundAcc2taxid.end(); ++it) { + fprintf(acc2taxidFile, "%s\t%d\n", it->first.c_str(), it->second); + } + fclose(acc2taxidFile); + +} + +void IndexCreator::makeBlocksForParallelProcessing_accession_level() { unordered_map acc2taxid; TaxID maxTaxID = load_accession2taxid(acc2taxidFileName, acc2taxid); @@ -187,7 +246,6 @@ void IndexCreator::makeBlocksForParallelProcessing(){ fastaList[i].path = eachFile; processedSeqCnt.push_back(taxIdList.size()); - seqHeader = getSeqSegmentsWithHead(fastaList[i].sequences, eachFile, acc2taxid, newAcc2taxid); // accession_version = seqHeader.substr(1, seqHeader.find('.') - 1); accession = seqHeader.substr(1, seqHeader.find('.') - 1); @@ -453,7 +511,7 @@ void IndexCreator::reduceRedundancy(TargetKmerBuffer & kmerBuffer, 
size_t * uniq idxOfEachSplit[i] = new size_t[splits[i].end - splits[i].offset + 2]; cntOfEachSplit[i] = 0; } -#pragma omp parallel default(none), shared(kmerBuffer, idxOfEachSplit, cntOfEachSplit, splits) +#pragma omp parallel default(none), shared(kmerBuffer, idxOfEachSplit, cntOfEachSplit, splits, par) { TargetKmer * lookingKmer; size_t lookingIndex; @@ -474,8 +532,12 @@ void IndexCreator::reduceRedundancy(TargetKmerBuffer & kmerBuffer, size_t * uniq break; } taxIds.push_back(taxIdList[kmerBuffer.buffer[i].info.sequenceID]); - hasSeenOtherStrains += (taxonomy->taxonNode(taxIdList[lookingKmer->info.sequenceID])->parentTaxId + if (par.accessionLevel) { + hasSeenOtherStrains += (taxonomy->taxonNode(taxIdList[lookingKmer->info.sequenceID])->parentTaxId != taxonomy->taxonNode(taxIdList[kmerBuffer.buffer[i].info.sequenceID]) -> parentTaxId); + } else { + hasSeenOtherStrains += (taxIdList[lookingKmer->info.sequenceID] != taxIdList[kmerBuffer.buffer[i].info.sequenceID]); + } i++; if(i == splits[split].end + 1){ endFlag = 1; @@ -583,9 +645,30 @@ int IndexCreator::getNumOfFlush() } inline bool IndexCreator::compareForDiffIdx(const TargetKmer & a, const TargetKmer & b){ - return a.ADkmer < b.ADkmer || (a.ADkmer == b.ADkmer && a.taxIdAtRank < b.taxIdAtRank); + if (a.ADkmer != b.ADkmer) { + return a.ADkmer < b.ADkmer; + } + return a.taxIdAtRank < b.taxIdAtRank; } +inline bool IndexCreator::compareForDiffIdx2(const TargetKmer & a, const TargetKmer & b){ + if (a.ADkmer != b.ADkmer) { + return a.ADkmer < b.ADkmer; + } + + if (a.taxIdAtRank != b.taxIdAtRank) { + return a.taxIdAtRank < b.taxIdAtRank; + } + + if (a.info.sequenceID != b.info.sequenceID) { + return a.info.sequenceID < b.info.sequenceID; + } + + return a.info.redundancy < b.info.redundancy; +} + + + void IndexCreator::splitSequenceFile(vector & seqSegments, MmapedData seqFile) { size_t start = 0; size_t numOfChar = seqFile.fileSize / sizeof(char); @@ -598,7 +681,8 @@ void IndexCreator::splitSequenceFile(vector & seqSegments, Mmaped seqSegments.emplace_back(start, numOfChar - 2, numOfChar - start - 1); } -string IndexCreator::getSeqSegmentsWithHead(vector & seqSegments, const string & seqFileName, +string IndexCreator::getSeqSegmentsWithHead(vector & seqSegments, + const string & seqFileName, const unordered_map & acc2taxid, vector>> & newAcc2taxid) { struct stat stat1{}; @@ -646,6 +730,46 @@ string IndexCreator::getSeqSegmentsWithHead(vector & seqSegments, return firstLine; } +string IndexCreator::getSeqSegmentsWithHead(vector & seqSegments, + const string & seqFileName, + const unordered_map & acc2taxid, + unordered_map & foundAcc2taxid) { + struct stat stat1{}; + stat(seqFileName.c_str(), &stat1); + size_t numOfChar = stat1.st_size; + string firstLine; + ifstream seqFile; + seqFile.open(seqFileName); + string eachLine; + size_t start = 0; + size_t pos; + vector seqSegmentsTmp; + vector headers; + size_t seqCnt = taxIdList.size(); + if (seqFile.is_open()) { + getline(seqFile, firstLine, '\n'); +// cout << firstLine << endl; + taxIdList.push_back(acc2taxid.at(firstLine.substr(1, firstLine.find('.') - 1))); + foundAcc2taxid[firstLine.substr(1, firstLine.find(' ') - 1)] = taxIdList.back(); + while (getline(seqFile, eachLine, '\n')) { + if (eachLine[0] == '>') { +// cout << eachLine << endl; + taxIdList.push_back(acc2taxid.at(eachLine.substr(1, eachLine.find('.') - 1))); + foundAcc2taxid[eachLine.substr(1, eachLine.find(' ') - 1)] = taxIdList.back(); + pos = (size_t) seqFile.tellg(); + seqSegmentsTmp.emplace_back(start, pos - eachLine.length() - 
3,pos - eachLine.length() - start - 2); + start = pos - eachLine.length() - 1; + } + } + seqSegmentsTmp.emplace_back(start, numOfChar - 2, numOfChar - start - 1, seqCnt); + } else { + cerr << "Unable to open file: " << seqFileName << endl; + } + seqFile.close(); + seqSegments = std::move(seqSegmentsTmp); + return firstLine; +} + void IndexCreator::getSeqSegmentsWithHead(vector & seqSegments, const char * seqFileName) { struct stat stat1{}; stat(seqFileName, &stat1); @@ -701,7 +825,7 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer, #pragma omp parallel default(none), shared(kmerBuffer, checker, processedSplitCnt, hasOverflow, par, cout) { ProbabilityMatrix probMatrix(*subMat); - ProdigalWrapper prodigal; + // ProdigalWrapper prodigal; SeqIterator seqIterator(par); size_t posToWrite; size_t orfNum; @@ -727,10 +851,12 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer, for (size_t p = 0; p < fnaSplits[i].cnt; p++) { totalLength += fastaList[fnaSplits[i].file_idx].sequences[fnaSplits[i].offset + p].length; } - size_t estimatedKmerCnt = (totalLength + totalLength / 1000) / 3; + + size_t estimatedKmerCnt = (totalLength + totalLength / 10) / 3; // Process current split if buffer has enough space. posToWrite = kmerBuffer.reserveMemory(estimatedKmerCnt); + ProdigalWrapper * prodigal = new ProdigalWrapper(); if (posToWrite + estimatedKmerCnt < kmerBuffer.bufferSize) { // MMap FASTA file of current split struct MmapedData fastaFile = mmapData(fastaList[fnaSplits[i].file_idx].path.c_str()); @@ -742,16 +868,15 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer, lengthOfTrainingSeq = seq->seq.l; cout << "T: " << seq->name.s << " " << lengthOfTrainingSeq << " " << estimatedKmerCnt << endl; - // Train prodigal. - prodigal.is_meta = 0; + // Train prodigal + prodigal->is_meta = 0; if (lengthOfTrainingSeq < 100'000) { - prodigal.is_meta = 1; - prodigal.trainMeta(seq->seq.s); + prodigal->is_meta = 1; + prodigal->trainMeta(seq->seq.s); } else { - prodigal.trainASpecies(seq->seq.s); + prodigal->trainASpecies(seq->seq.s); } - // // Load training information // int read_check = read_training_file(const_cast((par.tinfoPath + to_string(fnaSplits[i].speciesID) + ".tinfo").c_str()), // prodigal.getTrainingInfo()); @@ -761,9 +886,9 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer, // } // Generate intergenic 23-mer list. It is used to determine extension direction of intergenic sequences. 
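                    // (These intergenic 23-mers are consumed by getExtendedORFs()
                    // below, which uses them to decide the direction in which
                    // predicted genes are extended into intergenic regions.)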
- prodigal.getPredictedGenes(seq->seq.s); - seqIterator.generateIntergenicKmerList(prodigal.genes, prodigal.nodes, - prodigal.getNumberOfPredictedGenes(), + prodigal->getPredictedGenes(seq->seq.s); + seqIterator.generateIntergenicKmerList(prodigal->genes, prodigal->nodes, + prodigal->getNumberOfPredictedGenes(), intergenicKmers,seq->seq.s); // Get min k-mer hash list for determining strandness @@ -786,10 +911,10 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer, if (seqIterator.compareMinHashList(standardList, currentList, lengthOfTrainingSeq, // Forward strlen(seq->seq.s))) { // Get extended ORFs - prodigal.getPredictedGenes(seq->seq.s); - prodigal.removeCompletelyOverlappingGenes(); - seqIterator.getExtendedORFs(prodigal.finalGenes, prodigal.nodes, extendedORFs, - prodigal.fng, strlen(seq->seq.s), + prodigal->getPredictedGenes(seq->seq.s); + prodigal->removeCompletelyOverlappingGenes(); + seqIterator.getExtendedORFs(prodigal->finalGenes, prodigal->nodes, extendedORFs, + prodigal->fng, strlen(seq->seq.s), orfNum, intergenicKmers, seq->seq.s); // Get masked sequence char *maskedSeq = nullptr; @@ -821,10 +946,10 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer, reverseCompliment = seqIterator.reverseCompliment(seq->seq.s, seq->seq.l); // Get extended ORFs - prodigal.getPredictedGenes(reverseCompliment); - prodigal.removeCompletelyOverlappingGenes(); - seqIterator.getExtendedORFs(prodigal.finalGenes, prodigal.nodes, extendedORFs, - prodigal.fng, strlen(reverseCompliment), + prodigal->getPredictedGenes(reverseCompliment); + prodigal->removeCompletelyOverlappingGenes(); + seqIterator.getExtendedORFs(prodigal->finalGenes, prodigal->nodes, extendedORFs, + prodigal->fng, strlen(reverseCompliment), orfNum, intergenicKmers, reverseCompliment); // Get masked sequence @@ -868,6 +993,9 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer, __sync_fetch_and_add(&hasOverflow, 1); __sync_fetch_and_sub(&kmerBuffer.startIndexOfReserve, estimatedKmerCnt); } + cout << totalLength << " " << prodigal->fng << endl; + delete prodigal; + } } } @@ -896,6 +1024,7 @@ void IndexCreator::writeDbParameters() { } fprintf(handle, "Reduced_alphabet\t%d\n", reducedAA); fprintf(handle, "Spaced_kmer_mask\t%s\n", spaceMask.c_str()); + fprintf(handle, "Accession_level\t%d\n", accessionLevel); fclose(handle); } diff --git a/src/commons/IndexCreator.h b/src/commons/IndexCreator.h index de31e394..bee95f8f 100644 --- a/src/commons/IndexCreator.h +++ b/src/commons/IndexCreator.h @@ -45,6 +45,7 @@ class IndexCreator{ size_t bufferSize; int reducedAA; string spaceMask; + int accessionLevel; // Inputs NcbiTaxonomy * taxonomy; @@ -112,6 +113,8 @@ class IndexCreator{ void writeDbParameters(); static bool compareForDiffIdx(const TargetKmer & a, const TargetKmer & b); + + static bool compareForDiffIdx2(const TargetKmer & a, const TargetKmer & b); // void maskLowComplexityRegions(char * seq, char * maskedSeq, ProbabilityMatrix & probMat, // const LocalParameters & par); @@ -123,6 +126,8 @@ class IndexCreator{ void makeBlocksForParallelProcessing(); + void makeBlocksForParallelProcessing_accession_level(); + void splitFastaForProdigalTraining(int file_idx, TaxID speciesID); void unzipAndList(const string & folder, const string & fastaList_fname){ @@ -160,6 +165,11 @@ class IndexCreator{ const unordered_map & acc2taxid, vector>> & newAcc2taxid); + string getSeqSegmentsWithHead(vector & seqSegments, + const string & seqFileName, + const unordered_map & acc2taxid, + 
unordered_map & foundAcc2taxid); + static void getSeqSegmentsWithHead(vector & seqSegments, const char * seqFileName); IndexCreator(const LocalParameters & par); IndexCreator() {taxonomy = nullptr;} diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index c5985ccf..f38cd6a1 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -174,6 +174,13 @@ LocalParameters::LocalParameters() : typeid(size_t), (void *) &bufferSize, "^[0-9]+$"), + ACCESSION_LEVEL(ACCESSION_LEVEL_ID, + "--accession-level", + "Build a database for accession level classification", + "Build a database for accession level classification", + typeid(int), + (void *) &accessionLevel, + "[0-1]"), TEST_RANK(TEST_RANK_ID, "--test-rank", ".", @@ -249,6 +256,7 @@ LocalParameters::LocalParameters() : build.push_back(&PARAM_MASK_PROBABILTY); build.push_back(&PARAM_MASK_RESIDUES); build.push_back(&BUFFER_SIZE); + build.push_back(&ACCESSION_LEVEL); //classify classify.push_back(&PARAM_THREADS); @@ -271,6 +279,7 @@ LocalParameters::LocalParameters() : classify.push_back(&PARAM_MASK_RESIDUES); classify.push_back(&PARAM_MASK_PROBABILTY); classify.push_back(&MATCH_PER_KMER); + classify.push_back(&ACCESSION_LEVEL); // filter filter.push_back(&PARAM_THREADS); @@ -295,7 +304,8 @@ LocalParameters::LocalParameters() : filter.push_back(&MATCH_PER_KMER); filter.push_back(&PRINT_MODE); filter.push_back(&CONTAM_LIST); - + filter.push_back(&ACCESSION_LEVEL); + //updateTargetDB exclusiontest_hiv.push_back(&TEST_RANK); diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index 2a92115a..2d75912a 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -62,6 +62,7 @@ class LocalParameters : public Parameters { PARAMETER(IS_ASSEMBLY) PARAMETER(SPLIT_NUM) PARAMETER(BUFFER_SIZE) + PARAMETER(ACCESSION_LEVEL) // Test parameters PARAMETER(TEST_RANK) @@ -105,6 +106,7 @@ class LocalParameters : public Parameters { std::string taxonomyPath; int splitNum; size_t bufferSize; + int accessionLevel; // Test parameters std::string testRank; diff --git a/src/commons/ProdigalWrapper.cpp b/src/commons/ProdigalWrapper.cpp index 901eb276..7b6e39d2 100644 --- a/src/commons/ProdigalWrapper.cpp +++ b/src/commons/ProdigalWrapper.cpp @@ -52,28 +52,14 @@ ProdigalWrapper::ProdigalWrapper() { void ProdigalWrapper:: trainASpecies(char * genome){ - memset(seq, 0, (slen / 4 + 1) * sizeof(unsigned char)); - memset(rseq, 0, (slen / 4 + 1) * sizeof(unsigned char)); - memset(useq, 0, (slen / 8 + 1) * sizeof(unsigned char)); - memset(nodes, 0, nn * sizeof(struct _node)); + // Initialize training information + memset(mlist, 0, MAX_MASKS*sizeof(mask)); memset(&tinf, 0, sizeof(struct _training)); - nn = 0; slen = 0; ipath = 0; nmask = 0; tinf.st_wt = 4.35; tinf.trans_table = 11; slen = getNextSeq(genome, 1); -// if(slen == 0) { -// fprintf(stderr, "\n\nSequence read failed (file must be Fasta, "); -// fprintf(stderr, "Genbank, or EMBL format).\n\n"); -// exit(9); -// } -// if(slen < IDEAL_SINGLE_GENOME) { -// fprintf(stderr, "\n\nWarning: ideally Prodigal should be given at"); -// fprintf(stderr, " least %d bases for ", IDEAL_SINGLE_GENOME); -// fprintf(stderr, "training.\nYou may get better results with the "); -// fprintf(stderr, "-p meta option.\n\n"); -// } rcom_seq(seq, rseq, useq, slen); /*********************************************************************** @@ -82,10 +68,11 @@ trainASpecies(char * genome){ ***********************************************************************/ 
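  /* Note: realloc() does not zero the newly grown region, and this node buffer
     is reused across genomes, so it is cleared with memset() right after
     growth; otherwise stale node data from a previously processed genome could
     make training results depend on the processing order. */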
if(slen > max_slen && slen > STT_NOD*8) { nodes = (struct _node *)realloc(nodes, (int)(slen/8)*sizeof(struct _node)); -// if(nodes == NULL) { -// fprintf(stderr, "Realloc failed on nodes\n\n"); -// exit(11); -// } + if(nodes == NULL) { + fprintf(stderr, "Realloc failed on nodes\n\n"); + exit(11); + } + memset(nodes, 0, (int)(slen/8)*sizeof(struct _node)); max_slen = slen; } nn = add_nodes(seq, rseq, slen, nodes, closed, mlist, nmask, &tinf); @@ -130,18 +117,21 @@ trainASpecies(char * genome){ determine_sd_usage(&tinf); if(force_nonsd == 1) tinf.uses_sd = 0; if(tinf.uses_sd == 0) train_starts_nonsd(seq, rseq, slen, nodes, nn, &tinf); -} -void ProdigalWrapper::trainMeta(char *genome) { + // Initialize memories to reuse them memset(seq, 0, (slen / 4 + 1) * sizeof(unsigned char)); memset(rseq, 0, (slen / 4 + 1) * sizeof(unsigned char)); memset(useq, 0, (slen / 8 + 1) * sizeof(unsigned char)); - memset(nodes, 0, nn*sizeof(struct _node)); + memset(nodes, 0, nn * sizeof(struct _node)); + nn = 0; slen = 0; ipath = 0; nmask = 0; +} + +void ProdigalWrapper::trainMeta(char *genome) { + // Initialize training information memset(&tinf, 0, sizeof(struct _training)); tinf.st_wt = 4.35; tinf.trans_table = 11; - nn = 0; slen = 0; ipath = 0; nmask = 0; - + initialize_metagenomic_bins(meta); slen = getNextSeq(genome, 1); @@ -154,6 +144,7 @@ void ProdigalWrapper::trainMeta(char *genome) { fprintf(stderr, "Realloc failed on nodes\n\n"); exit(11); } + memset(nodes, 0, (int)(slen/8)*sizeof(struct _node)); max_slen = slen; } @@ -182,13 +173,15 @@ void ProdigalWrapper::trainMeta(char *genome) { max_score = nodes[ipath].score; } } -} -void ProdigalWrapper::getPredictedGenes(char * genome){ + + // Initialize memories to reuse them memset(seq, 0, (slen / 4 + 1) * sizeof(unsigned char)); memset(rseq, 0, (slen / 4 + 1) * sizeof(unsigned char)); memset(useq, 0, (slen / 8 + 1) * sizeof(unsigned char)); - memset(nodes, 0, nn*sizeof(struct _node)); - nn = 0; slen = 0; nmask = 0; ipath=0; + memset(nodes, 0, nn * sizeof(struct _node)); + nn = 0; slen = 0; ipath = 0; nmask = 0; +} +void ProdigalWrapper::getPredictedGenes(char * genome){ /* Initialize structure */ slen = getNextSeq(genome, 0); @@ -211,7 +204,6 @@ void ProdigalWrapper::getPredictedGenes(char * genome){ } if(is_meta == 0) { - ipath = 0; /*********************************************************************** Find all the potential starts and stops, sort them, and create comprehensive list of nodes for dynamic programming. @@ -235,9 +227,8 @@ void ProdigalWrapper::getPredictedGenes(char * genome){ } else{ - /// metagenomic version - fprintf(stderr, "Request: Metagenomic, Phase: Gene Finding\n"); - + /// Metagenomic version + nn = add_nodes(seq, rseq, slen, nodes, closed, mlist, nmask, meta[max_phase].tinf); qsort(nodes, nn, sizeof(struct _node), &compare_nodes); @@ -250,7 +241,13 @@ void ProdigalWrapper::getPredictedGenes(char * genome){ tweak_final_starts(genes, ng, nodes, nn, meta[max_phase].tinf); record_gene_data(genes, ng, nodes, meta[max_phase].tinf, num_seq); } -// fprintf(stderr, "done! 
gene count: %d (%d bp)\n", ng, slen); + + // Initialize memories to reuse them + memset(seq, 0, (slen / 4 + 1) * sizeof(unsigned char)); + memset(rseq, 0, (slen / 4 + 1) * sizeof(unsigned char)); + memset(useq, 0, (slen / 8 + 1) * sizeof(unsigned char)); + memset(nodes, 0, nn*sizeof(struct _node)); + nn = 0; slen = 0; nmask = 0; ipath=0; } int ProdigalWrapper::getNextSeq(char * line, int training) { diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index 58ee0722..1118f618 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -220,7 +220,6 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, queryList[currentQuery].taxCnt[genusMatches[i].targetId]++; } - // Store classification results queryList[currentQuery].isClassified = true; queryList[currentQuery].classification = result; diff --git a/src/commons/common.cpp b/src/commons/common.cpp index 92d7c735..f5dde1cd 100644 --- a/src/commons/common.cpp +++ b/src/commons/common.cpp @@ -88,6 +88,11 @@ int loadDbParameters(LocalParameters &par) { par.reducedAA = stoi(tokens[1]); } else if (tokens[0] == "Spaced_kmer_mask") { par.spaceMask = tokens[1]; + } else if (tokens[0] == "Accession_level") { + if (tokens[1] == "0" && par.accessionLevel == 1){ + par.accessionLevel = 0; + cerr << "Warning: Current DB doesn't support accession-level classification." << endl; + } } } return 1; From 8460b558bc45bdfd72b0a7099816fa6190f40174 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Tue, 29 Aug 2023 17:14:28 +0900 Subject: [PATCH 23/65] activate --accession-level option for 'classify' --- src/commons/IndexCreator.cpp | 9 ++------- src/commons/Taxonomer.cpp | 38 +++++++++++++++++++++++++++++++++++- src/commons/Taxonomer.h | 1 + src/commons/common.cpp | 3 +++ src/workflow/build.cpp | 1 + src/workflow/classify.cpp | 1 + src/workflow/filter.cpp | 1 + 7 files changed, 46 insertions(+), 8 deletions(-) diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp index c8f8df7a..ab4baf9a 100644 --- a/src/commons/IndexCreator.cpp +++ b/src/commons/IndexCreator.cpp @@ -94,18 +94,13 @@ void IndexCreator::createIndex(const LocalParameters &par) { // Sort the k-mers time_t start = time(nullptr); SORT_PARALLEL(kmerBuffer.buffer, kmerBuffer.buffer + kmerBuffer.startIndexOfReserve, - IndexCreator::compareForDiffIdx2); + IndexCreator::compareForDiffIdx); time_t sort = time(nullptr); cout << "Sort time: " << sort - start << endl; auto * uniqKmerIdx = new size_t[kmerBuffer.startIndexOfReserve + 1]; size_t uniqKmerCnt = 0; - // Print out the k-mers - string tmpFileName = dbDir + "/tmp"; - FILE * tmpFile = fopen(tmpFileName.c_str(), "wb"); - fwrite(kmerBuffer.buffer, sizeof(uint16_t), kmerBuffer.startIndexOfReserve, tmpFile); - fclose(tmpFile); - + // Reduce redundancy reduceRedundancy(kmerBuffer, uniqKmerIdx, uniqKmerCnt, par); time_t reduction = time(nullptr); cout<<"Time spent for reducing redundancy: "<<(double) (reduction - sort) << endl; diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index 1118f618..ad5505ba 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -1,4 +1,6 @@ #include "Taxonomer.h" +#include "NcbiTaxonomy.h" +#include Taxonomer::Taxonomer(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxonomy(taxonomy) { @@ -15,6 +17,7 @@ Taxonomer::Taxonomer(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxon delete[] mask; maxGap = par.maxGap; minCoveredPos = par.minCoveredPos; + accessionLevel = par.accessionLevel; } Taxonomer::~Taxonomer() { @@ -262,13 +265,42 @@ 
TaxID Taxonomer::lowerRankClassification(vector &matches, pair } i--; } + // if (accessionLevel == 2) { + // if (taxonomy->taxonNode(minHammingTaxId).) { + // minHammingTaxId = taxonomy->taxonNode(minHammingTaxId)->parentTaxId; + // } + // } taxCnt[minHammingTaxId]++; } unordered_map cladeCnt; getSpeciesCladeCounts(taxCnt, cladeCnt, spTaxId); - return BFS(cladeCnt, spTaxId); + if (accessionLevel == 2) { + unordered_map trimmedCladeCnt; + // Remove leaf nodes + for (auto it = cladeCnt.begin(); it != cladeCnt.end(); it++) { + TaxonNode const * taxon = taxonomy->taxonNode(it->first); + if (strcmp(taxonomy->getString(taxon->rankIdx), "") == 0) { + // trimmedCladeCnt[it->first] = it->second; + cladeCnt[taxon->parentTaxId].children.clear(); + } + // if (strcmp(taxonomy->getString(taxonomy->taxonNode(it->first)->rankIdx), "") != 0) { + // trimmedCladeCnt[it->first] = it->second; + // } else { + // cout << it->first << endl; + // } + + // if (!it->second.children.empty() || it->first == spTaxId) { + // trimmedCladeCnt[it->first] = it->second; + // } else if (it->second.children.empty()) { + // cout << it->first << endl; + // } + } + return BFS(cladeCnt, spTaxId); + } else { + return BFS(cladeCnt, spTaxId); + } } void Taxonomer::getSpeciesCladeCounts(const unordered_map &taxCnt, @@ -296,6 +328,10 @@ TaxID Taxonomer::BFS(const unordered_map & cladeCnt, TaxID r if (cladeCnt.at(root).children.empty()) { // root is a leaf return root; } + if (cladeCnt.find(cladeCnt.at(root).children[0]) == cladeCnt.end()) { // its children are trimmed + // cout << cladeCnt.at(root).children[0] << endl; + return root; + } unsigned int maxCnt = 3; unsigned int currentCnt; vector bestChildren; diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h index d7de78ac..6c597dd2 100644 --- a/src/commons/Taxonomer.h +++ b/src/commons/Taxonomer.h @@ -30,6 +30,7 @@ class Taxonomer { // Parameters int maxGap; int minCoveredPos; + int accessionLevel; struct MatchBlock { MatchBlock(size_t start, size_t end, int id) : start(start), end(end), id(id) {} diff --git a/src/commons/common.cpp b/src/commons/common.cpp index f5dde1cd..9f54ac75 100644 --- a/src/commons/common.cpp +++ b/src/commons/common.cpp @@ -93,6 +93,9 @@ int loadDbParameters(LocalParameters &par) { par.accessionLevel = 0; cerr << "Warning: Current DB doesn't support accession-level classification." 
<< endl;
             }
+            if (tokens[1] == "1" && par.accessionLevel == 0){
+                par.accessionLevel = 2;
+            }
         }
     }
     return 1;
diff --git a/src/workflow/build.cpp b/src/workflow/build.cpp
index b5f9e571..eaf0c367 100644
--- a/src/workflow/build.cpp
+++ b/src/workflow/build.cpp
@@ -12,6 +12,7 @@ void setDefaults_build(LocalParameters & par){
     par.maskProb = 0.9;
     par.maskMode = 1;
     par.bufferSize = 1'000'000'000;
+    par.accessionLevel = 0;
 }

 int build(int argc, const char **argv, const Command &command){
diff --git a/src/workflow/classify.cpp b/src/workflow/classify.cpp
index 514f6ae8..91977368 100644
--- a/src/workflow/classify.cpp
+++ b/src/workflow/classify.cpp
@@ -24,6 +24,7 @@ void setClassifyDefaults(LocalParameters & par){
     par.maskMode = 0;
     par.maskProb = 0.9;
     par.matchPerKmer = 4;
+    par.accessionLevel = 0;
 }

 int classify(int argc, const char **argv, const Command& command)
diff --git a/src/workflow/filter.cpp b/src/workflow/filter.cpp
index 3d0b0448..79bac439 100644
--- a/src/workflow/filter.cpp
+++ b/src/workflow/filter.cpp
@@ -23,6 +23,7 @@ void setFilterDefaults(LocalParameters & par) {
     par.matchPerKmer = 4;
     par.printMode = 1;
     par.contamList = ""; // TODO: set default
+    par.accessionLevel = 0;
 }

 int filter(int argc, const char **argv, const Command& command) {

From 50768b506d909f3e8316e9f49615221de91aa0ef Mon Sep 17 00:00:00 2001
From: Jaebeom Kim
Date: Thu, 31 Aug 2023 13:51:50 +0900
Subject: [PATCH 24/65] accession-level DB + turning off accession-level in classify == not accession-level DB

---
 src/commons/Taxonomer.cpp | 30 +++++-------------------------
 1 file changed, 5 insertions(+), 25 deletions(-)
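A minimal sketch of the combined effect of this commit and the previous one, assuming the flag plumbing shown in the diffs (the helper name is hypothetical; the decision itself mirrors loadDbParameters() in common.cpp and the accessionLevel checks in Taxonomer.cpp):

```cpp
// dbLevel  : "Accession_level" entry read from db.parameters
// reqLevel : value of --accession-level passed by the user
int effectiveAccessionLevel(int dbLevel, int reqLevel) {
    if (dbLevel == 0 && reqLevel == 1) return 0; // DB lacks accession info; warn and disable
    if (dbLevel == 1 && reqLevel == 0) return 2; // accession-level DB with the feature off:
                                                 // accession leaves are trimmed before LCA
    return reqLevel;                             // otherwise use the requested mode as-is
}
```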
diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp
index ad5505ba..663ebba4 100644
--- a/src/commons/Taxonomer.cpp
+++ b/src/commons/Taxonomer.cpp
@@ -265,37 +265,22 @@ TaxID Taxonomer::lowerRankClassification(vector &matches, pair
         }
         i--;
     }
-        // if (accessionLevel == 2) {
-        //     if (taxonomy->taxonNode(minHammingTaxId).) {
-        //         minHammingTaxId = taxonomy->taxonNode(minHammingTaxId)->parentTaxId;
-        //     }
-        // }
         taxCnt[minHammingTaxId]++;
     }
     unordered_map cladeCnt;
     getSpeciesCladeCounts(taxCnt, cladeCnt, spTaxId);
-    if (accessionLevel == 2) {
-        unordered_map trimmedCladeCnt;
+    if (accessionLevel == 2) { // Don't do accession-level classification
         // Remove leaf nodes
         for (auto it = cladeCnt.begin(); it != cladeCnt.end(); it++) {
             TaxonNode const * taxon = taxonomy->taxonNode(it->first);
             if (strcmp(taxonomy->getString(taxon->rankIdx), "") == 0) {
-                // trimmedCladeCnt[it->first] = it->second;
-                cladeCnt[taxon->parentTaxId].children.clear();
+                // Remove current node from its parent's children list
+                cladeCnt[taxon->parentTaxId].children.erase(find(cladeCnt[taxon->parentTaxId].children.begin(),
+                                                                 cladeCnt[taxon->parentTaxId].children.end(),
+                                                                 it->first));
             }
-            // if (strcmp(taxonomy->getString(taxonomy->taxonNode(it->first)->rankIdx), "") != 0) {
-            //     trimmedCladeCnt[it->first] = it->second;
-            // } else {
-            //     cout << it->first << endl;
-            // }
-
-            // if (!it->second.children.empty() || it->first == spTaxId) {
-            //     trimmedCladeCnt[it->first] = it->second;
-            // } else if (it->second.children.empty()) {
-            //     cout << it->first << endl;
-            // }
         }
         return BFS(cladeCnt, spTaxId);
     } else {
         return BFS(cladeCnt, spTaxId);
@@ -328,10 +313,6 @@ TaxID Taxonomer::BFS(const unordered_map & cladeCnt, TaxID r
     if (cladeCnt.at(root).children.empty()) { // root is a leaf
         return root;
     }
-    if (cladeCnt.find(cladeCnt.at(root).children[0]) == cladeCnt.end()) { // its children are trimmed
-        // cout << cladeCnt.at(root).children[0] << endl;
-        return root;
-    }
     unsigned int maxCnt = 3;
     unsigned int currentCnt;
     vector bestChildren;
@@ -1025,7 +1006,6 @@ TaxonScore Taxonomer::chooseSpecies(const vector &matches,
     // Score each species
     std::unordered_map speciesScores;
-    size_t i = 0;
     TaxID currentSpeices;
     size_t numOfMatch = matches.size();

From 6dda6d20ed5c84a65de1eec36fede000bd7556d4 Mon Sep 17 00:00:00 2001
From: Jaebeom Kim
Date: Fri, 8 Sep 2023 16:37:45 +0900
Subject: [PATCH 25/65] some minor code fixes

---
 src/commons/IndexCreator.cpp    |  8 +++++---
 src/workflow/add_to_library.cpp |  8 +++++---
 util/prepare_gtdb_taxonomy.sh   | 20 +++++++++++++-------
 3 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp
index ab4baf9a..aded0b45 100644
--- a/src/commons/IndexCreator.cpp
+++ b/src/commons/IndexCreator.cpp
@@ -340,11 +340,13 @@ TaxID IndexCreator::load_accession2taxid(const string & mappingFileName, unorder
     string eachLine;
     string eachItem;
     if (FILE * mappingFile = fopen(mappingFileName.c_str(), "r")) {
-        char buffer[512];
+        char accession[2048];
+        char accession_version[2048];
         int taxID;
         fscanf(mappingFile, "%*s\t%*s\t%*s\t%*s");
-        while (fscanf(mappingFile, "%s\t%*s\t%d\t%*d", buffer, &taxID) == 2 ){
-            acc2taxid[string(buffer)] = taxID;
+        while (fscanf(mappingFile, "%s\t%s\t%d\t%*d", accession, accession_version, &taxID) == 2 ){
+            acc2taxid[string(accession_version)] = taxID;
+            acc2taxid[string(accession)] = taxID;
             if (taxID > maxTaxID) {
                 maxTaxID = taxID;
             }
diff --git a/src/workflow/add_to_library.cpp b/src/workflow/add_to_library.cpp
index e5f97eba..8abb3ff2 100644
--- a/src/workflow/add_to_library.cpp
+++ b/src/workflow/add_to_library.cpp
@@ -56,11 +56,13 @@ int addToLibrary(int argc, const char **argv, const Command &command){
     unordered_map acc2taxid;
     string eachItem;
     if (FILE *mappingFile = fopen(mappingFileName.c_str(), "r")) {
-        char buffer[512];
+        char accession[2048];
+        char accession_version[2048];
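        // NCBI-style accession2taxid files are tab-separated with a header row:
        //   accession <tab> accession.version <tab> taxid <tab> gi
        // The header is skipped by the fscanf("%*s\t%*s\t%*s\t%*s") call below.
        // Note that fscanf() returns the number of items it assigns, so the
        // three-conversion format string used in these loops yields 3 on
        // success; the copy of this loop in IndexCreator.cpp is corrected from
        // "== 2" to "== 3" in a later commit of this series.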
int taxID; fscanf(mappingFile, "%*s\t%*s\t%*s\t%*s"); - while (fscanf(mappingFile, "%s\t%*s\t%d\t%*d", buffer, &taxID) == 2) { - acc2taxid[string(buffer)] = taxID; + while (fscanf(mappingFile, "%s\t%s\t%d\t%*d", accession, accession_version, &taxID) == 2) { + acc2taxid[string(accession_version)] = taxID; + acc2taxid[string(accession)] = taxID; } } else { cerr << "Cannot open file for mapping from accession to tax ID" << endl; diff --git a/util/prepare_gtdb_taxonomy.sh b/util/prepare_gtdb_taxonomy.sh index 855d8d74..4621fa49 100755 --- a/util/prepare_gtdb_taxonomy.sh +++ b/util/prepare_gtdb_taxonomy.sh @@ -2,10 +2,16 @@ # set output directory OUT=$1 +TAX_DIR="${OUT}/taxonomy" PWD=$(pwd) ar_gz="${PWD}/ar.tar.gz" bac_gz="${PWD}/bac.tar.gz" +# mkdir TAX_DIR if it doesn't exist +if [ ! -d "${TAX_DIR}" ]; then + mkdir -p "${TAX_DIR}" +fi + wget -O "${ar_gz}" https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/ar53_metadata.tar.gz wget -O "${bac_gz}" https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/bac120_metadata.tar.gz @@ -42,14 +48,14 @@ awk -F '\t' '$1 ~ /^(R|G)/{print $1,$55,$111}' "${ar_bac_meta_wTaxIDs}" | while fi done > "${map}" -mv "${ar_bac_meta_wTaxIDs}" "${OUT}" +mv "${ar_bac_meta_wTaxIDs}" "${TAX_DIR}" echo -e "\t|\t\t|" > "merged.dmp" -mv "merged.dmp" "${OUT}" -mv "names.dmp" "${OUT}" -mv "nodes.dmp" "${OUT}" -mv "delnodes.dmp" "${OUT}" -mv "${map}" "${OUT}" -mv "taxID_info.tsv" "${OUT}" +mv "merged.dmp" "${TAX_DIR}" +mv "names.dmp" "${TAX_DIR}" +mv "nodes.dmp" "${TAX_DIR}" +mv "delnodes.dmp" "${TAX_DIR}" +mv "${map}" "${TAX_DIR}" +mv "taxID_info.tsv" "${TAX_DIR}" #mv "${ar_bac_meta_wTaxIDs}" "../gtdb_taxdmp" From d3773453b642d99931432ee65743af3400ce4782 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Fri, 8 Sep 2023 20:10:18 +0900 Subject: [PATCH 26/65] fix minor error --- src/commons/IndexCreator.cpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp index aded0b45..e21e00a4 100644 --- a/src/commons/IndexCreator.cpp +++ b/src/commons/IndexCreator.cpp @@ -344,7 +344,7 @@ TaxID IndexCreator::load_accession2taxid(const string & mappingFileName, unorder char accession_version[2048]; int taxID; fscanf(mappingFile, "%*s\t%*s\t%*s\t%*s"); - while (fscanf(mappingFile, "%s\t%s\t%d\t%*d", accession, accession_version, &taxID) == 2 ){ + while (fscanf(mappingFile, "%s\t%s\t%d\t%*d", accession, accession_version, &taxID) == 3 ){ acc2taxid[string(accession_version)] = taxID; acc2taxid[string(accession)] = taxID; if (taxID > maxTaxID) { @@ -701,18 +701,13 @@ string IndexCreator::getSeqSegmentsWithHead(vector & seqSegments, accession_version = firstLine.substr(1, LocalUtil::getFirstWhiteSpacePos(firstLine) - 1); newAcc2taxid.emplace_back(accession_version, make_pair(acc2taxid.at(accession), newTaxID)); taxIdList.push_back(newTaxID++); -// cout << firstLine << endl; - // taxIdList.push_back(acc2taxid.at(firstLine.substr(1, firstLine.find('.') - 1))); - // foundAcc2taxid[firstLine.substr(1, firstLine.find(' ') - 1)] = taxIdList.back(); + while (getline(seqFile, eachLine, '\n')) { if (eachLine[0] == '>') { accession = eachLine.substr(1, eachLine.find('.') - 1); accession_version = eachLine.substr(1, LocalUtil::getFirstWhiteSpacePos(eachLine) - 1); newAcc2taxid.emplace_back(accession_version, make_pair(acc2taxid.at(accession), newTaxID)); taxIdList.push_back(newTaxID++); -// cout << eachLine << endl; - // taxIdList.push_back(acc2taxid.at(eachLine.substr(1, eachLine.find('.') - 1))); - // 
foundAcc2taxid[eachLine.substr(1, eachLine.find(' ') - 1)] = taxIdList.back(); pos = (size_t) seqFile.tellg(); seqSegmentsTmp.emplace_back(start, pos - eachLine.length() - 3,pos - eachLine.length() - start - 2); start = pos - eachLine.length() - 1; From 31e5a1b15c5b433fd1932699c241c2c23336c545 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Fri, 8 Sep 2023 20:57:36 +0900 Subject: [PATCH 27/65] properly use merged.dmp during building accession-level DB --- src/commons/IndexCreator.cpp | 39 ++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp index e21e00a4..ba967e6b 100644 --- a/src/commons/IndexCreator.cpp +++ b/src/commons/IndexCreator.cpp @@ -5,7 +5,9 @@ #include #include #include +#include #include +#include "NcbiTaxonomy.cpp" IndexCreator::IndexCreator(const LocalParameters & par) { // Parameters @@ -1021,6 +1023,26 @@ void IndexCreator::writeDbParameters() { } void IndexCreator::editTaxonomyDumpFiles(const vector>> & newAcc2taxid) { + // Load merged.dmp + string mergedFileName = taxonomyDir + "/merged.dmp"; + std::ifstream ss(mergedFileName); + if (ss.fail()) { + Debug(Debug::ERROR) << "File " << mergedFileName << " not found!\n"; + EXIT(EXIT_FAILURE); + } + + std::string line; + size_t count = 0; + unordered_map mergedMap; + while (std::getline(ss, line)) { + std::vector result = splitByDelimiter(line, "\t|\t", 2); + if (result.size() != 2) { + Debug(Debug::ERROR) << "Invalid name entry!\n"; + EXIT(EXIT_FAILURE); + } + mergedMap[atoi(result[0].c_str())] = atoi(result[1].c_str()); + } + // Edit names.dmp string nameFileName = taxonomyDir + "/names.dmp"; string newNameFileName = taxonomyDir + "/names.dmp.new"; @@ -1048,10 +1070,19 @@ void IndexCreator::editTaxonomyDumpFiles(const vectortaxonNode(newAcc2taxid[i].second.first)->taxId); + } + // Check if the parent taxon is merged + if (mergedMap.find(newAcc2taxid.back().second.first) != mergedMap.end()) { // merged + fprintf(nodeFile, "%d\t|\t%d\t|\t\t|\tscientific name\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|", newAcc2taxid.back().second.second, mergedMap[newAcc2taxid.back().second.first]); + } else { + fprintf(nodeFile, "%d\t|\t%d\t|\t\t|\tscientific name\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|", newAcc2taxid.back().second.second, newAcc2taxid.back().second.first); } - fprintf(nodeFile, "%d\t|\t%d\t|\t\t|\tscientific name\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|", newAcc2taxid.back().second.second, newAcc2taxid.back().second.first); fclose(nodeFile); - - // Edit node.dmp } \ No newline at end of file From e807ee34db5d4a9b51e17b20f3fb90c5f9fe15c5 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Sat, 9 Sep 2023 18:21:40 +0900 Subject: [PATCH 28/65] get maxTaxID properly --- src/commons/IndexCreator.cpp | 46 +++++++++++++++++++++++-- src/commons/IndexCreator.h | 2 ++ src/commons/KmerMatcher.cpp | 67 ++++++++++-------------------------- 3 files changed, 65 insertions(+), 50 deletions(-) diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp index ba967e6b..aaf50291 100644 --- a/src/commons/IndexCreator.cpp +++ b/src/commons/IndexCreator.cpp @@ -217,8 +217,8 @@ void IndexCreator::makeBlocksForParallelProcessing_accession_level() { unordered_map acc2taxid; TaxID maxTaxID = load_accession2taxid(acc2taxidFileName, acc2taxid); - newTaxID = maxTaxID + 1; - + newTaxID = std::max(getMaxTaxID() + 1, maxTaxID + 1); + vector>> newAcc2taxid; // accession.version -> (parent, newTaxID) // Make blocks of sequences that can be processed in 
parallel @@ -1085,4 +1085,46 @@ void IndexCreator::editTaxonomyDumpFiles(const vector result = splitByDelimiter(line, "\t|\t", 2); + if (result.size() != 2) { + Debug(Debug::ERROR) << "Invalid name entry!\n"; + EXIT(EXIT_FAILURE); + } + maxTaxID = std::max(maxTaxID, (TaxID) atoi(result[0].c_str())); + } + ss.close(); + + // Check names.dmp + string nameFileName = taxonomyDir + "/names.dmp"; + ss = std::ifstream(nameFileName); + if (ss.fail()) { + Debug(Debug::ERROR) << "File " << nameFileName << " not found!\n"; + EXIT(EXIT_FAILURE); + } + + while (std::getline(ss, line)) { + std::vector result = splitByDelimiter(line, "\t|\t", 2); + if (result.size() != 2) { + Debug(Debug::ERROR) << "Invalid name entry!\n"; + EXIT(EXIT_FAILURE); + } + maxTaxID = std::max(maxTaxID, (TaxID) atoi(result[0].c_str())); + } + ss.close(); + + return maxTaxID; } \ No newline at end of file diff --git a/src/commons/IndexCreator.h b/src/commons/IndexCreator.h index bee95f8f..1f5b7eb3 100644 --- a/src/commons/IndexCreator.h +++ b/src/commons/IndexCreator.h @@ -138,6 +138,8 @@ class IndexCreator{ static TaxID load_accession2taxid(const string & mappingFile, unordered_map & assacc2taxid); + TaxID getMaxTaxID(); + void editTaxonomyDumpFiles(const vector>> & newAcc2taxid); void reduceRedundancy(TargetKmerBuffer & kmerBuffer, size_t * uniqeKmerIdx, size_t & uniqKmerCnt, diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp index 39b9a13d..3c0f08f8 100644 --- a/src/commons/KmerMatcher.cpp +++ b/src/commons/KmerMatcher.cpp @@ -17,42 +17,6 @@ KmerMatcher::KmerMatcher(const LocalParameters & par, this->taxonomy = taxonomy; loadTaxIdList(par); - // // Load the taxonomy ID list - // FILE * taxIdFile; - // if((taxIdFile = fopen((dbDir + "/taxID_list").c_str(),"r")) == NULL){ - // std::cout<<"Cannot open the taxID list file."<taxonNode(taxId); - // if (taxId == taxon->taxId) { - // TaxID speciesTaxID = taxonomy->getTaxIdAtRank(taxId, "species"); - // TaxID genusTaxID = taxonomy->getTaxIdAtRank(taxId, "genus"); - // while (taxon->taxId != speciesTaxID) { - // taxId2speciesId[taxon->taxId] = speciesTaxID; - // taxId2genusId[taxon->taxId] = genusTaxID; - // taxon = taxonomy->taxonNode(taxon->parentTaxId); - // } - // taxId2speciesId[speciesTaxID] = speciesTaxID; - // taxId2genusId[speciesTaxID] = genusTaxID; - // } else { - // TaxID speciesTaxID = taxonomy->getTaxIdAtRank(taxId, "species"); - // TaxID genusTaxID = taxonomy->getTaxIdAtRank(taxId, "genus"); - // while (taxon->taxId != speciesTaxID) { - // taxId2speciesId[taxon->taxId] = speciesTaxID; - // taxId2genusId[taxon->taxId] = genusTaxID; - // taxon = taxonomy->taxonNode(taxon->parentTaxId); - // } - // taxId2speciesId[speciesTaxID] = speciesTaxID; - // taxId2genusId[speciesTaxID] = genusTaxID; - // taxId2speciesId[taxId] = speciesTaxID; - // taxId2genusId[taxId] = genusTaxID; - // } - // } - // fclose(taxIdFile); } @@ -160,6 +124,13 @@ int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer, MmapedData diffIdxSplits = mmapData(diffIdxSplitFileName.c_str(), 3); size_t numOfDiffIdx = FileUtil::getFileSize(targetDiffIdxFileName) / sizeof(uint16_t); + // // Print target k-mer information + // MmapedData targetKmerInfo2 = mmapData(targetInfoFileName.c_str(), 3); + // size_t numOfTargetKmer = targetKmerInfo2.fileSize / sizeof(TargetKmerInfo); + // for (size_t i = 0; i < numOfTargetKmer; i++) { + // cout << targetKmerInfo2.data[i].sequenceID << "\t" << (int) targetKmerInfo2.data[i].redundancy << endl; + // } + size_t queryKmerNum = 
queryKmerBuffer->startIndexOfReserve; QueryKmer *queryKmerList = queryKmerBuffer->buffer; @@ -329,10 +300,10 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI for (int k = 0; k < currMatchNum; k++) { idx = selectedMatches[k]; // Check if candidateKmerInfos[idx].sequenceID is valid - if (taxId2genusId.find(candidateKmerInfos[idx].sequenceID) == taxId2genusId.end() || - taxId2speciesId.find(candidateKmerInfos[idx].sequenceID) == taxId2speciesId.end()) { - cout << "Error: " << candidateKmerInfos[idx].sequenceID << " is not found in the taxonomy database." << endl; - } + // if (taxId2genusId.find(candidateKmerInfos[idx].sequenceID) == taxId2genusId.end() || + // taxId2speciesId.find(candidateKmerInfos[idx].sequenceID) == taxId2speciesId.end()) { + // cout << "Error: " << candidateKmerInfos[idx].sequenceID << " is not found in the taxonomy database." << endl; + // } matches[matchCnt] = {queryKmerList[j].info, candidateKmerInfos[idx].sequenceID, taxId2genusId[candidateKmerInfos[idx].sequenceID], @@ -371,10 +342,10 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI for (int k = 0; k < currMatchNum; k++) { idx = selectedMatches[k]; // Check if candidateKmerInfos[idx].sequenceID is valid - if (taxId2genusId.find(candidateKmerInfos[idx].sequenceID) == taxId2genusId.end() || - taxId2speciesId.find(candidateKmerInfos[idx].sequenceID) == taxId2speciesId.end()) { - cout << "Error: " << candidateKmerInfos[idx].sequenceID << " is not found in the taxonomy database." << endl; - } + // if (taxId2genusId.find(candidateKmerInfos[idx].sequenceID) == taxId2genusId.end() || + // taxId2speciesId.find(candidateKmerInfos[idx].sequenceID) == taxId2speciesId.end()) { + // cout << "Error: " << candidateKmerInfos[idx].sequenceID << " is not found in the taxonomy database." << endl; + // } matches[matchCnt] = {queryKmerList[j].info, candidateKmerInfos[idx].sequenceID, taxId2genusId[candidateKmerInfos[idx].sequenceID], @@ -468,10 +439,10 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI for (int k = 0; k < currMatchNum; k++) { idx = selectedMatches[k]; // Check if candidateKmerInfos[idx].sequenceID is valid - if (taxId2genusId.find(candidateKmerInfos[idx].sequenceID) == taxId2genusId.end() || - taxId2speciesId.find(candidateKmerInfos[idx].sequenceID) == taxId2speciesId.end()) { - cout << "Error: " << candidateKmerInfos[idx].sequenceID << " is not found in the taxonomy database." << endl; - } + // if (taxId2genusId.find(candidateKmerInfos[idx].sequenceID) == taxId2genusId.end() || + // taxId2speciesId.find(candidateKmerInfos[idx].sequenceID) == taxId2speciesId.end()) { + // cout << "Error: " << candidateKmerInfos[idx].sequenceID << " is not found in the taxonomy database." << endl; + // } matches[matchCnt] = {queryKmerList[j].info, candidateKmerInfos[idx].sequenceID, taxId2genusId[candidateKmerInfos[idx].sequenceID], From eb970ca0633c72fba498bb4820b9ab0fa8aa2cdb Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Mon, 11 Sep 2023 16:27:42 +0900 Subject: [PATCH 29/65] database-report for accession-level DB --- README.md | 33 ++++++++++++++++++++++------ src/util/database-report.cpp | 42 +++++++++++++++++++++++++----------- 2 files changed, 56 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 6765aa17..3e6e9170 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,14 @@ In addition, it can classify reads against a database of any size as long as it

+## Update notes
+### v1.0.2
+- `--accession-level` option for `build` and `classify` workflows: it reports not only the taxon but also the accession of the best match.
+- Fix minor bugs in `build` workflow.
+- Generate `taxonomyDB` during `build` and load it during `classify` workflow for faster loading of taxonomy information.
+- Support gzipped FASTA/FASTQ files in `add-to-library` and `classify` workflows.
+- Low-complexity regions are now masked by default in the `build` workflow (`--mask-prob 0.9`).
+
 ## Installation
 ### Precompiled binaries
 ```
@@ -79,8 +87,10 @@ metabuli classify --seq-mode 1 read.fna dbdir outdir jobid
   --min-score : The minimum score to be classified (0.15 for precision mode)
   --min-sp-score : The minimum score to be classified at or below species rank. (0.5 for precision mode)
   --taxonomy-path: Directory where the taxonomy dump files are stored. (DBDIR/taxonomy by default)
-  --reduced-aa : 0. Use 20 alphabets or 1. Use 15 alphabets to encode amino acids. Give the same value used for DB creation.
-  --spacing-mask : Binary patterend mask for spaced k-mer. The same mask must be used for DB creation and classification. A mask should contain at least eight '1's, and '0' means skip.
+  --reduced-aa : 0. Use 20 alphabets or 1. Use 15 alphabets to encode amino acids.
+                 Give the same value used for DB creation.
+  --accession-level : Set 1 to use accession level classification (0 by default).
+                      It is available when the DB is also built with accession level taxonomy.

   * Values of --min-score and --min-sp-score for precision mode are optimized only for short reads.
   * We don't recommend using them for long reads.
@@ -162,9 +172,12 @@ Accessions that are not included in the `` will be skipped and
 #### 3. Build
 ```
+# Get the list of absolute paths of files in your library
+find /library -name '*.fna' > library-files.txt
+
 metabuli build [options]
- DBDIR: The same DBDIR from the previous step.
-- FASTA list: A file containing absolute paths of the FASTA files in DBDIR/library
+- LIB_FILES: A file containing absolute paths of the FASTA files in DBDIR/library (library-files.txt)
- accession2taxid : A path to NCBI-style accession2taxid.

 * Options
   --threads : The number of CPU-cores used (all by default)
   --taxonomy-path: Directory where the taxonomy dump files are stored. (DBDIR/taxonomy by default)
   --reduced-aa : 0. Use 20 alphabets or 1. Use 15 alphabets to encode amino acids.
   --spacing-mask : Binary mask for spaced metamer. The same mask must be used for DB creation and classification. A mask should contain at least eight '1's, and '0' means skip.
+  --accession-level : Set 1 to use accession level taxonomy (0 by default).
 ```
 This will generate **diffIdx**, **info**, **split**, and **taxID_list** and some other files.
 You can delete '\*\_diffIdx' and '\*\_info' if generated.
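For example, an accession-level database could be built from this library as follows (directory names are illustrative, and `nucl_gb.accession2taxid` stands for whichever NCBI-style mapping file you use):

```
find /library -name '*.fna' > library-files.txt
metabuli build dbdir library-files.txt nucl_gb.accession2taxid --threads 32 --accession-level 1
```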
@@ -201,16 +215,21 @@ This will add your FASTA files to DBDIR/library according to their species taxon
 #### 2. Build
 ```
-metabuli build [options]
+# Get the list of absolute paths of files in your library
+find /library -name '*.fna' > library-files.txt
+
+metabuli build [options]
- DBDIR: The same DBDIR from the previous step.
-- FASTA list: A file containing absolute paths of the FASTA files in DBDIR/library
+- : A file containing absolute paths of the FASTA files in DBDIR/library (library-files.txt)
- accession2taxid : A path to NCBI-style accession2taxid.

 * Options
   --threads : The number of CPU-cores used (all by default)
   --taxonomy-path: Directory where the taxonomy dump files are stored. (DBDIR/taxonomy by default)
   --reduced-aa : 0. Use 20 alphabets or 1. Use 15 alphabets to encode amino acids.
-  --spacing-mask : Binary mask for spaced metamer. The same mask must be used for DB creation and classification. A mask should contain at least eight '1's, and '0' means skip.
+  --spacing-mask : Binary mask for spaced metamer. The same mask must be used for DB creation and classification.
+                   A mask should contain at least eight '1's, and '0' means skip.
+  --accession-level : Set 1 to use accession level taxonomy (0 by default).
 ```
 This will generate **diffIdx**, **info**, **split**, and **taxID_list** and some other files.
 You can delete '\*\_diffIdx' and '\*\_info' if generated.
diff --git a/src/util/database-report.cpp b/src/util/database-report.cpp
index 40ea3afb..4aee73e4 100644
--- a/src/util/database-report.cpp
+++ b/src/util/database-report.cpp
@@ -4,6 +4,7 @@
 #include 
 #include 
 #include "IndexCreator.h"
+#include "common.h"
 #include "report.h"
 #include "FileUtil.h"

@@ -20,13 +21,15 @@ int databaseReport(int argc, const char **argv, const Command &command) {
     par.parseParameters(argc, argv, command, true, Parameters::PARSE_ALLOW_EMPTY, 0);
     string dbDir = par.filenames[0];

-    // Check if taxonomy path exists
+    // Load taxonomy
     if (par.taxonomyPath == "DBDIR/taxonomy/") par.taxonomyPath = dbDir + "/taxonomy/";
-    if (!FileUtil::directoryExists(par.taxonomyPath.c_str())) {
-        cerr << "Error: taxonomy path " << par.taxonomyPath << " does not exist." << endl;
-        cerr << "Please specify the path to the taxonomy directory using the --taxonomy-path option." << endl;
-        return 1;
-    }
+    NcbiTaxonomy * taxonomy = loadTaxonomy(dbDir, par.taxonomyPath);
+
+    // if (!FileUtil::directoryExists(par.taxonomyPath.c_str())) {
+    //     cerr << "Error: taxonomy path " << par.taxonomyPath << " does not exist." << endl;
+    //     cerr << "Please specify the path to the taxonomy directory using the --taxonomy-path option." << endl;
+    //     return 1;
+    // }

     // Check if acc2taxid.map exists
     string acc2taxid = dbDir + "/acc2taxid.map";
@@ -35,11 +38,11 @@ int databaseReport(int argc, const char **argv, const Command &command) {
         return 1;
     }

-    // Load taxonomy
-    const string names = par.taxonomyPath + "/names.dmp";
-    const string nodes = par.taxonomyPath + "/nodes.dmp";
-    const string merged = par.taxonomyPath + "/merged.dmp";
-    auto * taxonomy = new NcbiTaxonomy(names, nodes, merged);
+    // // Load taxonomy
+    // const string names = par.taxonomyPath + "/names.dmp";
+    // const string nodes = par.taxonomyPath + "/nodes.dmp";
+    // const string merged = par.taxonomyPath + "/merged.dmp";
+    // auto * taxonomy = new NcbiTaxonomy(names, nodes, merged);

     // Load only the second column of acc2taxid.map as integers
     vector taxids;
@@ -49,8 +52,23 @@ int databaseReport(int argc, const char **argv, const Command &command) {
         return 1;
     }
     string line;
+    // Check if there is a third column
+    getline(acc2taxidFile, line);
+    vector tokens = Util::split(line, "\t");
+    int using_token = 0;
+    if (tokens.size() == 2) { // accession and taxid
+        using_token = 1;
+        taxids.push_back(stoi(tokens[using_token]));
+    } else if (tokens.size() == 3) { // accession and taxid and accession_id
+        using_token = 2;
+        taxids.push_back(stoi(tokens[using_token]));
+    } else {
+        cerr << "Error: acc2taxid.map file " << acc2taxid << " has wrong format." 
<< endl; + return 1; + } while (getline(acc2taxidFile, line)) { - int taxid = stoi(line.substr(line.find('\t') + 1)); + tokens = Util::split(line, "\t"); + int taxid = stoi(tokens[using_token]); if (find(taxids.begin(), taxids.end(), taxid) == taxids.end()) { taxids.push_back(taxid); } From b63a0bab5e740ca545fd8083a193d70633217609 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Mon, 11 Sep 2023 20:55:51 +0900 Subject: [PATCH 30/65] Improve descriptions for options. Remove unused options. Improve log prints --- src/commons/Classifier.cpp | 3 + src/commons/IndexCreator.cpp | 10 +- src/commons/IndexCreator.h | 4 + src/commons/KmerMatcher.cpp | 3 +- src/commons/LocalParameters.cpp | 572 ++++++++++++++++++++++++++++++-- src/commons/LocalParameters.h | 11 + src/commons/common.cpp | 13 + src/metabuli.cpp | 2 +- src/workflow/build.cpp | 9 + src/workflow/classify.cpp | 1 - 10 files changed, 591 insertions(+), 37 deletions(-) diff --git a/src/commons/Classifier.cpp b/src/commons/Classifier.cpp index 9258a194..3384ed88 100644 --- a/src/commons/Classifier.cpp +++ b/src/commons/Classifier.cpp @@ -7,6 +7,9 @@ Classifier::Classifier(LocalParameters & par) { dbDir = par.filenames[1 + (par.seqMode == 2)]; matchPerKmer = par.matchPerKmer; loadDbParameters(par); + + cout << "DB name: " << par.dbName << endl; + cout << "DB creation date: " << par.dbDate << endl; // Taxonomy taxonomy = loadTaxonomy(dbDir, par.taxonomyPath); diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp index aaf50291..b7945148 100644 --- a/src/commons/IndexCreator.cpp +++ b/src/commons/IndexCreator.cpp @@ -16,6 +16,11 @@ IndexCreator::IndexCreator(const LocalParameters & par) { reducedAA = par.reducedAA; spaceMask = par.spaceMask; accessionLevel = par.accessionLevel; + lowComplexityMasking = par.maskMode; + lowComplexityMaskingThreshold = par.maskProb; + dbName = par.dbName; + dbDate = par.dbDate; + // Input files dbDir = par.filenames[0]; @@ -167,7 +172,6 @@ void IndexCreator::updateIndex(const LocalParameters &par) { delete[] uniqKmerIdx; } delete[] splitChecker; - } void IndexCreator::makeBlocksForParallelProcessing() { @@ -1016,9 +1020,13 @@ void IndexCreator::writeDbParameters() { Debug(Debug::ERROR) << "Could not open " << paramterFileName << " for writing\n"; EXIT(EXIT_FAILURE); } + fprintf(handle, "DB_name\t%s\n", dbName.c_str()); + fprintf(handle, "Creation_date\t%s\n", dbDate.c_str()); fprintf(handle, "Reduced_alphabet\t%d\n", reducedAA); fprintf(handle, "Spaced_kmer_mask\t%s\n", spaceMask.c_str()); fprintf(handle, "Accession_level\t%d\n", accessionLevel); + fprintf(handle, "Mask_mode\t%d\n", lowComplexityMasking); + fprintf(handle, "Mask_prob\t%f\n", lowComplexityMaskingThreshold); fclose(handle); } diff --git a/src/commons/IndexCreator.h b/src/commons/IndexCreator.h index 1f5b7eb3..33bc1c66 100644 --- a/src/commons/IndexCreator.h +++ b/src/commons/IndexCreator.h @@ -46,6 +46,10 @@ class IndexCreator{ int reducedAA; string spaceMask; int accessionLevel; + int lowComplexityMasking; + float lowComplexityMaskingThreshold; + string dbName; + string dbDate; // Inputs NcbiTaxonomy * taxonomy; diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp index 3c0f08f8..9455070e 100644 --- a/src/commons/KmerMatcher.cpp +++ b/src/commons/KmerMatcher.cpp @@ -24,6 +24,7 @@ KmerMatcher::~KmerMatcher() { } void KmerMatcher::loadTaxIdList(const LocalParameters & par) { + cout << "Loading the list for taxonomy IDs ... 
"; if (par.contamList != "") { vector contams = Util::split(par.contamList, ","); for (auto &contam : contams) { @@ -101,7 +102,7 @@ void KmerMatcher::loadTaxIdList(const LocalParameters & par) { } fclose(taxIdFile); } - cout << "Taxonomy ID list is loaded." << endl; + cout << "Done" << endl; } diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index f38cd6a1..986ab86d 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -1,12 +1,18 @@ #include "LocalParameters.h" #include "Parameters.h" +#include "Debug.h" +#include "CommandCaller.h" +#include "Parameters.cpp" +#include "ByteParser.h" +#include +#include "DistanceCalculator.h" LocalParameters::LocalParameters() : Parameters(), VIRUS_TAX_ID(VIRUS_TAX_ID_ID, "--virus-taxid", "Taxonomy ID of virus taxon", - "NCBI: 10239 [Default]\nCUSTOM: Check names.dmp file ", + "NCBI: 10239 \nCUSTOM: Check names.dmp file ", typeid(int), (void *) &virusTaxId, "^[0-9]+$"), @@ -34,28 +40,28 @@ LocalParameters::LocalParameters() : SEQ_MODE(SEQ_MODE_ID, "--seq-mode", "Sequencing type", - "Single-end: 1 \nPaired-end: 2\nLong read: 3", + "Single-end: 1, Paired-end: 2, Long read: 3", typeid(int), (void *) &seqMode, "[1-3]"), REDUCED_AA(REDUCED_AA_ID, "--reduced-aa", - "Using reduced 15 alphabets to encode amino acids. It increases sensitivity", - "Using 20 alphabets: 0 \nUsing 15 alphabets: 1", + "Using 15 alphabets to encode AAs for sensitivity", + "Set as 0 to use 15 alphabets to encode AAs for sensitivity", typeid(int), (void *) &reducedAA, "[0-1]"), MIN_SCORE(MIN_SCORE_ID, "--min-score", - "The minimum score for classification", - "You can set a value from 0.0 to 1.0", + "Min. sequence similarity score", + "Min. sequence similarity score (0.0-1.0)", typeid(float), (void *) &minScore, "^0(\\.[0-9]+)?|1(\\.0+)?$"), MIN_COVERAGE(MIN_COVERAGE_ID, "--min-cov", - "The minimum coverage for classification", - "You can set a value from 0.0 to 1.0", + "Min. query coverage", + "Min. query coverage (0.0-1.0)", typeid(float), (void *) &minCoverage, "^0(\\.[0-9]+)?|1(\\.0+)?$"), @@ -76,17 +82,15 @@ LocalParameters::LocalParameters() : "^[0-9]+$"), HAMMING_MARGIN(HAMMING_MARGIN_ID, "--hamming-margin", - "If a query k-mer has multiple matches, the matches with hamming distance lower than sum of \n" - "the minimum distance and this margin are selected for later steps.", - "If a query k-mer has multiple matches, the matches with hamming distance lower than sum of \n" - "the minimum distance and this margin are selected for later steps.", + "Allowed extra Hamming distance", + "It allows extra Hamming distance than the minimum distance.", typeid(int), (void *) &hammingMargin, ""), MIN_SP_SCORE(MIN_SP_SCORE_ID, "--min-sp-score", - "Minimum score to be classified at species or lower rank.", - "Minimum score to be classified at the species level.", + "Min. score for species- or lower-level classification.", + "Min. score for species- or lower-level classification.", typeid(float), (void *) &minSpScore, "^0(\\.[0-9]+)?|1(\\.0+)?$"), @@ -120,22 +124,22 @@ LocalParameters::LocalParameters() : "^[0-9]+$"), MIN_CONS_CNT(MIN_CONS_CNT_ID, "--min-cons-cnt", - "Minimum number of consecutive metamer matches to be used for prokaryote/virus classification", - "Minimum number of consecutive metamer matches to be used for prokaryote/virus classification", + "Min. num. of cons. matches for non-euk. classification", + "Min. 
number of consecutive matches for prokaryote/virus classification", typeid(int), (void *) &minConsCnt, "^[0-9]+$"), MIN_CONS_CNT_EUK(MIN_CONS_CNT_EUK_ID, "--min-cons-cnt-euk", - "Minimum number of consecutive metamer matches to be used for eukaryote classification", - "Minimum number of consecutive metamer matches to be used for eukaryote classification", + "Min. num. of cons. matches for euk. classification", + "Min. number of consecutive matches for eukaryote classification", typeid(int), (void *) &minConsCntEuk, "^[0-9]+$"), MATCH_PER_KMER(MATCH_PER_KMER_ID, "--match-per-kmer", - "Number of matches per query k-mer", - "Number of matches per query k-mer. Larger values assign more memory for storing k-mer matches.", + "Number of matches per query k-mer. ", + "Num. of matches per query k-mer. Larger values assign more memory for storing k-mer matches. ", typeid(int), (void *) &matchPerKmer, "^[0-9]+$"), @@ -176,11 +180,25 @@ LocalParameters::LocalParameters() : "^[0-9]+$"), ACCESSION_LEVEL(ACCESSION_LEVEL_ID, "--accession-level", - "Build a database for accession level classification", - "Build a database for accession level classification", + "Accession-level DB build/search", + "Build or search a database for accession-level classification", typeid(int), (void *) &accessionLevel, "[0-1]"), + DB_NAME(DB_NAME_ID, + "--db-name", + "Name of the database (a random number as default)", + "Name of the database", + typeid(std::string), + (void *) &dbName, + "^.*$"), + DB_DATE(DB_DATE_ID, + "--db-date", + "Date of the database creation (current date as default)", + "Date of the database creation", + typeid(std::string), + (void *) &dbDate, + "^.*$"), TEST_RANK(TEST_RANK_ID, "--test-rank", ".", @@ -257,27 +275,29 @@ LocalParameters::LocalParameters() : build.push_back(&PARAM_MASK_RESIDUES); build.push_back(&BUFFER_SIZE); build.push_back(&ACCESSION_LEVEL); + build.push_back(&DB_NAME); + build.push_back(&DB_DATE); //classify classify.push_back(&PARAM_THREADS); classify.push_back(&SEQ_MODE); - classify.push_back(&VIRUS_TAX_ID); - classify.push_back(&REDUCED_AA); +// classify.push_back(&VIRUS_TAX_ID); +// classify.push_back(&REDUCED_AA); classify.push_back(&MIN_SCORE); classify.push_back(&MIN_COVERAGE); - classify.push_back(&SPACED); - classify.push_back(&HAMMING_MARGIN); - classify.push_back(&MIN_SP_SCORE); - classify.push_back(&PARAM_V); - classify.push_back(&RAM_USAGE); - classify.push_back(&MIN_COVERED_POS); - classify.push_back(&PRINT_LOG); - classify.push_back(&MAX_GAP); - classify.push_back(&TAXONOMY_PATH); classify.push_back(&MIN_CONS_CNT); classify.push_back(&MIN_CONS_CNT_EUK); + classify.push_back(&MIN_SP_SCORE); +// classify.push_back(&SPACED); + classify.push_back(&HAMMING_MARGIN); +// classify.push_back(&PARAM_V); +// classify.push_back(&MIN_COVERED_POS); +// classify.push_back(&PRINT_LOG); +// classify.push_back(&MAX_GAP); +// classify.push_back(&TAXONOMY_PATH); classify.push_back(&PARAM_MASK_RESIDUES); classify.push_back(&PARAM_MASK_PROBABILTY); + classify.push_back(&RAM_USAGE); classify.push_back(&MATCH_PER_KMER); classify.push_back(&ACCESSION_LEVEL); @@ -338,3 +358,489 @@ LocalParameters::LocalParameters() : databaseReport.push_back(&TAXONOMY_PATH); } +void LocalParameters::printParameters(const std::string &module, int argc, const char* pargv[], + const std::vector &par){ + if (Debug::debugLevel < Debug::INFO) { + return; + } + + Debug(Debug::INFO) << module << " "; + for (int i = 0; i < argc; i++) { + // don't expose users to the interal b64 masking of whitespace characters + if 
(strncmp("b64:", pargv[i], 4) == 0) { + Debug(Debug::INFO) << "'" << base64_decode(pargv[i] + 4, strlen(pargv[i]) - 4) << "' "; + } else { + Debug(Debug::INFO) << pargv[i] << " "; + } + } + Debug(Debug::INFO) << "\n\n"; + + if (CommandCaller::getCallDepth() > 0) { + return; + } + + size_t maxWidth = 0; + for(size_t i = 0; i < par.size(); i++) { + maxWidth = std::max(strlen(par[i]->display), maxWidth); + } + + std::stringstream ss; + ss << std::boolalpha; + + ss << std::setw(maxWidth) << std::left << "Metabuli Version:" << "\t" << "1.0.2" << "\n"; + + for (size_t i = 0; i < par.size(); i++) { + if (par[i]->category & MMseqsParameter::COMMAND_HIDDEN) { + continue; + } + ss << std::setw(maxWidth) << std::left << par[i]->display << "\t"; + if (typeid(int) == par[i]->type ) { + ss << *((int *)par[i]->value); + } else if(typeid(size_t) == par[i]->type ){ + ss << *((size_t *)par[i]->value); + } else if(typeid(ByteParser) == par[i]->type) { + ss << ByteParser::format(*((size_t *)par[i]->value), 'a', 'h'); + } else if(PARAM_SUB_MAT.uniqid == par[i]->uniqid || PARAM_SEED_SUB_MAT.uniqid == par[i]->uniqid) { + MultiParam>* param = ((MultiParam>*) par[i]->value); + MultiParam> tmpPar(NuclAA( + BaseMatrix::unserializeName(param->values.aminoacid().c_str()), + BaseMatrix::unserializeName(param->values.nucleotide().c_str()) + )); + ss << MultiParam>::format(tmpPar); + } else if(typeid(MultiParam>) == par[i]->type) { + ss << MultiParam>::format(*((MultiParam> *)par[i]->value)); + } else if(typeid(MultiParam>) == par[i]->type) { + ss << MultiParam>::format(*((MultiParam> *)par[i]->value)); + } else if(typeid(MultiParam>) == par[i]->type) { + ss << MultiParam>::format(*((MultiParam> *)par[i]->value)); + } else if(typeid(MultiParam>) == par[i]->type) { + ss << MultiParam>::format(*((MultiParam> *)par[i]->value)); + } else if(typeid(MultiParam) == par[i]->type) { + ss << MultiParam::format(*((MultiParam *)par[i]->value)); + } else if(typeid(float) == par[i]->type) { + ss << *((float *)par[i]->value); + } else if(typeid(double) == par[i]->type) { + ss << *((double *)par[i]->value); + } else if(typeid(std::string) == par[i]->type) { + ss << *((std::string *) par[i]->value); + } else if (typeid(bool) == par[i]->type) { + ss << *((bool *)par[i]->value); + } + ss << "\n"; + } + + Debug(Debug::INFO) << ss.str() << "\n"; +} + +void LocalParameters::parseParameters(int argc, const char *pargv[], const Command &command, bool printPar, int parseFlags, + int outputFlags) { + filenames.clear(); + std::vector & par = *command.params; + + bool canHandleHelp = false; + for (size_t parIdx = 0; parIdx < par.size(); parIdx++) { + if (par[parIdx]->uniqid == PARAM_HELP_ID || par[parIdx]->uniqid == PARAM_HELP_LONG_ID) { + canHandleHelp = true; + } + } + + size_t parametersFound = 0; + for (int argIdx = 0; argIdx < argc; argIdx++) { + // it is a parameter if it starts with - or -- + const bool longParameter = (pargv[argIdx][0] == '-' && pargv[argIdx][1] == '-'); + if (longParameter || (pargv[argIdx][0] == '-')) { + if ((parseFlags & PARSE_REST) && longParameter && pargv[argIdx][2] == '\0') { + restArgv = pargv + argIdx + 1; + restArgc = argc - (argIdx + 1); + break; + } + std::string parameter(pargv[argIdx]); + if (canHandleHelp == false && (parameter.compare("-h") == 0 || parameter.compare("--help") == 0)) { + printUsageMessage(command, 0xFFFFFFFF); + EXIT(EXIT_SUCCESS); + } + + bool hasUnrecognizedParameter = true; + for (size_t parIdx = 0; parIdx < par.size(); parIdx++) { + if(parameter.compare(par[parIdx]->name) == 0) { + if 
(typeid(bool) != par[parIdx]->type && argIdx + 1 == argc) { + printUsageMessage(command, outputFlags); + Debug(Debug::ERROR) << "Missing argument " << par[parIdx]->name << "\n"; + EXIT(EXIT_FAILURE); + } + + if (par[parIdx]->wasSet) { + printUsageMessage(command, outputFlags); + Debug(Debug::ERROR) << "Duplicate parameter " << par[parIdx]->name << "\n"; + EXIT(EXIT_FAILURE); + } + + if (typeid(int) == par[parIdx]->type) { + regex_t regex; + compileRegex(®ex, par[parIdx]->regex); + int nomatch = regexec(®ex, pargv[argIdx+1], 0, NULL, 0); + regfree(®ex); + // if no match found or two matches found (we want exactly one match) + if (nomatch){ + printUsageMessage(command, 0xFFFFFFFF); + Debug(Debug::ERROR) << "Error in argument " << par[parIdx]->name << "\n"; + EXIT(EXIT_FAILURE); + }else{ + *((int *) par[parIdx]->value) = atoi(pargv[argIdx+1]); + par[parIdx]->wasSet = true; + } + argIdx++; + } else if (typeid(size_t) == par[parIdx]->type) { + regex_t regex; + compileRegex(®ex, par[parIdx]->regex); + int nomatch = regexec(®ex, pargv[argIdx+1], 0, NULL, 0); + regfree(®ex); + // if no match found or two matches found (we want exactly one match) + if (nomatch){ + printUsageMessage(command, 0xFFFFFFFF); + Debug(Debug::ERROR) << "Error in argument " << par[parIdx]->name << "\n"; + EXIT(EXIT_FAILURE); + }else{ + *((size_t *) par[parIdx]->value) = atoi(pargv[argIdx+1]); + par[parIdx]->wasSet = true; + } + argIdx++; + } else if (typeid(ByteParser) == par[parIdx]->type) { + regex_t regex; + compileRegex(®ex, par[parIdx]->regex); + int nomatch = regexec(®ex, pargv[argIdx+1], 0, NULL, 0); + regfree(®ex); + + // if no match found or two matches found (we want exactly one match) + if (nomatch){ + printUsageMessage(command, 0xFFFFFFFF); + Debug(Debug::ERROR) << "Error in argument regex " << par[parIdx]->name << "\n"; + EXIT(EXIT_FAILURE); + } else { + size_t value = ByteParser::parse(pargv[argIdx+1]); + if (value == ByteParser::INVALID_SIZE) { + printUsageMessage(command, 0xFFFFFFFF); + Debug(Debug::ERROR) << "Error in value parsing " << par[parIdx]->name << "\n"; + EXIT(EXIT_FAILURE); + } else { + *((size_t *) par[parIdx]->value) = value; + par[parIdx]->wasSet = true; + } + } + argIdx++; + } else if (typeid(MultiParam>) == par[parIdx]->type) { + std::string val(pargv[argIdx+1]); + if (Util::startWith("b64:", val)) { + val = base64_decode(val.c_str() + 4, val.size() - 4); + } + NuclAA value = MultiParam>(val.c_str()).values; + if (value.first == "INVALID" || value.second == "INVALID") { + printUsageMessage(command, 0xFFFFFFFF); + Debug(Debug::ERROR) << "Error in value parsing " << par[parIdx]->name << "\n"; + EXIT(EXIT_FAILURE); + } else { + *((MultiParam> *) par[parIdx]->value) = value; + par[parIdx]->wasSet = true; + } + argIdx++; + }else if (typeid(MultiParam>) == par[parIdx]->type) { + NuclAA value = MultiParam>(pargv[argIdx+1]).values; + if (value.first == INT_MAX || value.second == INT_MAX) { + printUsageMessage(command, 0xFFFFFFFF); + Debug(Debug::ERROR) << "Error in value parsing " << par[parIdx]->name << "\n"; + EXIT(EXIT_FAILURE); + } else { + *((MultiParam> *) par[parIdx]->value) = value; + par[parIdx]->wasSet = true; + } + argIdx++; + }else if (typeid(MultiParam>) == par[parIdx]->type) { + NuclAA value = MultiParam>(pargv[argIdx + 1]).values; + if (value.first == FLT_MAX || value.second == FLT_MAX) { + printUsageMessage(command, 0xFFFFFFFF); + Debug(Debug::ERROR) << "Error in value parsing " << par[parIdx]->name << "\n"; + EXIT(EXIT_FAILURE); + } else { + *((MultiParam> *) par[parIdx]->value) = 
value; + par[parIdx]->wasSet = true; + } + argIdx++; + }else if (typeid(MultiParam>) == par[parIdx]->type) { + SeqProf value = MultiParam>(pargv[argIdx+1]).values; + *((MultiParam> *) par[parIdx]->value) = value; + par[parIdx]->wasSet = true; + argIdx++; + }else if (typeid(MultiParam) == par[parIdx]->type) { + PseudoCounts value = MultiParam(pargv[argIdx + 1]).values; + if (value.first == FLT_MAX || value.second == FLT_MAX) { + printUsageMessage(command, 0xFFFFFFFF); + Debug(Debug::ERROR) << "Error in value parsing " << par[parIdx]->name << "\n"; + EXIT(EXIT_FAILURE); + } else { + *((MultiParam *) par[parIdx]->value) = value; + par[parIdx]->wasSet = true; + } + argIdx++; + }else if (typeid(float) == par[parIdx]->type) { + regex_t regex; + compileRegex(®ex, par[parIdx]->regex); + int nomatch = regexec(®ex, pargv[argIdx+1], 0, NULL, 0); + regfree(®ex); + if (nomatch){ + printUsageMessage(command, 0xFFFFFFFF); + Debug(Debug::ERROR) << "Error in argument " << par[parIdx]->name << "\n"; + EXIT(EXIT_FAILURE); + }else{ + double input = strtod(pargv[argIdx+1], NULL); + *((float *) par[parIdx]->value) = static_cast(input); + par[parIdx]->wasSet = true; + } + argIdx++; + } else if (typeid(double) == par[parIdx]->type) { + regex_t regex; + compileRegex(®ex, par[parIdx]->regex); + int nomatch = regexec(®ex, pargv[argIdx+1], 0, NULL, 0); + regfree(®ex); + if (nomatch){ + printUsageMessage(command, 0xFFFFFFFF); + Debug(Debug::ERROR) << "Error in argument " << par[parIdx]->name << "\n"; + EXIT(EXIT_FAILURE); + }else{ + *((double *) par[parIdx]->value) = strtod(pargv[argIdx+1], NULL); + par[parIdx]->wasSet = true; + } + argIdx++; + } else if (typeid(std::string) == par[parIdx]->type) { + std::string val(pargv[argIdx+1]); + if (Util::startWith("b64:", val)) { + val = base64_decode(val.c_str() + 4, val.size() - 4); + } + std::string* currVal = (std::string*)par[parIdx]->value; + currVal->assign(val); + par[parIdx]->wasSet = true; + argIdx++; + } else if (typeid(bool) == par[parIdx]->type) { + bool *value = (bool *) par[parIdx]->value; + if (argIdx + 1 == argc || pargv[argIdx+1][0] == '-') { + *value = !*value; + } else { + *value = parseBool(pargv[argIdx+1]); + argIdx++; + } + par[parIdx]->wasSet = true; + } else { + Debug(Debug::ERROR) << "Wrong parameter type in parseParameters. Please inform the developers\n"; + EXIT(EXIT_FAILURE); + } + + hasUnrecognizedParameter = false; + continue; + } + } + + if (hasUnrecognizedParameter) { + printUsageMessage(command, 0xFFFFFFFF); + + // Suggest some parameter that the user might have meant + std::vector::const_iterator index = par.end(); + int maxDistance = 0; + for (std::vector::const_iterator it = par.begin(); it != par.end(); ++it) { + int distance = DistanceCalculator::localLevenshteinDistance(parameter, (*it)->name); + if (distance > maxDistance) { + maxDistance = distance; + index = it; + } + } + + Debug(Debug::ERROR) << "Unrecognized parameter \"" << parameter << "\""; + if (index != par.end()) { + Debug(Debug::ERROR) << ". 
Did you mean \"" << (*index)->name << "\" (" << (*index)->display << ")?\n"; + } else { + Debug(Debug::ERROR) << "\n"; + } + + EXIT(EXIT_FAILURE); + } + + parametersFound++; + } else { + // parameter is actually a filename +#ifdef __CYGWIN__ + // normalize windows paths to cygwin unix paths + const char *path = pargv[argIdx]; + ssize_t size = cygwin_conv_path(CCP_WIN_A_TO_POSIX | CCP_RELATIVE, path, NULL, 0); + if (size < 0) { + Debug(Debug::ERROR) << "Could not convert cygwin path!\n"; + EXIT(EXIT_FAILURE); + } else { + char *posix = new char[size]; + if (cygwin_conv_path(CCP_WIN_A_TO_POSIX | CCP_RELATIVE, path, posix, size)) { + Debug(Debug::ERROR) << "Could not convert cygwin path!\n"; + EXIT(EXIT_FAILURE); + } + filenames.emplace_back(posix); + delete posix; + } +#else + filenames.emplace_back(pargv[argIdx]); +#endif + } + } + + if (MMseqsMPI::isMaster()) { + Debug::setDebugLevel(verbosity); + } + +#ifdef OPENMP + omp_set_num_threads(threads); +#endif +#ifndef OPENMP + threads = 1; +#endif + + + bool ignorePathCountChecks = command.databases.empty() == false && command.databases[0].specialType & DbType::ZERO_OR_ALL && filenames.size() == 0; + const size_t MAX_DB_PARAMETER = 6; + if (ignorePathCountChecks == false && command.databases.size() > MAX_DB_PARAMETER) { + Debug(Debug::ERROR) << "Use argv if you need more than " << MAX_DB_PARAMETER << " db parameters" << "\n"; + EXIT(EXIT_FAILURE); + } + + if (ignorePathCountChecks == false && filenames.size() < command.databases.size()){ + printUsageMessage(command, outputFlags); + Debug(Debug::ERROR) << "Not enough input paths provided. "; + if (command.databases.size() == 1) { + Debug(Debug::ERROR) << "1 path is required.\n"; + } else { + Debug(Debug::ERROR) << command.databases.size() << " paths are required.\n"; + } + EXIT(EXIT_FAILURE); + } + + bool isVar = false; + bool isStartVar = false; + bool isMiddleVar = false; + bool isEndVar = false; + if(command.databases.empty() == false && command.databases[0].validator != NULL) { + if (command.databases.size() >= 2) { + for(size_t i = 0; i < command.databases.size();i++){ + if(i == 0){ + isStartVar |= (command.databases[i].specialType & DbType::VARIADIC); + } else if(i == command.databases.size() - 1){ + isEndVar |= (command.databases[i].specialType & DbType::VARIADIC); + } else { + isMiddleVar |= (command.databases[i].specialType & DbType::VARIADIC); + } + + } + isVar = isStartVar | isMiddleVar | isEndVar; + } + if (ignorePathCountChecks == false && isVar == false && filenames.size() > command.databases.size()) { + printUsageMessage(command, outputFlags); + Debug(Debug::ERROR) << "Too many input paths provided. 
Only " << SSTR(command.databases.size()) << " are allowed\n"; + EXIT(EXIT_FAILURE); + } + } + switch (std::min(filenames.size(), MAX_DB_PARAMETER)) { + case 6: + db6 = filenames[5]; + db6Index = db6; + db6Index.append(".index"); + db6dbtype = db6; + db6dbtype.append(".dbtype"); + hdr6 = db6; + hdr6.append("_h"); + hdr6Index = hdr6; + hdr6Index.append(".index"); + hdr6dbtype = hdr6; + hdr6dbtype.append(".dbtype"); + // FALLTHROUGH + case 5: + db5 = filenames[4]; + db5Index = db5; + db5Index.append(".index"); + db5dbtype = db5; + db5dbtype.append(".dbtype"); + hdr5 = db5; + hdr5.append("_h"); + hdr5Index = hdr5; + hdr5Index.append(".index"); + hdr5dbtype = hdr5; + hdr5dbtype.append(".dbtype"); + // FALLTHROUGH + case 4: + db4 = filenames[3]; + db4Index = db4; + db4Index.append(".index"); + db4dbtype = db4; + db4dbtype.append(".dbtype"); + hdr4 = db4; + hdr4.append("_h"); + hdr4Index = hdr4; + hdr4Index.append(".index"); + hdr4dbtype = hdr4; + hdr4dbtype.append(".dbtype"); + // FALLTHROUGH + case 3: + db3 = filenames[2]; + db3Index = db3; + db3Index.append(".index"); + db3dbtype = db3; + db3dbtype.append(".dbtype"); + hdr3 = db3; + hdr3.append("_h"); + hdr3Index = hdr3; + hdr3Index.append(".index"); + hdr3dbtype = hdr3; + hdr3dbtype.append(".dbtype"); + // FALLTHROUGH + case 2: + db2 = filenames[1]; + db2Index = db2; + db2Index.append(".index"); + db2dbtype = db2; + db2dbtype.append(".dbtype"); + hdr2 = db2; + hdr2.append("_h"); + hdr2Index = hdr2; + hdr2Index.append(".index"); + hdr2dbtype = hdr2; + hdr2dbtype.append(".dbtype"); + // FALLTHROUGH + case 1: + db1 = filenames[0]; + db1Index = db1; + db1Index.append(".index"); + db1dbtype = db1; + db1dbtype.append(".dbtype"); + hdr1 = db1; + hdr1.append("_h"); + hdr1Index = hdr1; + hdr1Index.append(".index"); + hdr1dbtype = hdr1; + hdr1dbtype.append(".dbtype"); + break; + default: + // Do not abort execution if we expect a variable amount of parameters + if (parseFlags & PARSE_VARIADIC) + break; + // FALLTHROUGH + case 0: + if (parseFlags & PARSE_ALLOW_EMPTY) + break; + printUsageMessage(command, outputFlags); + printParameters(command.cmd, argc, pargv, par); + Debug(Debug::ERROR) << "Unrecognized parameters!" 
<< "\n"; + EXIT(EXIT_FAILURE); + } + + initMatrices(); + + if (ignorePathCountChecks == false) { + checkIfDatabaseIsValid(command, argc, pargv, isStartVar, isMiddleVar, isEndVar); + } + + if (printPar == true) { + printParameters(command.cmd, argc, pargv, par); + } +} \ No newline at end of file diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index 2d75912a..ccbd03c6 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -63,6 +63,8 @@ class LocalParameters : public Parameters { PARAMETER(SPLIT_NUM) PARAMETER(BUFFER_SIZE) PARAMETER(ACCESSION_LEVEL) + PARAMETER(DB_NAME) + PARAMETER(DB_DATE) // Test parameters PARAMETER(TEST_RANK) @@ -104,6 +106,8 @@ class LocalParameters : public Parameters { std::string tinfoPath; std::string libraryPath; std::string taxonomyPath; + std::string dbName; + std::string dbDate; int splitNum; size_t bufferSize; int accessionLevel; @@ -124,6 +128,13 @@ class LocalParameters : public Parameters { int printMode; std::string contamList; + void printParameters(const std::string &module, int argc, + const char* pargv[], + const std::vector &par); + + void parseParameters(int argc, const char *pargv[], const Command &command, bool printPar, int parseFlags, + int outputFlags); + private: LocalParameters(); diff --git a/src/commons/common.cpp b/src/commons/common.cpp index 9f54ac75..c1ebff2a 100644 --- a/src/commons/common.cpp +++ b/src/commons/common.cpp @@ -85,6 +85,15 @@ int loadDbParameters(LocalParameters &par) { while (getline(dbParametersFile, eachLine)) { std::vector tokens = Util::split(eachLine, "\t"); if (tokens[0] == "Reduced_alphabet") { + // if (stoi(tokens[1]) != par.reducedAA){ + // if (par.reducedAA == 0){ // DB with reduced AA + // cerr << "Warning: Current DB is built with reduced 15 amino acid alphabets." << endl; + // cerr << " --reduce-aa option will be ignored " << endl; + // } else { + // cerr << "Warning: Current DB is built with 20 amino acid alphabets." 
diff --git a/src/commons/common.cpp b/src/commons/common.cpp
index 9f54ac75..c1ebff2a 100644
--- a/src/commons/common.cpp
+++ b/src/commons/common.cpp
@@ -85,6 +85,15 @@ int loadDbParameters(LocalParameters &par) {
         while (getline(dbParametersFile, eachLine)) {
             std::vector<std::string> tokens = Util::split(eachLine, "\t");
             if (tokens[0] == "Reduced_alphabet") {
+                // if (stoi(tokens[1]) != par.reducedAA){
+                //     if (par.reducedAA == 0){ // DB with reduced AA
+                //         cerr << "Warning: Current DB is built with reduced 15 amino acid alphabets." << endl;
+                //         cerr << "         --reduced-aa option will be ignored" << endl;
+                //     } else {
+                //         cerr << "Warning: Current DB is built with 20 amino acid alphabets." << endl;
+                //         cerr << "         --reduced-aa option will be ignored" << endl;
+                //     }
+                // }
                 par.reducedAA = stoi(tokens[1]);
             } else if (tokens[0] == "Spaced_kmer_mask") {
                 par.spaceMask = tokens[1];
@@ -96,6 +105,10 @@ int loadDbParameters(LocalParameters &par) {
                 if (tokens[1] == "1" && par.accessionLevel == 0){
                     par.accessionLevel = 2;
                 }
+            } else if (tokens[0] == "DB_name") {
+                par.dbName = tokens[1];
+            } else if (tokens[0] == "Creation_date") {
+                par.dbDate = tokens[1];
             }
         }
         return 1;
diff --git a/src/metabuli.cpp b/src/metabuli.cpp
index 7783d91f..43a8e9de 100644
--- a/src/metabuli.cpp
+++ b/src/metabuli.cpp
@@ -10,7 +10,7 @@ const char* binary_name = "metabuli";
 const char* tool_name = "metabuli";
 const char* tool_introduction = "Metabuli is a taxonomical classifier that jointly analyzes amino acid and DNA sequences.";
-const char* main_author = "Jaebeom Kim ";
+const char* main_author = "Jaebeom Kim ";
 const char* show_extended_help = "1";
 const char* show_bash_info = nullptr;
 bool hide_base_commands = true;
diff --git a/src/workflow/build.cpp b/src/workflow/build.cpp
index eaf0c367..b9341bc3 100644
--- a/src/workflow/build.cpp
+++ b/src/workflow/build.cpp
@@ -13,6 +13,15 @@ void setDefaults_build(LocalParameters & par){
     par.maskMode = 1;
     par.bufferSize = 1'000'000'000;
     par.accessionLevel = 0;
+    // Get current date
+    time_t now = time(0);
+    tm *ltm = localtime(&now);
+    par.dbDate = to_string(1900 + ltm->tm_year) + "-" + to_string(1 + ltm->tm_mon) + "-" + to_string(ltm->tm_mday);
+
+    // Get a random numeric string for dbName from the current time
+    srand(time(NULL));
+    string randStr = to_string(rand());
+    par.dbName = randStr.substr(0, 32);
 }
 
 int build(int argc, const char **argv, const Command &command){
diff --git a/src/workflow/classify.cpp b/src/workflow/classify.cpp
index 91977368..495e97db 100644
--- a/src/workflow/classify.cpp
+++ b/src/workflow/classify.cpp
@@ -47,7 +47,6 @@ int classify(int argc, const char **argv, const Command& command)
     omp_set_num_threads(par.threads);
 #endif
 
-    cout << "Number of threads: " << par.threads << endl;
     Classifier * classifier = new Classifier(par);
     classifier->startClassify(par);
     delete classifier;

From 5793500be6f16afec0acf85bb7fc885ca5d96b7a Mon Sep 17 00:00:00 2001
From: Jaebeom Kim 
Date: Tue, 12 Sep 2023 10:49:17 +0900
Subject: [PATCH 31/65] Improve user interface
---
 README.md                       |  4 ++--
 src/commons/LocalParameters.cpp |  2 +-
 src/commons/ProdigalWrapper.cpp |  4 ++--
 src/metabuli.cpp                |  2 +-
 src/workflow/add_to_library.cpp |  4 ++--
 src/workflow/classify.cpp       | 15 +++++++++++++++
 6 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 3e6e9170..c240cc0b 100644
--- a/README.md
+++ b/README.md
@@ -173,7 +173,7 @@ Accessions that are not included in the `` will be skipped and
 ```
 # Get the list of absolute paths of files in your library
-find /library -name '*.fna' > library-files.txt
+find /library -type f -name '*.fna' > library-files.txt
 
 metabuli build [options]
 - DBDIR: The same DBDIR from the previous step.
@@ -216,7 +216,7 @@ This will add your FASTA files to DBDIR/library according to their species taxon
 #### 2. Build
 ```
 # Get the list of absolute paths of files in your library
-find /library -name '*.fna' > library-files.txt
+find /library -type f -name '*.fna' > library-files.txt
 
 metabuli build [options]
 - DBDIR: The same DBDIR from the previous step.
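A note on the default `dbDate` built in `setDefaults_build()` above: concatenating `tm_mon`/`tm_mday` with `to_string` yields unpadded dates such as `2023-9-12`. If zero-padded dates were wanted, `strftime()` would produce them directly; this is only an alternative sketch, not what the patch does:

```
// Alternative sketch: zero-padded creation date via strftime(), e.g. "2023-09-12".
#include <ctime>
#include <string>

std::string currentDate() {
    time_t now = time(nullptr);
    char buf[11];                                  // "YYYY-MM-DD" + '\0'
    strftime(buf, sizeof(buf), "%Y-%m-%d", localtime(&now));
    return std::string(buf);
}
```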
diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp
index 986ab86d..bf4a20f5 100644
--- a/src/commons/LocalParameters.cpp
+++ b/src/commons/LocalParameters.cpp
@@ -268,7 +268,7 @@ LocalParameters::LocalParameters() :
     // build
     build.push_back(&PARAM_THREADS);
     build.push_back(&REDUCED_AA);
-    build.push_back(&SPACED);
+    // build.push_back(&SPACED);
     build.push_back(&TAXONOMY_PATH);
     build.push_back(&SPLIT_NUM);
     build.push_back(&PARAM_MASK_PROBABILTY);
diff --git a/src/commons/ProdigalWrapper.cpp b/src/commons/ProdigalWrapper.cpp
index 7b6e39d2..6c27f516 100644
--- a/src/commons/ProdigalWrapper.cpp
+++ b/src/commons/ProdigalWrapper.cpp
@@ -290,8 +290,8 @@ int ProdigalWrapper::getNextSeq(char * line, int training) {
             bctr+=2; len++;
         }
         if(len >= MAX_SEQ) {
-            fprintf(stderr, "\n\nWarning: Sequence is long (max %d).\n", MAX_SEQ);
-            fprintf(stderr, "Use the first %d bases.\n\n", MAX_SEQ);
+            // fprintf(stderr, "\n\nWarning: Sequence is long (max %d).\n", MAX_SEQ);
+            // fprintf(stderr, "Use the first %d bases.\n\n", MAX_SEQ);
             break;
         }
     }
diff --git a/src/metabuli.cpp b/src/metabuli.cpp
index 43a8e9de..3d9188c2 100644
--- a/src/metabuli.cpp
+++ b/src/metabuli.cpp
@@ -61,7 +61,7 @@ std::vector<Command> commands = {
          "Assigning taxonomy label to query reads",
          nullptr,
          "Jaebeom Kim ",
-         " ",
+         " ",
          CITATION_SPACEPHARER,
          {{"FASTA", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfile},
           {"DB dir", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory},
diff --git a/src/workflow/add_to_library.cpp b/src/workflow/add_to_library.cpp
index 8abb3ff2..ee19855c 100644
--- a/src/workflow/add_to_library.cpp
+++ b/src/workflow/add_to_library.cpp
@@ -34,7 +34,7 @@ int addToLibrary(int argc, const char **argv, const Command &command){
     }
 
     // Load taxonomy
-    NcbiTaxonomy * taxonomy = loadTaxonomy(dbDir);
+    NcbiTaxonomy * taxonomy = loadTaxonomy(dbDir, par.taxonomyPath);
 
     // Load file names
     ifstream fileListFile;
@@ -60,7 +60,7 @@ int addToLibrary(int argc, const char **argv, const Command &command){
     char accession_version[2048];
     int taxID;
     fscanf(mappingFile, "%*s\t%*s\t%*s\t%*s");
-    while (fscanf(mappingFile, "%s\t%s\t%d\t%*d", accession, accession_version, &taxID) == 2) {
+    while (fscanf(mappingFile, "%s\t%s\t%d\t%*d", accession, accession_version, &taxID) == 3) {
         acc2taxid[string(accession_version)] = taxID;
         acc2taxid[string(accession)] = taxID;
     }
diff --git a/src/workflow/classify.cpp b/src/workflow/classify.cpp
index 495e97db..460bab34 100644
--- a/src/workflow/classify.cpp
+++ b/src/workflow/classify.cpp
@@ -34,10 +34,25 @@ int classify(int argc, const char **argv, const Command& command)
     par.parseParameters(argc, argv, command, true, Parameters::PARSE_ALLOW_EMPTY, 0);
 
     if (par.seqMode == 2) {
+        // Check if the second argument is a directory
+        if (FileUtil::directoryExists(par.filenames[1].c_str())) {
+            cerr << "Error: " << par.filenames[1] << " is a directory. Please specify a query file name." << endl;
+            cerr << "       For '--seq-mode 2', please provide two query files." << endl;
+            exit(1);
+        }
+
         if (!FileUtil::directoryExists(par.filenames[3].c_str())) {
             FileUtil::makeDir(par.filenames[3].c_str());
         }
     } else {
+        // Check if the second argument is a file
+        if (FileUtil::fileExists(par.filenames[1].c_str())
+            && !FileUtil::directoryExists(par.filenames[1].c_str())) {
+            cerr << "Error: " << par.filenames[1] << " is a file. Please specify a database directory." << endl;
+            cerr << "       For '--seq-mode 1' and '--seq-mode 3', please provide one query file." << endl;
+            exit(1);
+        }
+
         if (!FileUtil::directoryExists(par.filenames[2].c_str())) {
             FileUtil::makeDir(par.filenames[2].c_str());
         }

From 43bfa6f607fe4d79e02d1c211a64c36dcb38ebf7 Mon Sep 17 00:00:00 2001
From: Jaebeom Kim 
Date: Tue, 12 Sep 2023 14:00:51 +0900
Subject: [PATCH 32/65] 1) accession-level FileMerger 2) Support accessions
 with a prefix (NZ_...)
---
 data/metabulidatabases.sh       |  6 +--
 src/commons/FileMerger.cpp      | 44 +++++++++++-----------
 src/commons/FileMerger.h        |  3 ++
 src/commons/IndexCreator.cpp    | 66 +++++++++++++++++++++------------
 src/commons/common.cpp          | 40 +++++++++++++++++++-
 src/commons/common.h            |  2 +
 src/metabuli.cpp                |  6 +--
 src/workflow/add_to_library.cpp | 29 +++++++++------
 8 files changed, 131 insertions(+), 65 deletions(-)

diff --git a/data/metabulidatabases.sh b/data/metabulidatabases.sh
index 706c3135..653da5ae 100644
--- a/data/metabulidatabases.sh
+++ b/data/metabulidatabases.sh
@@ -115,10 +115,10 @@ case "${SELECTION}" in
         # INPUT_TYPE="METABULI_DB"
     ;;
     "RefSeq_virus")
-        if notExists "${TMP_PATH}/refseq_virus.tar.gz"; then
-            downloadFile "https://metabuli.steineggerlab.workers.dev/refseq_virus.tar.gz" "${TMP_PATH}/refseq_virus.tar.gz"
+        if notExists "${TMP_PATH}/refseq_virus+human.tar.gz"; then
+            downloadFile "https://metabuli.steineggerlab.workers.dev/refseq_virus+human.tar.gz" "${TMP_PATH}/refseq_virus+human.tar.gz"
         fi
-        tar zxvf "${TMP_PATH}/refseq_virus.tar.gz" -C "${OUTDB}"
+        tar zxvf "${TMP_PATH}/refseq_virus+human.tar.gz" -C "${OUTDB}"
         # push_back "${TMP_PATH}/refseq_virus"
         # INPUT_TYPE="METABULI_DB"
     ;;
diff --git a/src/commons/FileMerger.cpp b/src/commons/FileMerger.cpp
index f0301680..91d55bf6 100644
--- a/src/commons/FileMerger.cpp
+++ b/src/commons/FileMerger.cpp
@@ -1,6 +1,9 @@
 #include "FileMerger.h"
+#include "common.h"
 
 FileMerger::FileMerger(const LocalParameters & par) {
+    // Load parameters
+    dbDir = par.filenames[0];
     splitNum = par.splitNum;
     bufferSize = par.bufferSize;
     if (par.reducedAA == 1){
@@ -10,10 +13,11 @@ FileMerger::FileMerger(const LocalParameters & par) {
         MARKER = 16777215;
         MARKER = ~ MARKER;
     }
+    taxonomy = loadTaxonomy(dbDir, "");
 }
 
 FileMerger::~FileMerger() {
-
+    delete taxonomy;
 }
 
 //void FileMerger::mergeTargetFiles(std::vector diffIdxFileNames, std::vector infoFileNames, vector & taxIdListAtRank, vector & taxIdList) {
@@ -177,38 +181,32 @@ FileMerger::~FileMerger() {
 // Merge differential index and k-mer information files, reducing redundancy
 void FileMerger::mergeTargetFiles(const LocalParameters & par, int numOfSplits) {
     size_t writtenKmerCnt = 0;
-    const string dbDirectory = par.filenames[0];
-
-    // Taxonomy
-    NcbiTaxonomy taxonomy(par.taxonomyPath + "/names.dmp",
-                          par.taxonomyPath + "/nodes.dmp",
-                          par.taxonomyPath + "/merged.dmp");
-
+    
+    // Load taxonomy ids
     FILE * taxIdFile;
-    if((taxIdFile = fopen((string(dbDirectory) + "/taxID_list").c_str(),"r")) == NULL){
+    if((taxIdFile = fopen((string(dbDir) + "/taxID_list").c_str(),"r")) == NULL){
         cout<<"Cannot open the taxID list file."<<endl;
         return;
     }
     char taxID[100];
     unordered_map<TaxID, TaxID> taxId2speciesId;
-    while(feof(taxIdFile) == 0)
-    {
+    while(feof(taxIdFile) == 0) {
         fscanf(taxIdFile,"%s",taxID);
         TaxID taxId = atol(taxID);
-        TaxonNode const * taxon = taxonomy.taxonNode(taxId);
+        TaxonNode const * taxon = taxonomy->taxonNode(taxId);
         if (taxId == taxon->taxId){
-            TaxID speciesTaxID = taxonomy.getTaxIdAtRank(taxId, "species");
+            TaxID speciesTaxID = taxonomy->getTaxIdAtRank(taxId, "species");
             while (taxon->taxId != speciesTaxID) {
                 taxId2speciesId[taxon->taxId] = speciesTaxID;
-                taxon = taxonomy.taxonNode(taxon->parentTaxId);
+                taxon = taxonomy->taxonNode(taxon->parentTaxId);
             }
             taxId2speciesId[speciesTaxID] = speciesTaxID;
         } else { // merged
-            TaxID speciesTaxID = taxonomy.getTaxIdAtRank(taxId, "species");
+            TaxID speciesTaxID = taxonomy->getTaxIdAtRank(taxId, "species");
             while (taxon->taxId != speciesTaxID) {
                 taxId2speciesId[taxon->taxId] = speciesTaxID;
-                taxon = taxonomy.taxonNode(taxon->parentTaxId);
+                taxon = taxonomy->taxonNode(taxon->parentTaxId);
             }
             taxId2speciesId[speciesTaxID] = speciesTaxID;
             taxId2speciesId[taxId] = speciesTaxID;
@@ -217,9 +215,9 @@ void FileMerger::mergeTargetFiles(const LocalParameters & par, int numOfSplits)
     fclose(taxIdFile);
 
     // File names for the final DB
-    string mergedDiffFileName = dbDirectory + "/diffIdx";
-    string mergedInfoFileName = dbDirectory + "/info";
-    string diffIdxSplitFileName = dbDirectory + "/split";
+    string mergedDiffFileName = dbDir + "/diffIdx";
+    string mergedInfoFileName = dbDir + "/info";
+    string diffIdxSplitFileName = dbDir + "/split";
 
     // Files to write
     FILE * mergedDiffFile = fopen(mergedDiffFileName.c_str(), "wb");
@@ -246,8 +244,8 @@ void FileMerger::mergeTargetFiles(const LocalParameters & par, int numOfSplits)
     struct MmapedData<uint16_t> *diffFileList = new struct MmapedData<uint16_t>[numOfSplits];
     struct MmapedData<TargetKmerInfo> *infoFileList = new struct MmapedData<TargetKmerInfo>[numOfSplits];
     for (int file = 0; file < numOfSplits; file++) {
-        diffFileList[file] = mmapData<uint16_t>((dbDirectory + "/" + to_string(file) + "_diffIdx").c_str());
-        infoFileList[file] = mmapData<TargetKmerInfo>((dbDirectory + "/" + to_string(file) + "_info").c_str());
+        diffFileList[file] = mmapData<uint16_t>((dbDir + "/" + to_string(file) + "_diffIdx").c_str());
+        infoFileList[file] = mmapData<TargetKmerInfo>((dbDir + "/" + to_string(file) + "_info").c_str());
         maxIdxOfEachFiles[file] = diffFileList[file].fileSize / sizeof(uint16_t);
         numOfKmerBeforeMerge += infoFileList[file].fileSize / sizeof(TargetKmerInfo);
     }
@@ -329,7 +327,7 @@ void FileMerger::mergeTargetFiles(const LocalParameters & par, int numOfSplits)
         }
 
         if (taxIds.size() > 1) {
-            entryInfo.sequenceID = taxonomy.LCA(taxIds)->taxId;
+            entryInfo.sequenceID = taxonomy->LCA(taxIds)->taxId;
         } else {
             entryInfo.sequenceID = taxIds[0];
         }
@@ -388,7 +386,7 @@ void FileMerger::mergeTargetFiles(const LocalParameters & par, int numOfSplits)
     cout<<"Reference DB files you need are as below"<<endl;
diff --git a/src/commons/FileMerger.h b/src/commons/FileMerger.h
@@ -4,6 +4,7 @@
 #include 
 #include "IndexCreator.h"
+#include "NcbiTaxonomy.h"
 #include "printBinary.h"
 #include "common.h"
 
@@ -14,6 +15,8 @@ using namespace std;
 class FileMerger {
 private:
+    NcbiTaxonomy * taxonomy;
+    string dbDir;
     uint64_t MARKER;
     int splitNum;
     size_t bufferSize;
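The `taxId2speciesId` map built in `mergeTargetFiles()` above drives the redundancy reduction during merging: in the sorted k-mer stream, identical k-mers whose taxa fall under one species are collapsed into a single entry (the real code labels it with the LCA of the collected taxa via `NcbiTaxonomy::LCA()`). A toy, self-contained version of that collapse, with invented IDs and the species ID standing in for the LCA:

```
// Toy sketch of the same-species k-mer collapse in mergeTargetFiles().
// IDs are invented; the real code uses NcbiTaxonomy::LCA() for the label.
#include <cstdint>
#include <cstdio>
#include <unordered_map>
#include <vector>

struct Entry { uint64_t kmer; int taxId; };

int main() {
    std::unordered_map<int, int> taxId2speciesId = {{11, 1}, {12, 1}, {21, 2}};
    std::vector<Entry> sorted = {{5, 11}, {5, 12}, {7, 21}};  // sorted by k-mer
    std::vector<Entry> merged;
    size_t i = 0;
    while (i < sorted.size()) {
        size_t j = i + 1;
        while (j < sorted.size() && sorted[j].kmer == sorted[i].kmer &&
               taxId2speciesId[sorted[j].taxId] == taxId2speciesId[sorted[i].taxId]) {
            ++j;                          // redundant k-mer within one species
        }
        int label = (j - i > 1) ? taxId2speciesId[sorted[i].taxId] : sorted[i].taxId;
        merged.push_back({sorted[i].kmer, label});
        i = j;
    }
    for (const Entry &e : merged)
        printf("%llu -> %d\n", (unsigned long long) e.kmer, e.taxId);
    return 0;
}
```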
diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp
index b7945148..f6cbb6f7 100644
--- a/src/commons/IndexCreator.cpp
+++ b/src/commons/IndexCreator.cpp
@@ -190,6 +191,7 @@ void IndexCreator::makeBlocksForParallelProcessing() {
     }
     string eachFile;
     string seqHeader;
+    string accession_version;
     unordered_map<string, int> foundAcc2taxid;
 
     for (int i = 0; i < fileNum; ++i) {
@@ -198,8 +200,8 @@ void IndexCreator::makeBlocksForParallelProcessing() {
         fastaList[i].path = eachFile;
         processedSeqCnt.push_back(taxIdList.size());
         seqHeader = getSeqSegmentsWithHead(fastaList[i].sequences, eachFile, acc2taxid, foundAcc2taxid);
-        seqHeader = seqHeader.substr(1, seqHeader.find('.') - 1);
-        TaxID speciesTaxid = taxonomy->getTaxIdAtRank(acc2taxid[seqHeader], "species");
+        accession_version = seqHeader.substr(1, LocalUtil::getFirstWhiteSpacePos(seqHeader) - 1);
+        TaxID speciesTaxid = taxonomy->getTaxIdAtRank(searchAccession2TaxID(accession_version, acc2taxid), "species");
 
         // Split current file into blocks for parallel processing
         splitFastaForProdigalTraining(i, speciesTaxid);
@@ -246,19 +248,9 @@ void IndexCreator::makeBlocksForParallelProcessing_accession_level() {
         getline(fnaListFile, eachFile);
         fastaList[i].path = eachFile;
         processedSeqCnt.push_back(taxIdList.size());
         seqHeader = getSeqSegmentsWithHead(fastaList[i].sequences, eachFile, acc2taxid, newAcc2taxid);
-        // accession_version = seqHeader.substr(1, seqHeader.find('.') - 1);
-        accession = seqHeader.substr(1, seqHeader.find('.') - 1);
         accession_version = seqHeader.substr(1, LocalUtil::getFirstWhiteSpacePos(seqHeader) - 1);
-        // newAcc2taxid.emplace_back(accession_version, make_pair(acc2taxid[accession], newTaxID));
-        tempTaxIDList.push_back(acc2taxid[accession]);
-
-        // TaxID speciesTaxid = taxonomy->getTaxIdAtRank(acc2taxid[accession], "species");
-
-        // // Split current file into blocks for parallel processing
-        // splitFastaForProdigalTraining(i, speciesTaxid);
-        // fastaList[i].speciesID = speciesTaxid;
+        tempTaxIDList.push_back(searchAccession2TaxID(accession_version, acc2taxid));
     }
 
     // Edit taxonomy dump files
@@ -284,7 +276,6 @@ void IndexCreator::makeBlocksForParallelProcessing_accession_level() {
         fprintf(acc2taxidFile, "%s\t%d\t%d\n", it.first.c_str(), it.second.first, it.second.second);
     }
     fclose(acc2taxidFile);
-
 }
 
 void IndexCreator::splitFastaForProdigalTraining(int file_idx, TaxID speciesID) {
@@ -700,19 +691,30 @@ string IndexCreator::getSeqSegmentsWithHead(vector & seqSegments, vector
     vector seqSegmentsTmp;
     string accession;
     string accession_version;
+    int taxid;
     if (seqFile.is_open()) {
         getline(seqFile, firstLine, '\n');
-        accession = firstLine.substr(1, firstLine.find('.') - 1);
         accession_version = firstLine.substr(1, LocalUtil::getFirstWhiteSpacePos(firstLine) - 1);
-        newAcc2taxid.emplace_back(accession_version, make_pair(acc2taxid.at(accession), newTaxID));
+        taxid = searchAccession2TaxID(accession_version, acc2taxid);
+        if (taxid == 0) {
+            cerr << "Cannot find accession: " << accession_version << endl;
+            cerr << "Please run 'add-to-library' first." << endl;
+            exit(1);
+        }
+        newAcc2taxid.emplace_back(accession_version, make_pair(taxid, newTaxID));
         taxIdList.push_back(newTaxID++);
         while (getline(seqFile, eachLine, '\n')) {
             if (eachLine[0] == '>') {
-                accession = eachLine.substr(1, eachLine.find('.') - 1);
                 accession_version = eachLine.substr(1, LocalUtil::getFirstWhiteSpacePos(eachLine) - 1);
-                newAcc2taxid.emplace_back(accession_version, make_pair(acc2taxid.at(accession), newTaxID));
+                taxid = searchAccession2TaxID(accession_version, acc2taxid);
+                if (taxid == 0) {
+                    cerr << "Cannot find accession: " << accession_version << endl;
+                    cerr << "Please run 'add-to-library' first." << endl;
+                    exit(1);
+                }
+                newAcc2taxid.emplace_back(accession_version, make_pair(taxid, newTaxID));
                 taxIdList.push_back(newTaxID++);
                 pos = (size_t) seqFile.tellg();
                 seqSegmentsTmp.emplace_back(start, pos - eachLine.length() - 3, pos - eachLine.length() - start - 2);
@@ -744,16 +746,32 @@ string IndexCreator::getSeqSegmentsWithHead(vector & seqSegments, vector
     vector seqSegmentsTmp;
     vector headers;
     size_t seqCnt = taxIdList.size();
+    string accession_version;
+    int taxid;
+
     if (seqFile.is_open()) {
         getline(seqFile, firstLine, '\n');
-//        cout << firstLine << endl;
-        taxIdList.push_back(acc2taxid.at(firstLine.substr(1, firstLine.find('.') - 1)));
-        foundAcc2taxid[firstLine.substr(1, firstLine.find(' ') - 1)] = taxIdList.back();
+        accession_version = firstLine.substr(1, LocalUtil::getFirstWhiteSpacePos(firstLine) - 1);
+        taxid = searchAccession2TaxID(accession_version, acc2taxid);
+        if (taxid == 0) {
+            cerr << "Cannot find accession: " << accession_version << endl;
+            cerr << "Please run 'add-to-library' first." << endl;
+            exit(1);
+        }
+        taxIdList.push_back(taxid);
+
+        foundAcc2taxid[accession_version] = taxIdList.back();
        while (getline(seqFile, eachLine, '\n')) {
             if (eachLine[0] == '>') {
-//                cout << eachLine << endl;
-                taxIdList.push_back(acc2taxid.at(eachLine.substr(1, eachLine.find('.') - 1)));
-                foundAcc2taxid[eachLine.substr(1, eachLine.find(' ') - 1)] = taxIdList.back();
+                accession_version = eachLine.substr(1, LocalUtil::getFirstWhiteSpacePos(eachLine) - 1);
+                taxid = searchAccession2TaxID(accession_version, acc2taxid);
+                if (taxid == 0) {
+                    cerr << "Cannot find accession: " << accession_version << endl;
+                    cerr << "Please run 'add-to-library' first." << endl;
+                    exit(1);
+                }
+                taxIdList.push_back(taxid);
+                foundAcc2taxid[accession_version] = taxIdList.back();
                 pos = (size_t) seqFile.tellg();
                 seqSegmentsTmp.emplace_back(start, pos - eachLine.length() - 3, pos - eachLine.length() - start - 2);
                 start = pos - eachLine.length() - 1;
diff --git a/src/commons/common.cpp b/src/commons/common.cpp
index c1ebff2a..01334b75 100644
--- a/src/commons/common.cpp
+++ b/src/commons/common.cpp
@@ -115,4 +115,42 @@ int loadDbParameters(LocalParameters &par) {
         }
     }
     return 0;
-}
\ No newline at end of file
+}
+
+int searchAccession2TaxID(const std::string &name,
+                          const std::unordered_map<std::string, int> &acc2taxid) {
+    if (acc2taxid.find(name) != acc2taxid.end()) {
+        return acc2taxid.at(name);
+    }
+
+    // Cannot find it with the version --> remove the version number
+    size_t pos = name.find('.');
+    if (pos != std::string::npos) {
+        std::string nameWithoutVersion = name.substr(0, pos);
+        if (acc2taxid.find(nameWithoutVersion) != acc2taxid.end()) {
+            return acc2taxid.at(nameWithoutVersion);
+        }
+    }
+
+    // With a prefix? Ex) NZ_CP083375.1
+    pos = name.find('_');
+    std::string nameWithoutPrefix;
+    if (pos != std::string::npos) {
+        // Try without the prefix
+        nameWithoutPrefix = name.substr(pos + 1); // CP083375.1
+        if (acc2taxid.find(nameWithoutPrefix) != acc2taxid.end()) {
+            return acc2taxid.at(nameWithoutPrefix);
+        }
+
+        // Remove the version
+        pos = nameWithoutPrefix.find('.');
+        if (pos != std::string::npos) {
+            nameWithoutPrefix = nameWithoutPrefix.substr(0, pos); // CP083375
+            if (acc2taxid.find(nameWithoutPrefix) != acc2taxid.end()) {
+                return acc2taxid.at(nameWithoutPrefix);
+            }
+        }
+    }
+
+    return 0;
+}
diff --git a/src/commons/common.h b/src/commons/common.h
index 7749c39a..493f64cd 100644
--- a/src/commons/common.h
+++ b/src/commons/common.h
@@ -86,4 +86,6 @@ NcbiTaxonomy * loadTaxonomy(const std::string & dbDir, const std::string & taxon
 int loadDbParameters(LocalParameters & par);
 
+int searchAccession2TaxID(const std::string & name, const std::unordered_map<std::string, int> & acc2taxid);
+
 #endif //ADCLASSIFIER2_COMMON_H
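`searchAccession2TaxID()` above tries four lookups in order: the exact accession, the version-stripped form, the prefix-stripped form, and the prefix-and-version-stripped form (`NZ_CP083375.1` → `NZ_CP083375` → `CP083375.1` → `CP083375`), returning 0 when all fail. A usage sketch, assuming it is linked against `common.cpp`; the map contents are invented:

```
// Usage sketch for searchAccession2TaxID() (declared in common.h).
#include <cstdio>
#include <string>
#include <unordered_map>

int searchAccession2TaxID(const std::string &name,
                          const std::unordered_map<std::string, int> &acc2taxid);

int main() {
    std::unordered_map<std::string, int> acc2taxid = {{"CP083375", 562}};
    // Falls through exact, "NZ_CP083375", and "CP083375.1" before matching "CP083375".
    printf("%d\n", searchAccession2TaxID("NZ_CP083375.1", acc2taxid));  // 562
    printf("%d\n", searchAccession2TaxID("XX_000001.9", acc2taxid));    // 0: not found
    return 0;
}
```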
(2016)", "https://www.ncbi.nlm.nih.gov/refseq/", true, LocalParameters::DBTYPE_INDEX_DB, metabulidatabases_sh, metabulidatabases_sh_len, diff --git a/src/workflow/add_to_library.cpp b/src/workflow/add_to_library.cpp index ee19855c..69bedded 100644 --- a/src/workflow/add_to_library.cpp +++ b/src/workflow/add_to_library.cpp @@ -77,26 +77,33 @@ int addToLibrary(int argc, const char **argv, const Command &command){ while (kseq->ReadEntry()) { const KSeqWrapper::KSeqEntry & e = kseq->entry; - // Extract accession and Remove the version number - string accession = string(e.name.s); - size_t pos = accession.find('.'); - if (pos != string::npos) { accession = accession.substr(0, pos); } - - // Skip if accession is not in the mapping file - if (acc2taxid.find(accession) == acc2taxid.end()) { - cout << "During processing " << fileNames[i] << ", accession " << accession << + int taxID = searchAccession2TaxID(e.name.s, acc2taxid); + if (taxID == 0) { + cout << "During processing " << fileNames[i] << ", accession " << e.name.s << " is not found in the mapping file. It is skipped." << endl; - unmapped.push_back(accession); + unmapped.push_back(e.name.s); continue; } + // string accession = string(e.name.s); + // size_t pos = accession.find('.'); + // if (pos != string::npos) { accession = accession.substr(0, pos); } + + // // Skip if accession is not in the mapping file + // if (acc2taxid.find(accession) == acc2taxid.end()) { + // cout << "During processing " << fileNames[i] << ", accession " << accession << + // " is not found in the mapping file. It is skipped." << endl; + // unmapped.push_back(accession); + // continue; + // } // Get species taxID - int speciesTaxID = taxonomy->getTaxIdAtRank(acc2taxid[accession], "species"); + int speciesTaxID = taxonomy->getTaxIdAtRank(taxID, "species"); // Skip if species taxID is not found if (speciesTaxID == 0) { - cout << "During processing " << fileNames[i] << ", accession " << accession << + cout << "During processing " << fileNames[i] << ", accession " << e.name.s << " is not matched to any species. It is skipped." << endl; + unmapped.push_back(e.name.s); continue; } From c2d805633d198069d99693eafcdcdf42cd0732fc Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Thu, 14 Sep 2023 00:21:30 +0900 Subject: [PATCH 33/65] 1) edit readme 2) remove some printed logs 3) fix the problme of empty 'my.accession2taxid' --- README.md | 19 +++++++++++++++---- lib/prodigal/gene.cpp | 2 +- src/commons/IndexCreator.cpp | 2 +- src/commons/LocalParameters.cpp | 2 +- src/workflow/add_to_library.cpp | 4 ++++ 5 files changed, 22 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index c240cc0b..d7cb466c 100644 --- a/README.md +++ b/README.md @@ -164,7 +164,9 @@ The steps for building a database with NCBI or GTDB taxonomy are described below metabuli add-to-library - FASTA list: A file containing absolute paths of each FASTA file. - accession2taxid: A path to NCBI-style accession2taxid. -- DBDIR: Sequences will be stored in 'DBDIR/library'. +- DBDIR: Sequences will be stored in 'DBDIR/library'. + +** When resume is needed, remove the files in DBDIR/library and run the command again. ``` It groups your sequences into separate files according to their species. Accessions that are not included in the `` will be skipped and listed in `unmapped.txt`. @@ -191,17 +193,24 @@ This will generate **diffIdx**, **info**, **split**, and **taxID_list** and some ### To build a database with GTDB taxonomy #### 1. 
Prepare GTDB taxonomy and accession2taxid -*Requirements*: You need assembly FASTA files whose file name (or path) includes the assembly accession. +*Requirements*: +You need assembly FASTA files whose file name (or path) includes the assembly accession. If you downloaded assemblies using `ncbi-genome-download`, you probably don't have to care about it. The regular expression of assembly accessions is (GC[AF]_[0-9].[0-9]) ``` # 1. -In the 'util' directory +# 1-1. Move to the 'util' directory +cd METABULI_DIR/util + +# 1-2. Run prepare_gtdb_taxonomy.sh ./prepare_gtdb_taxonomy.sh - DBDIR : Result files are stored in 'DBDIR/taxonomy'. + +** Please clone Metabuli's repository to use this script. +** It is not provided in the precompiled binaries or bioconda package. ``` -This will generate taxonomy dump files and `assacc_to_taxid.tsv` with other files. +In `DBDIR/taxonomy`, it will generate taxonomy `dmp` files and `assacc_to_taxid.tsv` with other files. ``` # 2. @@ -210,6 +219,8 @@ metabuli add-to-library --assembly true Each path must include a corresponding assembly accession. - accession2taxid : 'assacc_to_taxid.tsv' from the previous step - DBDIR : The same DBDIR from the previous step. + +** When resume is needed, remove the files in DBDIR/library and run the command again. ``` This will add your FASTA files to DBDIR/library according to their species taxonomy ID and generate 'my.accession2taxid' diff --git a/lib/prodigal/gene.cpp b/lib/prodigal/gene.cpp index 3cbd7f01..40debaf1 100644 --- a/lib/prodigal/gene.cpp +++ b/lib/prodigal/gene.cpp @@ -51,7 +51,7 @@ int add_genes(struct _gene *glist, struct _node *nod, int dbeg) { } path = nod[path].tracef; if(ctr == MAX_GENES) { - fprintf(stderr, "warning, max # of genes exceeded, truncating...\n"); + // fprintf(stderr, "warning, max # of genes exceeded, truncating...\n"); return ctr; } } diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp index f6cbb6f7..d3cb7db4 100644 --- a/src/commons/IndexCreator.cpp +++ b/src/commons/IndexCreator.cpp @@ -1004,7 +1004,7 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer, munmap(fastaFile.data, fastaFile.fileSize + 1); } else { // Withdraw the reservation if the buffer is full. - cout << "Buffer is full. Withdraw the reservation." << endl; + // cout << "Buffer is full. Withdraw the reservation." 
<< endl; checker[i] = false; __sync_fetch_and_add(&hasOverflow, 1); __sync_fetch_and_sub(&kmerBuffer.startIndexOfReserve, estimatedKmerCnt); diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index bf4a20f5..9c79fdea 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -294,7 +294,7 @@ LocalParameters::LocalParameters() : // classify.push_back(&MIN_COVERED_POS); // classify.push_back(&PRINT_LOG); // classify.push_back(&MAX_GAP); -// classify.push_back(&TAXONOMY_PATH); + classify.push_back(&TAXONOMY_PATH); classify.push_back(&PARAM_MASK_RESIDUES); classify.push_back(&PARAM_MASK_PROBABILTY); classify.push_back(&RAM_USAGE); diff --git a/src/workflow/add_to_library.cpp b/src/workflow/add_to_library.cpp index 69bedded..e6b5c9e0 100644 --- a/src/workflow/add_to_library.cpp +++ b/src/workflow/add_to_library.cpp @@ -174,6 +174,10 @@ int addToLibrary(int argc, const char **argv, const Command &command){ KSeqWrapper* kseq = KSeqFactory(fileNames[i].c_str()); while (kseq->ReadEntry()){ const KSeqWrapper::KSeqEntry & e = kseq->entry; + // Extract accession + string accession = string(e.name.s); + acc2taxid[accession] = assembly2taxid[assemblyID]; + // Write to file FILE *file = fopen((dbDir + "/library/" + to_string(speciesTaxID) + ".fna").c_str(), "a"); fprintf(file, ">%s %s\n", e.name.s, e.comment.s); From ecbc9aa587cdf11f063156e557d23110aabb922a Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Mon, 25 Sep 2023 15:06:27 +0900 Subject: [PATCH 34/65] fix errors in 1) query indexing 2) grade --- src/commons/Classifier.cpp | 18 --------------- src/commons/FileMerger.cpp | 2 ++ src/commons/IndexCreator.cpp | 7 +++--- src/commons/KmerExtractor.cpp | 2 +- src/commons/KmerMatcher.cpp | 7 +++++- src/commons/QueryIndexer.cpp | 26 +++++++++++++--------- src/commons/SeqIterator.cpp | 1 + src/commons/Taxonomer.cpp | 20 ++++++----------- src/commons/common.cpp | 6 ----- src/util/grade.cpp | 42 +++++++++++------------------------ 10 files changed, 50 insertions(+), 81 deletions(-) diff --git a/src/commons/Classifier.cpp b/src/commons/Classifier.cpp index 3384ed88..58130092 100644 --- a/src/commons/Classifier.cpp +++ b/src/commons/Classifier.cpp @@ -98,27 +98,10 @@ void Classifier::startClassify(const LocalParameters &par) { kseq2); numOfTatalQueryKmerCnt += kmerBuffer.startIndexOfReserve; -//#ifdef OPENMP -// if (par.printLog == 1) { -// omp_set_num_threads(1); -// } else { -// omp_set_num_threads(par.threads); -// } -//#endif - // Search matches between query and target k-mers kmerMatcher->matchKmers(&kmerBuffer, &matchBuffer); kmerMatcher->sortMatches(&matchBuffer); - -//#ifdef OPENMP -// if (par.printLog == 1) { -// omp_set_num_threads(1); -// } else { -// omp_set_num_threads(par.threads); -// } -//#endif - // Classify queries based on the matches taxonomer->assignTaxonomy(matchBuffer.buffer, matchBuffer.startIndexOfReserve, queryList, par); processedSeqCnt += queryReadSplit[splitIdx].end - queryReadSplit[splitIdx].start; @@ -146,5 +129,4 @@ void Classifier::startClassify(const LocalParameters &par) { free(matchBuffer.buffer); delete kseq1; delete kseq2; - } diff --git a/src/commons/FileMerger.cpp b/src/commons/FileMerger.cpp index 91d55bf6..291c1316 100644 --- a/src/commons/FileMerger.cpp +++ b/src/commons/FileMerger.cpp @@ -305,6 +305,8 @@ void FileMerger::mergeTargetFiles(const LocalParameters & par, int numOfSplits) int hasSeenOtherStrains = 0; taxIds.clear(); taxIds.push_back(entryInfo.sequenceID); // Wrong + + // Scan redundant k-mers 
diff --git a/src/commons/FileMerger.cpp b/src/commons/FileMerger.cpp
index 91d55bf6..291c1316 100644
--- a/src/commons/FileMerger.cpp
+++ b/src/commons/FileMerger.cpp
@@ -305,6 +305,8 @@ void FileMerger::mergeTargetFiles(const LocalParameters & par, int numOfSplits)
         int hasSeenOtherStrains = 0;
         taxIds.clear();
         taxIds.push_back(entryInfo.sequenceID); // Wrong
+
+        // Scan redundant k-mers
         while(taxId2speciesId[entryInfo.sequenceID] == taxId2speciesId[lookingInfos[idxOfMin].sequenceID]){
             if(entryKmer != lookingKmers[idxOfMin]) break;
diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp
index d3cb7db4..a08283c4 100644
--- a/src/commons/IndexCreator.cpp
+++ b/src/commons/IndexCreator.cpp
@@ -131,7 +131,6 @@ void IndexCreator::updateIndex(const LocalParameters &par) {
 
     // Train Prodigal for each species
     time_t prodigalStart = time(nullptr);
-    // trainProdigal();
     time_t prodigalEnd = time(nullptr);
     cout << "Prodigal training time: " << prodigalEnd - prodigalStart << " seconds" << endl;
 
@@ -521,6 +520,8 @@ void IndexCreator::reduceRedundancy(TargetKmerBuffer & kmerBuffer, size_t * uniq
             hasSeenOtherStrains = 0;
             taxIds.clear();
             taxIds.push_back(taxIdList[lookingKmer->info.sequenceID]);
+
+            // Scan redundancy
             while(lookingKmer->taxIdAtRank == kmerBuffer.buffer[i].taxIdAtRank){
                 if (lookingKmer->ADkmer != kmerBuffer.buffer[i].ADkmer) {
                     break;
@@ -882,7 +883,7 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer,
                     seq = kseq_init(&buffer);
                     kseq_read(seq);
                     lengthOfTrainingSeq = seq->seq.l;
-                    cout << "T: " << seq->name.s << " " << lengthOfTrainingSeq << " " << estimatedKmerCnt << endl;
+                    // cout << "T: " << seq->name.s << " " << lengthOfTrainingSeq << " " << estimatedKmerCnt << endl;
 
                     // Train prodigal
                     prodigal->is_meta = 0;
@@ -1009,7 +1010,7 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer,
                 __sync_fetch_and_add(&hasOverflow, 1);
                 __sync_fetch_and_sub(&kmerBuffer.startIndexOfReserve, estimatedKmerCnt);
             }
-            cout << totalLength << " " << prodigal->fng << endl;
+            // cout << totalLength << " " << prodigal->fng << endl;
             delete prodigal;
         }
diff --git a/src/commons/KmerExtractor.cpp b/src/commons/KmerExtractor.cpp
index 91f5ee8f..4d656cdf 100644
--- a/src/commons/KmerExtractor.cpp
+++ b/src/commons/KmerExtractor.cpp
@@ -86,7 +86,7 @@ void KmerExtractor::fillQueryKmerBufferParallel(KSeqWrapper *kseq1,
             size_t queryIdx = processedQueryNum - currentQueryNum + i;
             // Get k-mer count
             auto kmerCnt = LocalUtil::getQueryKmerNumber(reads1[i].length(), spaceNum);
-
+            // Ignore short reads
             if (kmerCnt < 1) { continue; }
diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp
index 9455070e..646bd1cf 100644
--- a/src/commons/KmerMatcher.cpp
+++ b/src/commons/KmerMatcher.cpp
@@ -178,7 +178,8 @@ int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer,
         // Divide query k-mers into blocks
         size_t splitWidth = queryKmerNum / (threads - 1);
         querySplits.emplace_back(0, splitWidth - 1, splitWidth, diffIdxSplits.data[0]);
-        for (size_t i = 1; i < threads; i++) {
+        size_t i = 1;
+        for (; (i < threads) && (splitWidth * i < queryKmerNum); i++) {
             queryAA = AminoAcidPart(queryKmerList[splitWidth * i].ADkmer);
             bool needLastTargetBlock = true;
             for (size_t j = 0; j < numOfDiffIdxSplits_use; j++) {
@@ -205,6 +206,10 @@ int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer,
                 }
             }
         }
+
+        if (i != threads) {
+            threads = querySplits.size();
+        }
     }
 
     bool *splitCheckList = (bool *) malloc(sizeof(bool) * threads);
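The QueryIndexer hunk that follows closes a split at the read *before* the one that would overflow the RAM budget, using the rule `bytesPerKmer * kmerCnt + 200 * seqCnt > availableRam`. A toy run of that rule with invented constants (Metabuli's actual `bytesPerKmer` and budget differ):

```
// Toy run of the QueryIndexer split rule; constants are illustrative only.
#include <cstdio>
#include <vector>

int main() {
    const size_t bytesPerKmer = 16, perReadOverhead = 200, availableRam = 4000;
    std::vector<size_t> readKmerCnts = {50, 60, 80, 70, 90};  // k-mers per read
    size_t start = 0, kmerCnt = 0, seqCnt = 0;
    for (size_t i = 0; i < readKmerCnts.size(); ++i) {
        kmerCnt += readKmerCnts[i];
        ++seqCnt;
        if (bytesPerKmer * kmerCnt + perReadOverhead * seqCnt > availableRam) {
            printf("split: reads [%zu, %zu)\n", start, i);  // close before read i
            start = i;
            kmerCnt = readKmerCnts[i];                      // read i starts the next split
            seqCnt = 1;
        }
    }
    printf("split: reads [%zu, %zu)\n", start, readKmerCnts.size());
    return 0;   // prints "split: reads [0, 3)" and "split: reads [3, 5)"
}
```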
kmerCnt = 0; size_t seqCnt = 0; size_t start = 0; while (kseq->ReadEntry()) { readNum_1++; - const KSeqWrapper::KSeqEntry &e = kseq->entry; - totalReadLength += e.sequence.l; - size_t currentKmerCnt = LocalUtil::getQueryKmerNumber(e.sequence.l, spaceNum); - kmerCnt += currentKmerCnt; seqCnt++; + totalReadLength += kseq->entry.sequence.l; + size_t currentKmerCnt = LocalUtil::getQueryKmerNumber(kseq->entry.sequence.l, spaceNum); + kmerCnt += currentKmerCnt; + // std::cout << "currentKmerCnt: " << kmerCnt << "\n"; + if (bytesPerKmer * kmerCnt + ((size_t) 200 * seqCnt) > availableRam) { - querySplits.emplace_back(start, readNum_1, kmerCnt - currentKmerCnt); + querySplits.emplace_back(start, readNum_1 - 1, kmerCnt - currentKmerCnt); kmerCnt = currentKmerCnt; - start = readNum_1; + start = readNum_1 - 1; seqCnt = 1; } } querySplits.emplace_back(start, readNum_1, kmerCnt); + // Print elements + for (auto & querySplit : querySplits) { + std::cout << "start: " << querySplit.start << "\t"; + std::cout << "end: " << querySplit.end << "\t"; + std::cout << "kmerCnt: " << querySplit.kmerCnt << "\n"; + } delete kseq; } else { KSeqWrapper* kseq_1 = KSeqFactory(queryPath_1.c_str()); @@ -87,9 +93,9 @@ void QueryIndexer::indexQueryFile() { } if (bytesPerKmer * kmerCnt + ((size_t) 200 * seqCnt_1) > availableRam) { - querySplits.emplace_back(start, readNum_1, kmerCnt - currentKmerCnt); + querySplits.emplace_back(start, readNum_1 - 1, kmerCnt - currentKmerCnt); kmerCnt = currentKmerCnt; - start = readNum_1; + start = readNum_1 - 1; seqCnt_1 = 1; } diff --git a/src/commons/SeqIterator.cpp b/src/commons/SeqIterator.cpp index 525ac651..55255342 100644 --- a/src/commons/SeqIterator.cpp +++ b/src/commons/SeqIterator.cpp @@ -355,6 +355,7 @@ void SeqIterator::fillQueryKmerBuffer(const char *seq, int seqLen, QueryKmerBuff posToWrite++; } } + // cout << "posToWrite: " << posToWrite << endl; } void diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index 663ebba4..adf2dbdd 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -75,13 +75,11 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, vector & queryList, const LocalParameters &par) { TaxID selectedTaxon; -// if (par.printLog) { + +// if (true) { // cout << "# " << currentQuery << " " << queryList[currentQuery].name << endl; // for (size_t i = offset; i < end + 1; i++) { -// cout << taxId2genusId[matchList[i].targetId] << " " << taxId2speciesId[matchList[i].targetId] << -// " " << matchList[i].targetId << " " << matchList[i].qInfo.frame << " "; -// print_binary16(16, matchList[i].rightEndHamming); -// cout << " " << matchList[i].qInfo.pos << " " << int(matchList[i].hamming) << " " << int(matchList[i].redundancy) << endl; +// cout << matchList[i].targetId << " " << matchList[i].qInfo.frame << " " << matchList[i].qInfo.pos << " " << int(matchList[i].hamming) << " " << int(matchList[i].redundancy) << endl; // } // } @@ -111,14 +109,11 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, } } -// if (par.printLog) { +// if (true) { // cout << "# " << currentQuery << " " << queryList[currentQuery].name << " filtered\n"; // for (size_t i = 0; i < genusMatches.size(); i++) { -// cout << taxId2genusId[genusMatches[i].targetId] << " " << taxId2speciesId[genusMatches[i].targetId] << -// " " << genusMatches[i].targetId << " " << genusMatches[i].qInfo.frame << " "; -// print_binary16(16, genusMatches[i].rightEndHamming); -// cout << " " << genusMatches[i].qInfo.pos << " " << int(genusMatches[i].hamming) << " " << 
int(genusMatches[i].redundancy) << endl; -// } +// cout << genusMatches[i].targetId << " " << genusMatches[i].qInfo.frame << " " << genusMatches[i].qInfo.pos << " " << int(genusMatches[i].hamming) << " " << int(genusMatches[i].redundancy) << endl; +// } // cout << "Genus score: " << genusScore.score << "\n"; // } @@ -481,7 +476,6 @@ void Taxonomer::remainConsecutiveMatches(vector & curFrameMatches for (const auto& entry : linkedMatches) { if (!used.count(entry.first)) { used.insert(entry.first); - vector curMatches; DFS(entry.first, linkedMatches, filteredMatchIdx, 0, MIN_DEPTH, used, idx2depth); } } @@ -506,7 +500,7 @@ size_t Taxonomer::DFS(size_t curMatchIdx, const map> & li depth++; size_t maxDepth = 0; size_t returnDepth = 0; - if (linkedMatches.find(curMatchIdx) == linkedMatches.end()) { //|| linkedMatches.at(curMatchIdx).empty()) { + if (linkedMatches.find(curMatchIdx) == linkedMatches.end()) { // reached a leaf node idx2depth[curMatchIdx] = depth; if (depth > MIN_DEPTH) { diff --git a/src/commons/common.cpp b/src/commons/common.cpp index 01334b75..edba4fe0 100644 --- a/src/commons/common.cpp +++ b/src/commons/common.cpp @@ -4,16 +4,11 @@ #include #include #include -// #include "MathUtil.h" #include "Debug.h" #include "Reporter.h" #include "Util.h" #include "sys/mman.h" -// #include -// #include -// #include - void process_mem_usage(double &vm_usage, double &resident_set) { vm_usage = 0.0; resident_set = 0.0; @@ -68,7 +63,6 @@ NcbiTaxonomy *loadTaxonomy(const std::string &dbDir, taxonomyDir + "/nodes.dmp", taxonomyDir + "/merged.dmp"); } - return new NcbiTaxonomy(dbDir + "/taxonomy/names.dmp", dbDir + "/taxonomy/nodes.dmp", dbDir + "/taxonomy/merged.dmp"); diff --git a/src/util/grade.cpp b/src/util/grade.cpp index e009a361..79dbaaa1 100644 --- a/src/util/grade.cpp +++ b/src/util/grade.cpp @@ -121,7 +121,7 @@ int grade(int argc, const char **argv, const Command &command) { } cout << "Classification results loaded" << endl; - size_t numberOfFiles = mappingFileNames.size(); + size_t numberOfFiles = readClassificationFileNames.size(); vector results; results.resize(numberOfFiles); @@ -172,37 +172,21 @@ ncbiTaxonomy, par, cout, printColumnsIdx, cerr) mappingFile = mappingFileNames[i]; readClassificationFileName = readClassificationFileNames[i]; - if (par.testType == "cami-long"){ - // Load mapping file - ifstream mappingFileFile; - mappingFileFile.open(mappingFile); - string line; - if (mappingFileFile.is_open()) { - getline(mappingFileFile, line); - while (getline(mappingFileFile, line)) { - vector splitLine = Util::split(line, "\t"); - assacc2taxid[splitLine[0]] = stoi(splitLine[2]); - } - } else { - cerr << "Cannot open file for answer" << endl; + // Load the mapping file (answer sheet) (accession to taxID) + string key, value; + ifstream map; + map.open(mappingFile); + size_t numberOfAnswers = 0; + if (map.is_open()) { + while (getline(map, key, '\t')) { + getline(map, value, '\n'); + assacc2taxid[key] = stoi(value); + numberOfAnswers++; } } else { - // Load the mapping file (answer sheet) (accession to taxID) - string key, value; - ifstream map; - map.open(mappingFile); - size_t numberOfAnswers = 0; - if (map.is_open()) { - while (getline(map, key, '\t')) { - getline(map, value, '\n'); - assacc2taxid[key] = stoi(value); - numberOfAnswers++; - } - } else { - cout << "Cannot open file for answer" << endl; - } - map.close(); + cout << "Cannot open file for answer" << endl; } + map.close(); // Load classification results string resultLine; From 04d20aad60616405ab8c04d70881c26524058d1b 
Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Wed, 4 Oct 2023 17:07:07 +0900 Subject: [PATCH 35/65] fix unintended initialization of prodigal results --- src/commons/ProdigalWrapper.cpp | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/commons/ProdigalWrapper.cpp b/src/commons/ProdigalWrapper.cpp index 6c27f516..9e9c6aec 100644 --- a/src/commons/ProdigalWrapper.cpp +++ b/src/commons/ProdigalWrapper.cpp @@ -51,6 +51,12 @@ ProdigalWrapper::ProdigalWrapper() { void ProdigalWrapper:: trainASpecies(char * genome){ + // Initialize memories to reuse them + memset(seq, 0, (slen / 4 + 1) * sizeof(unsigned char)); + memset(rseq, 0, (slen / 4 + 1) * sizeof(unsigned char)); + memset(useq, 0, (slen / 8 + 1) * sizeof(unsigned char)); + memset(nodes, 0, nn * sizeof(struct _node)); + nn = 0; slen = 0; ipath = 0; nmask = 0; // Initialize training information memset(mlist, 0, MAX_MASKS*sizeof(mask)); @@ -116,17 +122,17 @@ trainASpecies(char * genome){ train_starts_sd(seq, rseq, slen, nodes, nn, &tinf); determine_sd_usage(&tinf); if(force_nonsd == 1) tinf.uses_sd = 0; - if(tinf.uses_sd == 0) train_starts_nonsd(seq, rseq, slen, nodes, nn, &tinf); + if(tinf.uses_sd == 0) train_starts_nonsd(seq, rseq, slen, nodes, nn, &tinf); +} +void ProdigalWrapper::trainMeta(char *genome) { // Initialize memories to reuse them memset(seq, 0, (slen / 4 + 1) * sizeof(unsigned char)); memset(rseq, 0, (slen / 4 + 1) * sizeof(unsigned char)); memset(useq, 0, (slen / 8 + 1) * sizeof(unsigned char)); memset(nodes, 0, nn * sizeof(struct _node)); nn = 0; slen = 0; ipath = 0; nmask = 0; -} -void ProdigalWrapper::trainMeta(char *genome) { // Initialize training information memset(&tinf, 0, sizeof(struct _training)); tinf.st_wt = 4.35; @@ -173,15 +179,16 @@ void ProdigalWrapper::trainMeta(char *genome) { max_score = nodes[ipath].score; } } +} +void ProdigalWrapper::getPredictedGenes(char * genome) { // Initialize memories to reuse them + // Initialization should be done here not at the end of the function memset(seq, 0, (slen / 4 + 1) * sizeof(unsigned char)); memset(rseq, 0, (slen / 4 + 1) * sizeof(unsigned char)); memset(useq, 0, (slen / 8 + 1) * sizeof(unsigned char)); - memset(nodes, 0, nn * sizeof(struct _node)); - nn = 0; slen = 0; ipath = 0; nmask = 0; -} -void ProdigalWrapper::getPredictedGenes(char * genome){ + memset(nodes, 0, nn*sizeof(struct _node)); + nn = 0; slen = 0; nmask = 0; ipath=0; /* Initialize structure */ slen = getNextSeq(genome, 0); @@ -241,13 +248,6 @@ void ProdigalWrapper::getPredictedGenes(char * genome){ tweak_final_starts(genes, ng, nodes, nn, meta[max_phase].tinf); record_gene_data(genes, ng, nodes, meta[max_phase].tinf, num_seq); } - - // Initialize memories to reuse them - memset(seq, 0, (slen / 4 + 1) * sizeof(unsigned char)); - memset(rseq, 0, (slen / 4 + 1) * sizeof(unsigned char)); - memset(useq, 0, (slen / 8 + 1) * sizeof(unsigned char)); - memset(nodes, 0, nn*sizeof(struct _node)); - nn = 0; slen = 0; nmask = 0; ipath=0; } int ProdigalWrapper::getNextSeq(char * line, int training) { From 907796dbec2eeb9226b71b88d7d4f958fda96fe2 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Wed, 4 Oct 2023 17:07:33 +0900 Subject: [PATCH 36/65] remove codes used for debugging --- src/commons/SeqIterator.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/commons/SeqIterator.cpp b/src/commons/SeqIterator.cpp index 55255342..d9677a93 100644 --- a/src/commons/SeqIterator.cpp +++ b/src/commons/SeqIterator.cpp @@ -446,10 +446,10 @@ 
SeqIterator::fillBufferWithKmerFromBlock(const PredictedBlock &block, const char kmerBuffer.buffer[posToWrite] = {UINT64_MAX, -1, 0, false}; } else { addDNAInfo_TargetKmer(tempKmer, seq, block, kmerCnt); - if(posToWrite >= kmerBuffer.bufferSize - 2) { - cout << "HERE " << posToWrite << endl; - return -1; - } + // if(posToWrite >= kmerBuffer.bufferSize - 2) { + // cout << "HERE " << posToWrite << endl; + // return -1; + // } kmerBuffer.buffer[posToWrite] = {tempKmer, taxIdAtRank, seqID, false}; } posToWrite++; From 8468627b7880057c3e0bcf8eae8505ae90942475 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Tue, 10 Oct 2023 14:23:36 +0900 Subject: [PATCH 37/65] mapping2taxon for metamaps without EM --- src/LocalCommandDeclarations.h | 1 + src/commons/LocalParameters.h | 2 +- src/metabuli.cpp | 10 +++++++++- src/util/CMakeLists.txt | 1 + 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/LocalCommandDeclarations.h b/src/LocalCommandDeclarations.h index 000aa648..56b470f5 100644 --- a/src/LocalCommandDeclarations.h +++ b/src/LocalCommandDeclarations.h @@ -13,5 +13,6 @@ extern int applyThreshold(int argc, const char **argv, const Command& command); extern int binning2report(int argc, const char **argv, const Command& command); extern int filterByGenus(int argc, const char **argv, const Command& command); extern int databaseReport(int argc, const char **argv, const Command& command); +extern int mapping2taxon(int argc, const char **argv, const Command& command); #endif //ADCLASSIFIER2_LOCALCOMMANDDECLARATIONS_H diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index ccbd03c6..0a8be0c7 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -31,7 +31,7 @@ class LocalParameters : public Parameters { std::vector binning2report; std::vector filterByGenus; std::vector databaseReport; - + std::vector mapping2taxon; // Superkingdom taxonomy id PARAMETER(VIRUS_TAX_ID) diff --git a/src/metabuli.cpp b/src/metabuli.cpp index 40a8f2c3..82f42a76 100644 --- a/src/metabuli.cpp +++ b/src/metabuli.cpp @@ -129,7 +129,15 @@ std::vector commands = { CITATION_SPACEPHARER, {{"Binning Result", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}, {"Genus list", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}, - {"TAXONOMY DIR", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}}} + {"TAXONOMY DIR", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}}}, + {"mapping2taxon", mapping2taxon, &localPar.mapping2taxon, COMMAND_EXPERT, + "It takes a mapping file (multiple targets for each read) and generates a read2taxon file (one target for each read)", + nullptr, + "Jaebeom Kim ", + " ", + CITATION_SPACEPHARER, + {{"mapping file", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}, + {"taxonomy directory", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}}} }; std::vector externalThreshold = {}; diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index 6a68f515..89e7f0f9 100644 --- a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -6,4 +6,5 @@ set(util_source_files util/report.cpp util/grade.cpp util/database-report.cpp + util/mapping2taxon.cpp PARENT_SCOPE) \ No newline at end of file From 9c62d6c505084527b654eaae58609b2594314d99 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Thu, 12 Oct 2023 00:16:20 +0900 Subject: [PATCH 38/65] new parameter: minimum number of ssp. 
specific matches for lower-rank classification --- src/commons/LocalParameters.cpp | 8 ++++++++ src/commons/LocalParameters.h | 2 ++ src/commons/Taxonomer.cpp | 3 ++- src/commons/Taxonomer.h | 1 + src/workflow/classify.cpp | 1 + 5 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index 9c79fdea..dc9ffe04 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -143,6 +143,13 @@ LocalParameters::LocalParameters() : typeid(int), (void *) &matchPerKmer, "^[0-9]+$"), + MIN_SS_MATCH(MIN_SS_MATCH_ID, + "--min-ss-match", + "Min. num. of ssp.-specific matches for ssp. classification", + "Min. number of ssp.-specific matches for ssp. classification", + typeid(int), + (void *) &minSSMatch, + "^[0-9]+$"), LIBRARY_PATH(LIBRARY_PATH_ID, "--library-path", "Path to library where the FASTA files are stored", @@ -300,6 +307,7 @@ LocalParameters::LocalParameters() : classify.push_back(&RAM_USAGE); classify.push_back(&MATCH_PER_KMER); classify.push_back(&ACCESSION_LEVEL); + classify.push_back(&MIN_SS_MATCH); // filter filter.push_back(&PARAM_THREADS); diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index 0a8be0c7..858d73a6 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -55,6 +55,7 @@ class LocalParameters : public Parameters { PARAMETER(MIN_CONS_CNT) PARAMETER(MIN_CONS_CNT_EUK) PARAMETER(MATCH_PER_KMER) + PARAMETER(MIN_SS_MATCH) // DB build parameters PARAMETER(LIBRARY_PATH) @@ -101,6 +102,7 @@ class LocalParameters : public Parameters { int maxGap; int minConsCntEuk; int matchPerKmer; + int minSSMatch; // Database creation std::string tinfoPath; diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index adf2dbdd..0f97a3dc 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -18,6 +18,7 @@ Taxonomer::Taxonomer(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxon maxGap = par.maxGap; minCoveredPos = par.minCoveredPos; accessionLevel = par.accessionLevel; + minSSMatch = par.minSSMatch; } Taxonomer::~Taxonomer() { @@ -308,7 +309,7 @@ TaxID Taxonomer::BFS(const unordered_map & cladeCnt, TaxID r if (cladeCnt.at(root).children.empty()) { // root is a leaf return root; } - unsigned int maxCnt = 3; + unsigned int maxCnt = minSSMatch; unsigned int currentCnt; vector bestChildren; for (auto it = cladeCnt.at(root).children.begin(); it != cladeCnt.at(root).children.end(); it++) { diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h index 6c597dd2..45687923 100644 --- a/src/commons/Taxonomer.h +++ b/src/commons/Taxonomer.h @@ -31,6 +31,7 @@ class Taxonomer { int maxGap; int minCoveredPos; int accessionLevel; + int minSSMatch; struct MatchBlock { MatchBlock(size_t start, size_t end, int id) : start(start), end(end), id(id) {} diff --git a/src/workflow/classify.cpp b/src/workflow/classify.cpp index 460bab34..0ce445ff 100644 --- a/src/workflow/classify.cpp +++ b/src/workflow/classify.cpp @@ -25,6 +25,7 @@ void setClassifyDefaults(LocalParameters & par){ par.maskProb = 0.9; par.matchPerKmer = 4; par.accessionLevel = 0; + par.minSSMatch = 3; } int classify(int argc, const char **argv, const Command& command) From ba21d6fa33003c3ac4cb5f534749f687ea89a1f6 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Thu, 12 Oct 2023 00:16:43 +0900 Subject: [PATCH 39/65] fix missed new line --- src/util/grade.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/util/grade.cpp b/src/util/grade.cpp index 
79dbaaa1..a5712d63 100644
--- a/src/util/grade.cpp
+++ b/src/util/grade.cpp
@@ -314,6 +314,7 @@ ncbiTaxonomy, par, cout, printColumnsIdx, cerr)
         for (const auto & value : idx2values[idx]) {
             fnFile << value << "\t";
         }
+        fnFile << endl;
     }
     fnFile.close();
 }
@@ -482,4 +483,4 @@ char compareTaxon_hivExclusion(TaxID shot, TaxID target, CountAtRank & count){
         count.FP++;
         return 'X';
     }
-}
\ No newline at end of file
+}

From e505ee2e699e2956b465d1e528adbe8556c7025a Mon Sep 17 00:00:00 2001
From: Jaebeom Kim
Date: Tue, 17 Oct 2023 15:44:05 +0900
Subject: [PATCH 40/65] There was a minor error during extending the last ORF.
 Now it is fixed, and performance improved a little

---
 src/commons/IndexCreator.cpp |   2 +-
 src/commons/SeqIterator.cpp  | 109 +++++++++++++++++++++--------------
 2 files changed, 66 insertions(+), 45 deletions(-)

diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp
index a08283c4..2e011d7e 100644
--- a/src/commons/IndexCreator.cpp
+++ b/src/commons/IndexCreator.cpp
@@ -961,7 +961,7 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer,
             }
         } else { // Reverse complement
             reverseCompliment = seqIterator.reverseCompliment(seq->seq.s, seq->seq.l);
-
+            // Get extended ORFs
             prodigal->getPredictedGenes(reverseCompliment);
             prodigal->removeCompletelyOverlappingGenes();

diff --git a/src/commons/SeqIterator.cpp b/src/commons/SeqIterator.cpp
index d9677a93..dc49b367 100644
--- a/src/commons/SeqIterator.cpp
+++ b/src/commons/SeqIterator.cpp
@@ -628,56 +628,77 @@ void SeqIterator::getExtendedORFs(struct _gene *genes, struct _node *nodes, vect
         }
     }

-    //For the last gene
-    if (find(intergenicKmerList.begin(), intergenicKmerList.end(), leftKmerHash) !=
-        intergenicKmerList.end()) { //extension to left
-        if (!isReverse) { //forward
+    // For the last gene
+    // Extend to the end of the genome
+    isReverse = !(nodes[genes[numOfGene - 1].start_ndx].strand == 1);
+    rightEnd = length - 1;
+    if (isReverse) {
+        frame = (genes[numOfGene - 1].end - 1) % 3;
+        while (rightEnd % 3 != frame) rightEnd--;
+    }
+    // If left region is not covered, cover it.
+ leftEnd = genes[numOfGene - 1].begin - 1; + if (hasBeenExtendedToLeft) { + leftEnd = genes[numOfGene - 2].end - 1 - 22; + if (!isReverse) { frame = (genes[numOfGene - 1].begin - 1) % 3; - leftEnd = genes[numOfGene - 2].end - 1 - 22; while (leftEnd % 3 != frame) leftEnd++; - blocks.emplace_back(leftEnd, length - 1, 1); - blockIdx++; - } else { // reverse - frame = (genes[numOfGene - 1].end - 1) % 3; - rightEnd = length - 1; - while (rightEnd % 3 != frame) rightEnd--; - blocks.emplace_back(genes[numOfGene - 2].end - 22 - 1, rightEnd, -1); - blockIdx++; - } - } else { //extension to right - if (hasBeenExtendedToLeft) { - if (!isReverse) { //forward - frame = (genes[numOfGene - 1].begin - 1) % 3; - leftEnd = genes[numOfGene - 2].end - 1 - 22; - while (leftEnd % 3 != frame) leftEnd++; - blocks.emplace_back(leftEnd, length - 1, 1); - blockIdx++; - } else { - frame = (genes[numOfGene - 1].end - 1) % 3; - rightEnd = length - 1; - while (rightEnd % 3 != frame) rightEnd--; - blocks.emplace_back(genes[numOfGene - 2].end - 22 - 1, rightEnd, -1); - blockIdx++; - } - } else { - if (!isReverse) { - blocks.emplace_back(genes[numOfGene - 1].begin, length - 1, 1); - blockIdx++; - } else { - frame = (genes[numOfGene - 1].end - 1) % 3; - rightEnd = length - 1; - while (rightEnd % 3 != frame) rightEnd--; - blocks.emplace_back(genes[numOfGene - 1].begin - 1, rightEnd, -1); - blockIdx++; - } } - - //If current intergenic sequences is new, update intergenicKmerList. - if (find(intergenicKmerList.begin(), intergenicKmerList.end(), rightKmerHash) == intergenicKmerList.end()) { + } + blocks.emplace_back(leftEnd, rightEnd, isReverse ? -1 : 1); + if (find(intergenicKmerList.begin(), intergenicKmerList.end(), rightKmerHash) == intergenicKmerList.end()) { intergenicKmerList.push_back(rightKmerHash); - } } + // if (find(intergenicKmerList.begin(), intergenicKmerList.end(), leftKmerHash) != + // intergenicKmerList.end()) { //extension to left + // if (!isReverse) { //forward + // frame = (genes[numOfGene - 1].begin - 1) % 3; + // leftEnd = genes[numOfGene - 2].end - 1 - 22; + // while (leftEnd % 3 != frame) leftEnd++; + // blocks.emplace_back(leftEnd, length - 1, 1); + // blockIdx++; + // } else { // reverse + // frame = (genes[numOfGene - 1].end - 1) % 3; + // rightEnd = length - 1; + // while (rightEnd % 3 != frame) rightEnd--; + // blocks.emplace_back(genes[numOfGene - 2].end - 22 - 1, rightEnd, -1); + // blockIdx++; + // } + // } else { //extension to right + // if (hasBeenExtendedToLeft) { + // if (!isReverse) { //forward + // frame = (genes[numOfGene - 1].begin - 1) % 3; + // leftEnd = genes[numOfGene - 2].end - 1 - 22; + // while (leftEnd % 3 != frame) leftEnd++; + // blocks.emplace_back(leftEnd, length - 1, 1); + // blockIdx++; + // } else { + // frame = (genes[numOfGene - 1].end - 1) % 3; + // rightEnd = length - 1; + // while (rightEnd % 3 != frame) rightEnd--; + // blocks.emplace_back(genes[numOfGene - 2].end - 22 - 1, rightEnd, -1); + // blockIdx++; + // } + // } else { + // if (!isReverse) { + // blocks.emplace_back(genes[numOfGene - 1].begin, length - 1, 1); + // blockIdx++; + // } else { + // frame = (genes[numOfGene - 1].end - 1) % 3; + // rightEnd = length - 1; + // while (rightEnd % 3 != frame) rightEnd--; + // blocks.emplace_back(genes[numOfGene - 1].begin - 1, rightEnd, -1); + // blockIdx++; + // } + // } + + // //If current intergenic sequences is new, update intergenicKmerList. 
+    // if (find(intergenicKmerList.begin(), intergenicKmerList.end(), rightKmerHash) == intergenicKmerList.end()) {
+    //     intergenicKmerList.push_back(rightKmerHash);
+    //     }
+    // }
+
     free(newIntergenicKmer);
     free(leftKmer);
     free(rightKmer);

From ffe587bad3df2ffa2f15ca359724cc35acfe306c Mon Sep 17 00:00:00 2001
From: Jaebeom Kim
Date: Wed, 18 Oct 2023 16:46:33 +0900
Subject: [PATCH 41/65] remove unused variable

---
 src/commons/Taxonomer.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp
index 0f97a3dc..40f9302b 100644
--- a/src/commons/Taxonomer.cpp
+++ b/src/commons/Taxonomer.cpp
@@ -87,8 +87,6 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery,
     // Get the best genus for current query
     vector genusMatches;
     genusMatches.reserve(end - offset + 1);
-
-    int res;
     TaxonScore genusScore(0, 0, 0, 0);
     if (par.seqMode == 2) {
         if (par.spaceMask != "11111111"){

From bc74ffd99e7e81e2b733160408eebe839bc47e09 Mon Sep 17 00:00:00 2001
From: Jaebeom Kim
Date: Fri, 20 Oct 2023 17:38:10 +0900
Subject: [PATCH 42/65] first commit

---
 src/commons/KmerMatcher.cpp | 123 ++++++++++++++++++++++--------------
 1 file changed, 76 insertions(+), 47 deletions(-)

diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp
index 646bd1cf..df603657 100644
--- a/src/commons/KmerMatcher.cpp
+++ b/src/commons/KmerMatcher.cpp
@@ -159,57 +159,87 @@ int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer,
     // Each split has start and end points of query list + proper offset point of target k-mer list
     std::vector querySplits;
     uint64_t queryAA;
-
-    if (threads == 1) { //Single thread
-        querySplits.emplace_back(0, queryKmerNum - 1, queryKmerNum, diffIdxSplits.data[0]);
-    } else if (threads == 2) { //Two threads
-        size_t splitWidth = queryKmerNum / 2;
-        querySplits.emplace_back(0, splitWidth - 1, splitWidth, diffIdxSplits.data[0]);
-        for (size_t tSplitCnt = 0; tSplitCnt < numOfDiffIdxSplits_use; tSplitCnt++) {
-            queryAA = AminoAcidPart(queryKmerList[splitWidth].ADkmer);
-            if (queryAA <= AminoAcidPart(diffIdxSplits.data[tSplitCnt].ADkmer)) {
-                tSplitCnt = tSplitCnt - (tSplitCnt != 0);
-                querySplits.emplace_back(splitWidth, queryKmerNum - 1, queryKmerNum - splitWidth,
-                                         diffIdxSplits.data[tSplitCnt]);
+    size_t quotient = queryKmerNum / threads;
+    size_t remainder = queryKmerNum % threads;
+    size_t startIdx = 0;
+    size_t endIdx = 0; // endIdx is inclusive
+    for (size_t i = 0; i < threads; i++) {
+        endIdx = startIdx + quotient - 1;
+        if (remainder > 0) {
+            endIdx++;
+            remainder--;
+        }
+        bool needLastTargetBlock = true;
+        queryAA = AminoAcidPart(queryKmerList[startIdx].ADkmer);
+        for (size_t j = 0; j < numOfDiffIdxSplits_use; j ++) {
+            if (queryAA <= AminoAcidPart(diffIdxSplits.data[j].ADkmer)) {
+                j = j - (j != 0);
+                querySplits.emplace_back(startIdx, endIdx, endIdx - startIdx + 1, diffIdxSplits.data[j]);
+                needLastTargetBlock = false;
                 break;
             }
         }
-    } else { //More than two threads
-        // Devide query k-mers into blocks
-        size_t splitWidth = queryKmerNum / (threads - 1);
-        querySplits.emplace_back(0, splitWidth - 1, splitWidth, diffIdxSplits.data[0]);
-        size_t i = 1;
-        for (; (i < threads) && (splitWidth * i < queryKmerNum); i++) {
-            queryAA = AminoAcidPart(queryKmerList[splitWidth * i].ADkmer);
-            bool needLastTargetBlock = true;
-            for (size_t j = 0; j < numOfDiffIdxSplits_use; j++) {
-                if (queryAA <= AminoAcidPart(diffIdxSplits.data[j].ADkmer)) {
-                    j = j - (j != 0);
-                    if (i != threads - 1) {
-                        querySplits.emplace_back(splitWidth * i, splitWidth * (i + 1) - 1,
splitWidth, - diffIdxSplits.data[j]); - } else { - querySplits.emplace_back(splitWidth * i, queryKmerNum - 1, queryKmerNum - splitWidth * i, - diffIdxSplits.data[j]); - } - needLastTargetBlock = false; - break; - } - } - if (needLastTargetBlock) { - if (i != threads - 1) { // If it is not the last split - querySplits.emplace_back(splitWidth * i, splitWidth * (i + 1) - 1, splitWidth, - diffIdxSplits.data[numOfDiffIdxSplits_use - 2]); - } else { - querySplits.emplace_back(splitWidth * i, queryKmerNum - 1, queryKmerNum - splitWidth * i, - diffIdxSplits.data[numOfDiffIdxSplits_use - 2]); - } - } + if (needLastTargetBlock) { + querySplits.emplace_back(startIdx, endIdx, endIdx - startIdx + 1, diffIdxSplits.data[numOfDiffIdxSplits_use - 2]); } + startIdx = endIdx + 1; + } + + // if (threads == 1) { //Single thread + // querySplits.emplace_back(0, queryKmerNum - 1, queryKmerNum, diffIdxSplits.data[0]); + // } else if (threads == 2) { //Two threads + // size_t splitWidth = queryKmerNum / 2; + // querySplits.emplace_back(0, splitWidth - 1, splitWidth, diffIdxSplits.data[0]); + // for (size_t tSplitCnt = 0; tSplitCnt < numOfDiffIdxSplits_use; tSplitCnt++) { + // queryAA = AminoAcidPart(queryKmerList[splitWidth].ADkmer); + // if (queryAA <= AminoAcidPart(diffIdxSplits.data[tSplitCnt].ADkmer)) { + // tSplitCnt = tSplitCnt - (tSplitCnt != 0); + // querySplits.emplace_back(splitWidth, queryKmerNum - 1, queryKmerNum - splitWidth, + // diffIdxSplits.data[tSplitCnt]); + // break; + // } + // } + // } else { //More than two threads + // // Devide query k-mers into blocks + // size_t splitWidth = queryKmerNum / (threads - 1); + // querySplits.emplace_back(0, splitWidth - 1, splitWidth, diffIdxSplits.data[0]); + // size_t i = 1; + // for (; (i < threads) && (splitWidth * i < queryKmerNum); i++) { + // queryAA = AminoAcidPart(queryKmerList[splitWidth * i].ADkmer); + // bool needLastTargetBlock = true; + // for (size_t j = 0; j < numOfDiffIdxSplits_use; j++) { + // if (queryAA <= AminoAcidPart(diffIdxSplits.data[j].ADkmer)) { + // j = j - (j != 0); + // if (i != threads - 1) { + // querySplits.emplace_back(splitWidth * i, splitWidth * (i + 1) - 1, splitWidth, + // diffIdxSplits.data[j]); + // } else { + // querySplits.emplace_back(splitWidth * i, queryKmerNum - 1, queryKmerNum - splitWidth * i, + // diffIdxSplits.data[j]); + // } + // needLastTargetBlock = false; + // break; + // } + // } + // if (needLastTargetBlock) { + // if (i != threads - 1) { // If it is not the last split + // querySplits.emplace_back(splitWidth * i, splitWidth * (i + 1) - 1, splitWidth, + // diffIdxSplits.data[numOfDiffIdxSplits_use - 2]); + // } else { + // querySplits.emplace_back(splitWidth * i, queryKmerNum - 1, queryKmerNum - splitWidth * i, + // diffIdxSplits.data[numOfDiffIdxSplits_use - 2]); + // } + // } + // } + + // if (i != threads) { + // threads = querySplits.size(); + // } + // } - if (i != threads) { - threads = querySplits.size(); - } + // Print query splits + for (size_t i = 0; i < querySplits.size(); i++) { + cout << i << "\t" << querySplits[i].start << "\t" << querySplits[i].end << endl; } bool *splitCheckList = (bool *) malloc(sizeof(bool) * threads); @@ -411,7 +441,6 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI // cout << (int) getHammingDistanceSum(currentQuery, currentTargetKmer) << "\t"; // print_binary16(16, getHammings(currentQuery, currentTargetKmer)); cout << endl; // } - if (unlikely(BufferSize < diffIdxBufferIdx + 7)){ loadBuffer(diffIdxFp, diffIdxBuffer, 
diffIdxBufferIdx, BufferSize, ((int)(BufferSize - diffIdxBufferIdx)) * -1 ); From 9bfeab18b128e37679b412d24e39947975060b43 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Fri, 20 Oct 2023 17:56:02 +0900 Subject: [PATCH 43/65] print diffIdxSplits --- src/commons/IndexCreator.h | 8 ++++++++ src/commons/KmerMatcher.cpp | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/src/commons/IndexCreator.h b/src/commons/IndexCreator.h index 33bc1c66..48968fc5 100644 --- a/src/commons/IndexCreator.h +++ b/src/commons/IndexCreator.h @@ -166,6 +166,14 @@ class IndexCreator{ public: static void splitSequenceFile(vector & seqSegments, MmapedData seqFile); + static void printIndexSplitList(DiffIdxSplit * splitList) { + for (int i = 0; i < 4096; i++) { + cout << splitList[i].infoIdxOffset << " " << + splitList[i].diffIdxOffset << " " << + splitList[i].ADkmer << endl; + } + } + string getSeqSegmentsWithHead(vector & seqSegments, const string & seqFileName, const unordered_map & acc2taxid, diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp index df603657..778e79a5 100644 --- a/src/commons/KmerMatcher.cpp +++ b/src/commons/KmerMatcher.cpp @@ -1,4 +1,5 @@ #include "KmerMatcher.h" +#include "IndexCreator.h" #include "Kmer.h" #include "Mmap.h" #include @@ -145,6 +146,8 @@ int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer, } } + IndexCreator::printIndexSplitList(diffIdxSplits.data); + // Filter out meaningless target splits size_t numOfDiffIdxSplits = diffIdxSplits.fileSize / sizeof(DiffIdxSplit); size_t numOfDiffIdxSplits_use = numOfDiffIdxSplits; @@ -155,6 +158,11 @@ int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer, } } + + cout << numOfDiffIdxSplits_use << endl; + IndexCreator::printIndexSplitList(diffIdxSplits.data); + + // Divide query k-mer list into blocks for multi threading. 
     // Each split has start and end points of query list + proper offset point of target k-mer list
     std::vector querySplits;

From 802d070bb490f773d99a62b44c33028b39f3335d Mon Sep 17 00:00:00 2001
From: Jaebeom Kim
Date: Sat, 21 Oct 2023 09:14:56 +0900
Subject: [PATCH 44/65] fix error in IndexCreator::writeTargetFilesAndSplits()

---
 src/commons/IndexCreator.cpp | 32 ++++++++++++++++++++++++++------
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp
index 2e011d7e..cae63d6f 100644
--- a/src/commons/IndexCreator.cpp
+++ b/src/commons/IndexCreator.cpp
@@ -407,17 +407,34 @@ void IndexCreator::writeTargetFilesAndSplits(TargetKmer * kmerBuffer, size_t & k
     DiffIdxSplit splitList[par.splitNum];
     memset(splitList, 0, sizeof(DiffIdxSplit) * par.splitNum);
     size_t splitWidth = uniqKmerCnt / par.splitNum;
+    size_t remainder = uniqKmerCnt % par.splitNum;
     size_t splitCnt = 1;
+    size_t start = 0;
     for (size_t i = 1; i < (size_t) par.splitNum; i++) {
-        for (size_t j = uniqKmerIdx[0] + splitWidth * i; j + 1 < uniqKmerCnt; j++) {
-            if (AminoAcidPart(kmerBuffer[j].ADkmer) != AminoAcidPart(kmerBuffer[j + 1].ADkmer)) {
-                if (kmerBuffer[j].ADkmer != splitList[splitCnt - 1].ADkmer){
-                    splitList[splitCnt].ADkmer = kmerBuffer[j].ADkmer;
-                    splitCnt ++;
-                }
+        start = start + splitWidth;
+        if (remainder > 0) {
+            start++;
+            remainder--;
+        }
+        for (size_t j = start; j + 1 < start + splitWidth; j++) {
+            if (AminoAcidPart(kmerBuffer[uniqKmerIdx[j]].ADkmer)
+                != AminoAcidPart(kmerBuffer[uniqKmerIdx[j + 1]].ADkmer)) {
+                splitList[splitCnt].ADkmer = kmerBuffer[uniqKmerIdx[j + 1]].ADkmer;
+                cout << splitList[splitCnt].ADkmer << endl;
+                splitCnt++;
                 break;
             }
        }
+        // for (size_t j = uniqKmerIdx[0] + splitWidth * i; j + 1 < uniqKmerCnt; j++) { // here is a bug
+        //     if (AminoAcidPart(kmerBuffer[j].ADkmer) != AminoAcidPart(kmerBuffer[j + 1].ADkmer)) {
+        //         if (kmerBuffer[j].ADkmer != splitList[splitCnt - 1].ADkmer){
+        //             splitList[splitCnt].ADkmer = kmerBuffer[j].ADkmer;
+        //             cout << splitList[splitCnt].ADkmer << endl;
+        //             splitCnt ++;
+        //         }
+        //         break;
+        //     }
+        // }
     }

     FILE * diffIdxFile = fopen(diffIdxFileName.c_str(), "wb");
@@ -446,6 +463,8 @@ void IndexCreator::writeTargetFilesAndSplits(TargetKmer * kmerBuffer, size_t & k
             if((splitIdx < splitCnt) && (lastKmer == splitList[splitIdx].ADkmer)){
                 splitList[splitIdx].diffIdxOffset = totalDiffIdx;
                 splitList[splitIdx].infoIdxOffset = write;
+                cout << "Split " << splitIdx << " at " << splitList[splitIdx].infoIdxOffset << " " <<
+                splitList[splitIdx].diffIdxOffset << " " << splitList[splitIdx].ADkmer << endl;
                 splitIdx ++;
             }
         }
@@ -454,6 +473,7 @@ void IndexCreator::writeTargetFilesAndSplits(TargetKmer * kmerBuffer, size_t & k
     cout<<"written k-mer count: "<< write << endl;
     flushKmerBuf(diffIdxBuffer, diffIdxFile, localBufIdx);
+    printIndexSplitList(splitList);
     fwrite(splitList, sizeof(DiffIdxSplit), par.splitNum, diffIdxSplitFile);
     free(diffIdxBuffer);
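[Editor's note] Patches 42 and 44 above both replace fixed-width splitting with the same even-partition
idiom: divide n items into k contiguous blocks whose sizes differ by at most one, handing one extra item
to each of the first n % k blocks. Below is a minimal, self-contained sketch of that idiom; the function
name and types are illustrative only, not Metabuli code. Both patches additionally snap block boundaries
to k-mer group boundaries (patch 42 via diffIdxSplits, patch 44 via the amino-acid part of adjacent
k-mers); the sketch omits that step.

    #include <cstddef>
    #include <iostream>
    #include <utility>
    #include <vector>

    // Partition n items into at most k contiguous [start, end] blocks (end inclusive).
    // The first n % k blocks get one extra item, so block sizes differ by at most one.
    std::vector<std::pair<size_t, size_t>> evenPartition(size_t n, size_t k) {
        std::vector<std::pair<size_t, size_t>> blocks;
        if (k == 0) return blocks;
        size_t quotient = n / k;
        size_t remainder = n % k;
        size_t start = 0;
        for (size_t i = 0; i < k && start < n; i++) {
            size_t size = quotient + (remainder > 0 ? 1 : 0);
            if (remainder > 0) remainder--;
            size_t end = start + size - 1; // inclusive, like endIdx in patch 42
            blocks.emplace_back(start, end);
            start = end + 1;
        }
        return blocks;
    }

    int main() {
        for (const auto & b : evenPartition(10, 4)) { // prints 0..2, 3..5, 6..7, 8..9
            std::cout << b.first << " .. " << b.second << '\n';
        }
        return 0;
    }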
From 1119a09e8d7770429ed60c217a4e227acc9dc9bb Mon Sep 17 00:00:00 2001
From: Jaebeom Kim
Date: Sat, 21 Oct 2023 09:19:11 +0900
Subject: [PATCH 45/65] remove prints

---
 src/commons/FileMerger.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/commons/FileMerger.cpp b/src/commons/FileMerger.cpp
index 291c1316..46924337 100644
--- a/src/commons/FileMerger.cpp
+++ b/src/commons/FileMerger.cpp
@@ -257,6 +257,7 @@ void FileMerger::mergeTargetFiles(const LocalParameters & par, int numOfSplits)
     int offsetListIdx = 1;
     for(size_t os = 0; os < splitNum; os++){
         offsetList[os] = os * sizeOfSplit;
+        // cout << os * sizeOfSplit << endl;
     }
     offsetList[splitNum] = UINT64_MAX;
     DiffIdxSplit splitList[splitNum];

From 3f3bf9d410770c365c44170cd5b3bbafd92cb0f4 Mon Sep 17 00:00:00 2001
From: Jaebeom Kim
Date: Mon, 23 Oct 2023 17:22:43 +0900
Subject: [PATCH 46/65] remove prints and old code

---
 src/commons/IndexCreator.cpp | 14 ++------
 src/commons/KmerMatcher.cpp  | 64 ++----------------------------------
 2 files changed, 4 insertions(+), 74 deletions(-)

diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp
index cae63d6f..fe864baf 100644
--- a/src/commons/IndexCreator.cpp
+++ b/src/commons/IndexCreator.cpp
@@ -425,16 +425,6 @@ void IndexCreator::writeTargetFilesAndSplits(TargetKmer * kmerBuffer, size_t & k
                 break;
             }
         }
-        // for (size_t j = uniqKmerIdx[0] + splitWidth * i; j + 1 < uniqKmerCnt; j++) { // here is a bug
-        //     if (AminoAcidPart(kmerBuffer[j].ADkmer) != AminoAcidPart(kmerBuffer[j + 1].ADkmer)) {
-        //         if (kmerBuffer[j].ADkmer != splitList[splitCnt - 1].ADkmer){
-        //             splitList[splitCnt].ADkmer = kmerBuffer[j].ADkmer;
-        //             cout << splitList[splitCnt].ADkmer << endl;
-        //             splitCnt ++;
-        //         }
-        //         break;
-        //     }
-        // }
     }

     FILE * diffIdxFile = fopen(diffIdxFileName.c_str(), "wb");
@@ -463,8 +453,8 @@ void IndexCreator::writeTargetFilesAndSplits(TargetKmer * kmerBuffer, size_t & k
             if((splitIdx < splitCnt) && (lastKmer == splitList[splitIdx].ADkmer)){
                 splitList[splitIdx].diffIdxOffset = totalDiffIdx;
                 splitList[splitIdx].infoIdxOffset = write;
-                cout << "Split " << splitIdx << " at " << splitList[splitIdx].infoIdxOffset << " " <<
-                splitList[splitIdx].diffIdxOffset << " " << splitList[splitIdx].ADkmer << endl;
+                // cout << "Split " << splitIdx << " at " << splitList[splitIdx].infoIdxOffset << " " <<
+                // splitList[splitIdx].diffIdxOffset << " " << splitList[splitIdx].ADkmer << endl;
                 splitIdx ++;
             }
         }

diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp
index 778e79a5..9f8c975c 100644
--- a/src/commons/KmerMatcher.cpp
+++ b/src/commons/KmerMatcher.cpp
@@ -145,8 +145,6 @@ int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer,
             break;
         }
     }
-
-    IndexCreator::printIndexSplitList(diffIdxSplits.data);
     // Filter out meaningless target splits
     size_t numOfDiffIdxSplits = diffIdxSplits.fileSize / sizeof(DiffIdxSplit);
     size_t numOfDiffIdxSplits_use = numOfDiffIdxSplits;
@@ -158,11 +156,6 @@ int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer,
         }
     }
-
-    cout << numOfDiffIdxSplits_use << endl;
-    IndexCreator::printIndexSplitList(diffIdxSplits.data);
-
-
     // Divide query k-mer list into blocks for multi threading.
// Each split has start and end points of query list + proper offset point of target k-mer list std::vector querySplits; @@ -192,62 +185,9 @@ int KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer, } startIdx = endIdx + 1; } - - // if (threads == 1) { //Single thread - // querySplits.emplace_back(0, queryKmerNum - 1, queryKmerNum, diffIdxSplits.data[0]); - // } else if (threads == 2) { //Two threads - // size_t splitWidth = queryKmerNum / 2; - // querySplits.emplace_back(0, splitWidth - 1, splitWidth, diffIdxSplits.data[0]); - // for (size_t tSplitCnt = 0; tSplitCnt < numOfDiffIdxSplits_use; tSplitCnt++) { - // queryAA = AminoAcidPart(queryKmerList[splitWidth].ADkmer); - // if (queryAA <= AminoAcidPart(diffIdxSplits.data[tSplitCnt].ADkmer)) { - // tSplitCnt = tSplitCnt - (tSplitCnt != 0); - // querySplits.emplace_back(splitWidth, queryKmerNum - 1, queryKmerNum - splitWidth, - // diffIdxSplits.data[tSplitCnt]); - // break; - // } - // } - // } else { //More than two threads - // // Devide query k-mers into blocks - // size_t splitWidth = queryKmerNum / (threads - 1); - // querySplits.emplace_back(0, splitWidth - 1, splitWidth, diffIdxSplits.data[0]); - // size_t i = 1; - // for (; (i < threads) && (splitWidth * i < queryKmerNum); i++) { - // queryAA = AminoAcidPart(queryKmerList[splitWidth * i].ADkmer); - // bool needLastTargetBlock = true; - // for (size_t j = 0; j < numOfDiffIdxSplits_use; j++) { - // if (queryAA <= AminoAcidPart(diffIdxSplits.data[j].ADkmer)) { - // j = j - (j != 0); - // if (i != threads - 1) { - // querySplits.emplace_back(splitWidth * i, splitWidth * (i + 1) - 1, splitWidth, - // diffIdxSplits.data[j]); - // } else { - // querySplits.emplace_back(splitWidth * i, queryKmerNum - 1, queryKmerNum - splitWidth * i, - // diffIdxSplits.data[j]); - // } - // needLastTargetBlock = false; - // break; - // } - // } - // if (needLastTargetBlock) { - // if (i != threads - 1) { // If it is not the last split - // querySplits.emplace_back(splitWidth * i, splitWidth * (i + 1) - 1, splitWidth, - // diffIdxSplits.data[numOfDiffIdxSplits_use - 2]); - // } else { - // querySplits.emplace_back(splitWidth * i, queryKmerNum - 1, queryKmerNum - splitWidth * i, - // diffIdxSplits.data[numOfDiffIdxSplits_use - 2]); - // } - // } - // } - - // if (i != threads) { - // threads = querySplits.size(); - // } - // } - // Print query splits - for (size_t i = 0; i < querySplits.size(); i++) { - cout << i << "\t" << querySplits[i].start << "\t" << querySplits[i].end << endl; + if (querySplits.size() != threads) { + threads = querySplits.size(); } bool *splitCheckList = (bool *) malloc(sizeof(bool) * threads); From b75d2ae74652c43a651d09901e33262c6a543a64 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Fri, 27 Oct 2023 15:27:58 +0900 Subject: [PATCH 47/65] Fix error in KmerMatcher.cpp: changed condition to start with getNextTargetKmer --- src/commons/KmerMatcher.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp index 9f8c975c..f99ccbd3 100644 --- a/src/commons/KmerMatcher.cpp +++ b/src/commons/KmerMatcher.cpp @@ -251,11 +251,13 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI loadBuffer(kmerInfoFp, kmerInfoBuffer, kmerInfoBufferIdx, BufferSize); fseek(diffIdxFp, 2 * (long) (diffIdxBufferIdx), SEEK_SET); loadBuffer(diffIdxFp, diffIdxBuffer, diffIdxBufferIdx, BufferSize); - - if (i == 0) { + + if (querySplits[i].diffIdxSplit.ADkmer == 0 && querySplits[i].diffIdxSplit.diffIdxOffset 
== 0 + && querySplits[i].diffIdxSplit.infoIdxOffset == 0) { currentTargetKmer = getNextTargetKmer(currentTargetKmer, diffIdxBuffer, diffIdxBufferIdx, diffIdxPos); } + currentQuery = UINT64_MAX; currentQueryAA = UINT64_MAX; From a7db9f9af6deb61e1b2838e073a71b55e32e20c3 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Mon, 30 Oct 2023 16:39:14 +0900 Subject: [PATCH 48/65] fixed the problem related to reads < 26 bp. Thank you Niko! --- src/commons/KmerExtractor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/commons/KmerExtractor.cpp b/src/commons/KmerExtractor.cpp index 4d656cdf..232cc89a 100644 --- a/src/commons/KmerExtractor.cpp +++ b/src/commons/KmerExtractor.cpp @@ -85,7 +85,7 @@ void KmerExtractor::fillQueryKmerBufferParallel(KSeqWrapper *kseq1, for (size_t i = 0; i < currentQueryNum; i ++) { size_t queryIdx = processedQueryNum - currentQueryNum + i; // Get k-mer count - auto kmerCnt = LocalUtil::getQueryKmerNumber(reads1[i].length(), spaceNum); + int kmerCnt = LocalUtil::getQueryKmerNumber(reads1[i].length(), spaceNum); // Ignore short read if (kmerCnt < 1) { continue; } From 49dc8090c333a9ebae7db10dd26ac90b43ae5719 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Mon, 30 Oct 2023 19:17:24 +0900 Subject: [PATCH 49/65] Choose the best species directly skipping genus selection --- src/commons/Taxonomer.cpp | 74 ++++++++++++++++++++++++++++++++++++++- src/commons/Taxonomer.h | 10 ++++++ 2 files changed, 83 insertions(+), 1 deletion(-) diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index 40f9302b..c60e66d0 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -1,5 +1,7 @@ #include "Taxonomer.h" +#include "Match.h" #include "NcbiTaxonomy.h" +#include #include @@ -327,6 +329,76 @@ TaxID Taxonomer::BFS(const unordered_map & cladeCnt, TaxID r } } +TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, + const Match *matchList, + size_t end, + size_t offset, + int queryLength, + const LocalParameters &par) { + TaxID currentSpecies; + vector filteredMatches; + vector> matchesForEachSpecies; + vector speciesScores; + TaxonScore bestScore; + size_t i = offset; + uint8_t curFrame; + vector curFrameMatches; + + while (i < end + 1) { + currentSpecies = matchList[i].speciesId; + // For current species + while ((i < end + 1) && currentSpecies == matchList[i].speciesId) { + curFrame = matchList[i].qInfo.frame; + curFrameMatches.clear(); + // For current frame + while ((i < end + 1) && currentSpecies == matchList[i].speciesId && curFrame == matchList[i].qInfo.frame) { + curFrameMatches.push_back(&matchList[i]); + i ++; + } + if (curFrameMatches.size() > 1) { + remainConsecutiveMatches(curFrameMatches, filteredMatches, currentSpecies, par); + } + } + // Construct a match combination using filtered matches of current species + // so that it can best cover the query, and score the combination + if (!filteredMatches.empty()) { + matchesForEachSpecies.push_back(filteredMatches); + speciesScores.push_back(scoreGenus(filteredMatches, queryLength)); + } + filteredMatches.clear(); + } + + // If there are no meaningful species + if (speciesScores.empty()) { + bestScore.score = 0; + return bestScore; + } + + TaxonScore maxScore = *max_element(speciesScores.begin(), speciesScores.end(), + [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); + + vector maxIdx; + for (size_t g = 0; g < speciesScores.size(); g++) { + if (speciesScores[g].score == maxScore.score) { + maxIdx.push_back(g); + } + } + bestScore = 
maxScore; + + for (unsigned long g : maxIdx) { + for (const Match * m : matchesForEachSpecies[g]) { + speciesMatches.push_back(*m); + } + } + + // More than one species + if (maxIdx.size() > 1) { + bestScore.taxId = 0; + } + + return bestScore; +} + TaxonScore Taxonomer::getBestGenusMatches(vector &genusMatches, const Match *matchList, size_t end, size_t offset, int readLength1, int readLength2, const LocalParameters & par) { TaxID currentGenus; @@ -1093,7 +1165,7 @@ TaxonScore Taxonomer::scoreSpecies(const vector &matches, int queryLength, int queryLength2) { - // Get the smallest hamming distance at each position of query + // Get the largest hamming distance at each position of query int aminoAcidNum_total = queryLength / 3 + queryLength2 / 3; int aminoAcidNum_read1 = queryLength / 3; auto *hammingsAtEachPos = new signed char[aminoAcidNum_total + 3]; diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h index 45687923..8a1e0999 100644 --- a/src/commons/Taxonomer.h +++ b/src/commons/Taxonomer.h @@ -62,6 +62,13 @@ class Taxonomer { const Match *matchList, vector & queryList, const LocalParameters &par); + + void chooseBestTaxon2(uint32_t currentQuery, + size_t offset, + size_t end, + const Match *matchList, + vector & queryList, + const LocalParameters &par); void remainConsecutiveMatches(vector & curFrameMatches, vector & filteredMatches, @@ -80,6 +87,9 @@ class Taxonomer { TaxonScore getBestGenusMatches(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, int readLength1, int readLength2, const LocalParameters &par); + TaxonScore getBestSpeciesMatches(vector &matchesForMajorityLCA, const Match *matchList, size_t end, + size_t offset, int queryLength, const LocalParameters &par); + TaxonScore getBestGenusMatches_spaced(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, int readLength1, int readLength2); TaxonScore getBestGenusMatches_spaced(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, From d0f53447c3161f0eb0a8ab3b71efec4cd4597bf9 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Tue, 31 Oct 2023 11:17:56 +0900 Subject: [PATCH 50/65] First running version --- src/commons/Taxonomer.cpp | 726 ++++++++++++++++++++++++-------------- src/commons/Taxonomer.h | 31 +- 2 files changed, 486 insertions(+), 271 deletions(-) diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index c60e66d0..b52f4447 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -21,6 +21,9 @@ Taxonomer::Taxonomer(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxon minCoveredPos = par.minCoveredPos; accessionLevel = par.accessionLevel; minSSMatch = par.minSSMatch; + minConsCnt = par.minConsCnt; + minConsCntEuk = par.minConsCntEuk; + eukaryotaTaxId = par.eukaryotaTaxId; } Taxonomer::~Taxonomer() { @@ -54,7 +57,7 @@ void Taxonomer::assignTaxonomy(const Match *matchList, { #pragma omp for schedule(dynamic, 1) for (size_t i = 0; i < blockIdx; ++i) { - chooseBestTaxon(matchBlocks[i].id, + chooseBestTaxon2(matchBlocks[i].id, matchBlocks[i].start, matchBlocks[i].end, matchList, @@ -71,6 +74,127 @@ void Taxonomer::assignTaxonomy(const Match *matchList, } +void Taxonomer::chooseBestTaxon2(uint32_t currentQuery, + size_t offset, + size_t end, + const Match *matchList, + vector & queryList, + const LocalParameters &par) { + TaxID selectedTaxon; + +// if (true) { +// cout << "# " << currentQuery << " " << queryList[currentQuery].name << endl; +// for (size_t i = offset; i < end + 1; i++) { +// 
cout << matchList[i].targetId << " " << matchList[i].qInfo.frame << " " << matchList[i].qInfo.pos << " " << int(matchList[i].hamming) << " " << int(matchList[i].redundancy) << endl; +// } +// } + + // Get the best species for current query + vector speciesMatches; + speciesMatches.reserve(end - offset + 1); + TaxonScore speciesScore(0, 0, 0, 0); + if (par.seqMode == 2) { + speciesScore = getBestSpeciesMatches(speciesMatches, matchList, end, offset, + queryList[currentQuery].queryLength, + queryList[currentQuery].queryLength2); + } else { + speciesScore = getBestSpeciesMatches(speciesMatches, matchList, end, offset, + queryList[currentQuery].queryLength); + } + +// if (true) { +// cout << "# " << currentQuery << " " << queryList[currentQuery].name << " filtered\n"; +// for (size_t i = 0; i < genusMatches.size(); i++) { +// cout << genusMatches[i].targetId << " " << genusMatches[i].qInfo.frame << " " << genusMatches[i].qInfo.pos << " " << int(genusMatches[i].hamming) << " " << int(genusMatches[i].redundancy) << endl; +// } +// cout << "Genus score: " << genusScore.score << "\n"; +// } + + // If there is no proper species for current query, it is un-classified. + if (speciesScore.score == 0 || speciesScore.coverage < par.minCoverage || speciesScore.score < par.minScore) { + queryList[currentQuery].isClassified = false; + queryList[currentQuery].classification = 0; + queryList[currentQuery].score = speciesScore.score; + queryList[currentQuery].coverage = speciesScore.coverage; + queryList[currentQuery].hammingDist = speciesScore.hammingDist; + queryList[currentQuery].newSpecies = false; + return; + } + + // If there are two or more good genus level candidates, find the LCA. + if (speciesScore.taxId == 0) { + vector genusList; + genusList.reserve(speciesMatches.size()); + for (auto & genusMatch : speciesMatches) { + genusList.push_back(genusMatch.genusId); + } + selectedTaxon = taxonomy->LCA(genusList)->taxId; + queryList[currentQuery].isClassified = true; + queryList[currentQuery].classification = selectedTaxon; + queryList[currentQuery].score = speciesScore.score; + queryList[currentQuery].coverage = speciesScore.coverage; + queryList[currentQuery].hammingDist = speciesScore.hammingDist; + for (auto & spMatch : speciesMatches) { + queryList[currentQuery].taxCnt[spMatch.targetId]++; + } + return; + } + + // If score is not enough, classify to the parent of the selected species + if (speciesScore.score < par.minSpScore) { + queryList[currentQuery].isClassified = true; + queryList[currentQuery].classification = taxonomy->taxonNode( + taxonomy->getTaxIdAtRank(speciesScore.taxId, "species"))->parentTaxId; + queryList[currentQuery].score = speciesScore.score; + queryList[currentQuery].coverage = speciesScore.coverage; + queryList[currentQuery].hammingDist = speciesScore.hammingDist; + for (auto & spMatch : speciesMatches) { + queryList[currentQuery].taxCnt[spMatch.targetId]++; + } + return; + } + + // Sort matches by the position of the query sequence +// sort(genusMatches.begin() + speciesMatchRange[selectedSpecies].first, +// genusMatches.begin() + speciesMatchRange[selectedSpecies].second, +// [](const Match & a, const Match & b) { +// if (a.qInfo.position / 3 == b.qInfo.position / 3) +// return a.hamming < b.hamming; +// else +// return a.qInfo.position / 3 < b.qInfo.position / 3; +// }); + + sort(speciesMatches.begin(), speciesMatches.end(), + [](const Match & a, const Match & b) { return a.qInfo.pos < b.qInfo.pos; }); + + + TaxID result = lowerRankClassification(speciesMatches, 
speciesScore.taxId); + + // Record matches of selected species + for (auto & spMatch : speciesMatches) { + queryList[currentQuery].taxCnt[spMatch.targetId]++; + } + + // Store classification results + queryList[currentQuery].isClassified = true; + queryList[currentQuery].classification = result; + queryList[currentQuery].score = speciesScore.score; + queryList[currentQuery].coverage = speciesScore.coverage; + queryList[currentQuery].hammingDist = speciesScore.hammingDist; + queryList[currentQuery].newSpecies = false; +// if (par.printLog) { +// cout << "# " << currentQuery << endl; +// for (size_t i = 0; i < genusMatches.size(); i++) { +// cout << i << " " << genusMatches[i].qInfo.pos << " " << +// genusMatches[i].targetId << " " << int(genusMatches[i].hamming) << endl; +// } +// cout << "Score: " << speciesScore.score << " " << selectedSpecies << " " +// << taxonomy->getString(taxonomy->taxonNode(selectedSpecies)->rankIdx) +// +// << endl; +// } +} + void Taxonomer::chooseBestTaxon(uint32_t currentQuery, size_t offset, size_t end, @@ -91,23 +215,12 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, genusMatches.reserve(end - offset + 1); TaxonScore genusScore(0, 0, 0, 0); if (par.seqMode == 2) { - if (par.spaceMask != "11111111"){ - genusScore = getBestGenusMatches_spaced(genusMatches, matchList, end, offset, - queryList[currentQuery].queryLength, - queryList[currentQuery].queryLength2); - } else { - genusScore = getBestGenusMatches(genusMatches, matchList, end, offset, + genusScore = getBestGenusMatches(genusMatches, matchList, end, offset, queryList[currentQuery].queryLength, - queryList[currentQuery].queryLength2, par); - } + queryList[currentQuery].queryLength2); } else { - if (par.spaceMask != "11111111") { - genusScore = getBestGenusMatches_spaced(genusMatches, matchList, end, offset, - queryList[currentQuery].queryLength); - } else { - genusScore = getBestGenusMatches(genusMatches, matchList, end, offset, - queryList[currentQuery].queryLength, par); - } + genusScore = getBestGenusMatches(genusMatches, matchList, end, offset, + queryList[currentQuery].queryLength); } // if (true) { @@ -207,12 +320,15 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, // return a.qInfo.position / 3 < b.qInfo.position / 3; // }); - sort(genusMatches.begin() + speciesMatchRange[selectedSpecies].first, - genusMatches.begin() + speciesMatchRange[selectedSpecies].second, - [](const Match & a, const Match & b) { return a.qInfo.pos > b.qInfo.pos; }); + vector::const_iterator first = genusMatches.begin() + speciesMatchRange[selectedSpecies].first; + vector::const_iterator last = genusMatches.begin() + speciesMatchRange[selectedSpecies].second; + vector speciesMatches(first, last); - TaxID result = lowerRankClassification(genusMatches, speciesMatchRange[selectedSpecies], selectedSpecies); + sort(speciesMatches.begin(), speciesMatches.end(), + [](const Match & a, const Match & b) { return a.qInfo.pos < b.qInfo.pos; }); + + TaxID result = lowerRankClassification(speciesMatches, selectedSpecies); // Record matches of selected species for (size_t i = speciesMatchRange[selectedSpecies].first; i < speciesMatchRange[selectedSpecies].second; i++) { @@ -239,17 +355,17 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, // } } -TaxID Taxonomer::lowerRankClassification(vector &matches, pair &matchRange, TaxID spTaxId) { - int i = matchRange.second - 1; +TaxID Taxonomer::lowerRankClassification(vector &matches, TaxID spTaxId) { + unordered_map taxCnt; + size_t matchNum = matches.size(); - while ( i >= 
matchRange.first ) { + for (size_t i = 0; i < matchNum; i++) { size_t currQuotient = matches[i].qInfo.pos / 3; uint8_t minHamming = matches[i].hamming; Match * minHammingMatch = & matches[i]; TaxID minHammingTaxId = minHammingMatch->targetId; - i --; - while ( (i >= matchRange.first) && (currQuotient == matches[i].qInfo.pos / 3) ) { + while ((i < matchNum) && (currQuotient == matches[i].qInfo.pos / 3)) { if (matches[i].hamming < minHamming) { minHamming = matches[i].hamming; minHammingMatch = & matches[i]; @@ -259,11 +375,33 @@ TaxID Taxonomer::lowerRankClassification(vector &matches, pair minHammingMatch->redundancy = true; matches[i].redundancy = true; } - i--; + i++; } - taxCnt[minHammingTaxId]++; + taxCnt[minHammingTaxId]++; } + // int i = matchRange.second - 1; + // while ( i >= matchRange.first ) { + // size_t currQuotient = matches[i].qInfo.pos / 3; + // uint8_t minHamming = matches[i].hamming; + // Match * minHammingMatch = & matches[i]; + // TaxID minHammingTaxId = minHammingMatch->targetId; + // i --; + // while ( (i >= matchRange.first) && (currQuotient == matches[i].qInfo.pos / 3) ) { + // if (matches[i].hamming < minHamming) { + // minHamming = matches[i].hamming; + // minHammingMatch = & matches[i]; + // minHammingTaxId = minHammingMatch->targetId; + // } else if (matches[i].hamming == minHamming) { + // minHammingTaxId = taxonomy->LCA(minHammingTaxId, matches[i].targetId); + // minHammingMatch->redundancy = true; + // matches[i].redundancy = true; + // } + // i--; + // } + // taxCnt[minHammingTaxId]++; + // } + unordered_map cladeCnt; getSpeciesCladeCounts(taxCnt, cladeCnt, spTaxId); @@ -333,8 +471,7 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, const Match *matchList, size_t end, size_t offset, - int queryLength, - const LocalParameters &par) { + int queryLength) { TaxID currentSpecies; vector filteredMatches; vector> matchesForEachSpecies; @@ -356,14 +493,84 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, i ++; } if (curFrameMatches.size() > 1) { - remainConsecutiveMatches(curFrameMatches, filteredMatches, currentSpecies, par); + remainConsecutiveMatches(curFrameMatches, filteredMatches, currentSpecies); } } // Construct a match combination using filtered matches of current species // so that it can best cover the query, and score the combination if (!filteredMatches.empty()) { matchesForEachSpecies.push_back(filteredMatches); - speciesScores.push_back(scoreGenus(filteredMatches, queryLength)); + speciesScores.push_back(scoreTaxon(filteredMatches, currentSpecies, queryLength)); + } + filteredMatches.clear(); + } + + // If there are no meaningful species + if (speciesScores.empty()) { + bestScore.score = 0; + return bestScore; + } + + TaxonScore maxScore = *max_element(speciesScores.begin(), speciesScores.end(), + [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); + + vector maxIdx; + for (size_t g = 0; g < speciesScores.size(); g++) { + if (speciesScores[g].score == maxScore.score) { + maxIdx.push_back(g); + } + } + bestScore = maxScore; + + for (unsigned long g : maxIdx) { + for (const Match * m : matchesForEachSpecies[g]) { + speciesMatches.push_back(*m); + } + } + + // More than one species + if (maxIdx.size() > 1) { + bestScore.taxId = 0; + } + + return bestScore; +} + +TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, + const Match *matchList, + size_t end, + size_t offset, + int readLength1, + int readLength2) { + TaxID currentSpecies; + vector filteredMatches; + vector> 
matchesForEachSpecies; + vector speciesScores; + TaxonScore bestScore; + size_t i = offset; + uint8_t curFrame; + vector curFrameMatches; + + while (i < end + 1) { + currentSpecies = matchList[i].speciesId; + // For current species + while ((i < end + 1) && currentSpecies == matchList[i].speciesId) { + curFrame = matchList[i].qInfo.frame; + curFrameMatches.clear(); + // For current frame + while ((i < end + 1) && currentSpecies == matchList[i].speciesId && curFrame == matchList[i].qInfo.frame) { + curFrameMatches.push_back(&matchList[i]); + i ++; + } + if (curFrameMatches.size() > 1) { + remainConsecutiveMatches(curFrameMatches, filteredMatches, currentSpecies); + } + } + // Construct a match combination using filtered matches of current species + // so that it can best cover the query, and score the combination + if (!filteredMatches.empty()) { + matchesForEachSpecies.push_back(filteredMatches); + speciesScores.push_back(scoreTaxon(filteredMatches, currentSpecies, readLength1, readLength2)); } filteredMatches.clear(); } @@ -400,7 +607,7 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, } TaxonScore Taxonomer::getBestGenusMatches(vector &genusMatches, const Match *matchList, size_t end, - size_t offset, int readLength1, int readLength2, const LocalParameters & par) { + size_t offset, int readLength1, int readLength2) { TaxID currentGenus; TaxID currentSpecies; @@ -433,7 +640,7 @@ TaxonScore Taxonomer::getBestGenusMatches(vector &genusMatches, const Mat i ++; } if (curFrameMatches.size() > 1) { - remainConsecutiveMatches(curFrameMatches, filteredMatches, currentGenus, par); + remainConsecutiveMatches(curFrameMatches, filteredMatches, currentGenus); } } } @@ -442,7 +649,7 @@ TaxonScore Taxonomer::getBestGenusMatches(vector &genusMatches, const Mat // so that it can best cover the query, and score the combination if (!filteredMatches.empty()) { matchesForEachGenus.push_back(filteredMatches); - genusScores.push_back(scoreGenus(filteredMatches, readLength1, readLength2)); + genusScores.push_back(scoreTaxon(filteredMatches, currentGenus, readLength1, readLength2)); } filteredMatches.clear(); } @@ -488,8 +695,7 @@ TaxonScore Taxonomer::getBestGenusMatches(vector &genusMatches, const Mat void Taxonomer::remainConsecutiveMatches(vector & curFrameMatches, vector & filteredMatches, - TaxID genusId, - const LocalParameters & par) { + TaxID genusId) { size_t i = 0; size_t end = curFrameMatches.size(); vector> curPosMatches; // @@ -537,9 +743,9 @@ void Taxonomer::remainConsecutiveMatches(vector & curFrameMatches // } // Iterate linkedMatches to get filteredMatches - int MIN_DEPTH = par.minConsCnt - 1; - if (taxonomy->IsAncestor(par.eukaryotaTaxId, genusId)) { - MIN_DEPTH = par.minConsCntEuk - 1; + int MIN_DEPTH = minConsCnt - 1; + if (taxonomy->IsAncestor(eukaryotaTaxId, genusId)) { + MIN_DEPTH = minConsCntEuk - 1; } unordered_set used; vector filteredMatchIdx; @@ -597,116 +803,116 @@ size_t Taxonomer::DFS(size_t curMatchIdx, const map> & li return maxDepth; } -TaxonScore Taxonomer::getBestGenusMatches_spaced(vector &genusMatches, const Match *matchList, size_t end, - size_t offset, int readLength1, int readLength2) { - TaxID currentGenus; - TaxID currentSpecies; - - vector tempMatchContainer; - vector filteredMatches; - vector> matchesForEachGenus; - vector conservedWithinGenus; - vector genusScores; - TaxonScore bestScore; - size_t i = offset; - bool lastIn; - while (i + 1 < end + 1) { - currentGenus = matchList[i].genusId; - // For current genus - while ((i + 1 < end + 1) && 
currentGenus == matchList[i].genusId) { -// currentSpecies = taxId2speciesId[matchList[i].targetId]; - currentSpecies = matchList[i].speciesId; - // For current species - // Filter un-consecutive matches (probably random matches) - lastIn = false; - int distance = 0; - int diffPosCntOfCurrRange = 1; - int dnaDist = 0; - - // For the same species - while ((i + 1 < end + 1) && currentSpecies == matchList[i + 1].speciesId) { - distance = matchList[i+1].qInfo.pos / 3 - matchList[i].qInfo.pos / 3; - dnaDist = matchList[i+1].qInfo.pos - matchList[i].qInfo.pos; - if (distance == 0) { // At the same position - tempMatchContainer.push_back(matchList + i); - } else if (dnaDist < (8 + spaceNum + maxGap) * 3) { // Overlapping - lastIn = true; - tempMatchContainer.push_back(matchList + i); - diffPosCntOfCurrRange ++; - } else { // Not consecutive --> End range - if (lastIn){ - tempMatchContainer.push_back(matchList + i); - if (diffPosCntOfCurrRange >= minCoveredPos) { - filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), - tempMatchContainer.end()); - } - } - lastIn = false; - // Initialize range info - tempMatchContainer.clear(); - diffPosCntOfCurrRange = 1; - } - i++; - } - - // Met next species - if (lastIn) { - tempMatchContainer.push_back(matchList + i); - if (diffPosCntOfCurrRange >= minCoveredPos) { - filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), - tempMatchContainer.end()); - } - } - tempMatchContainer.clear(); - i++; - } - - // Construct a match combination using filtered matches of current genus - // so that it can best cover the query, and score the combination - if (!filteredMatches.empty()) { - genusScores.push_back(scoreGenus(filteredMatches, readLength1, readLength2)); - } - filteredMatches.clear(); - } +// TaxonScore Taxonomer::getBestGenusMatches_spaced(vector &genusMatches, const Match *matchList, size_t end, +// size_t offset, int readLength1, int readLength2) { +// TaxID currentGenus; +// TaxID currentSpecies; + +// vector tempMatchContainer; +// vector filteredMatches; +// vector> matchesForEachGenus; +// vector conservedWithinGenus; +// vector genusScores; +// TaxonScore bestScore; +// size_t i = offset; +// bool lastIn; +// while (i + 1 < end + 1) { +// currentGenus = matchList[i].genusId; +// // For current genus +// while ((i + 1 < end + 1) && currentGenus == matchList[i].genusId) { +// // currentSpecies = taxId2speciesId[matchList[i].targetId]; +// currentSpecies = matchList[i].speciesId; +// // For current species +// // Filter un-consecutive matches (probably random matches) +// lastIn = false; +// int distance = 0; +// int diffPosCntOfCurrRange = 1; +// int dnaDist = 0; + +// // For the same species +// while ((i + 1 < end + 1) && currentSpecies == matchList[i + 1].speciesId) { +// distance = matchList[i+1].qInfo.pos / 3 - matchList[i].qInfo.pos / 3; +// dnaDist = matchList[i+1].qInfo.pos - matchList[i].qInfo.pos; +// if (distance == 0) { // At the same position +// tempMatchContainer.push_back(matchList + i); +// } else if (dnaDist < (8 + spaceNum + maxGap) * 3) { // Overlapping +// lastIn = true; +// tempMatchContainer.push_back(matchList + i); +// diffPosCntOfCurrRange ++; +// } else { // Not consecutive --> End range +// if (lastIn){ +// tempMatchContainer.push_back(matchList + i); +// if (diffPosCntOfCurrRange >= minCoveredPos) { +// filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), +// tempMatchContainer.end()); +// } +// } +// lastIn = false; +// // Initialize range info +// 
tempMatchContainer.clear(); +// diffPosCntOfCurrRange = 1; +// } +// i++; +// } + +// // Met next species +// if (lastIn) { +// tempMatchContainer.push_back(matchList + i); +// if (diffPosCntOfCurrRange >= minCoveredPos) { +// filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), +// tempMatchContainer.end()); +// } +// } +// tempMatchContainer.clear(); +// i++; +// } - // If there are no meaningful genus - if (genusScores.empty()) { - bestScore.score = 0; - return bestScore; - } +// // Construct a match combination using filtered matches of current genus +// // so that it can best cover the query, and score the combination +// if (!filteredMatches.empty()) { +// genusScores.push_back(scoreTaxon(filteredMatches, readLength1, readLength2)); +// } +// filteredMatches.clear(); +// } + +// // If there are no meaningful genus +// if (genusScores.empty()) { +// bestScore.score = 0; +// return bestScore; +// } + +// TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), +// [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); + +// vector maxIdx; +// for (size_t g = 0; g < genusScores.size(); g++) { +// if (genusScores[g].score > maxScore.score * 0.95f) { +// maxIdx.push_back(g); +// } +// } +// bestScore = maxScore; - TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), - [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); +// for (unsigned long g : maxIdx) { +// for (const Match * m : matchesForEachGenus[g]) { +// genusMatches.push_back(*m); +// } +// } - vector maxIdx; - for (size_t g = 0; g < genusScores.size(); g++) { - if (genusScores[g].score > maxScore.score * 0.95f) { - maxIdx.push_back(g); - } - } - bestScore = maxScore; +// // More than one genus +// if (maxIdx.size() > 1) { +// bestScore.taxId = 0; +// return bestScore; +// } +// return bestScore; - for (unsigned long g : maxIdx) { - for (const Match * m : matchesForEachGenus[g]) { - genusMatches.push_back(*m); - } - } - - // More than one genus - if (maxIdx.size() > 1) { - bestScore.taxId = 0; - return bestScore; - } - return bestScore; - - //Three cases - //1. one genus - //2. more than one genus - //4. no genus -} +// //Three cases +// //1. one genus +// //2. more than one genus +// //4. no genus +// } TaxonScore Taxonomer::getBestGenusMatches(vector &genusMatches, const Match *matchList, size_t end, - size_t offset, int queryLength, const LocalParameters & par) { + size_t offset, int queryLength) { TaxID currentGenus; TaxID currentSpecies; @@ -735,7 +941,7 @@ TaxonScore Taxonomer::getBestGenusMatches(vector &genusMatches, const Mat i ++; } if (curFrameMatches.size() > 1) { - remainConsecutiveMatches(curFrameMatches, filteredMatches, currentGenus, par); + remainConsecutiveMatches(curFrameMatches, filteredMatches, currentGenus); } } } @@ -745,7 +951,7 @@ TaxonScore Taxonomer::getBestGenusMatches(vector &genusMatches, const Mat if (!filteredMatches.empty()) { matchesForEachGenus.push_back(filteredMatches); - genusScores.push_back(scoreGenus(filteredMatches, queryLength)); + genusScores.push_back(scoreTaxon(filteredMatches, currentGenus, queryLength)); } filteredMatches.clear(); } @@ -787,116 +993,117 @@ TaxonScore Taxonomer::getBestGenusMatches(vector &genusMatches, const Mat //4. 
no genus } -TaxonScore Taxonomer::getBestGenusMatches_spaced(vector &genusMatches, const Match *matchList, size_t end, - size_t offset, int readLength) { - TaxID currentGenus; - TaxID currentSpecies; - - vector tempMatchContainer; - vector filteredMatches; - vector> matchesForEachGenus; - vector conservedWithinGenus; - vector genusScores; - TaxonScore bestScore; - size_t i = offset; - bool lastIn; - size_t speciesMatchCnt; - while (i + 1 < end + 1) { - currentGenus = matchList[i].genusId; - // For current genus - while ((i + 1 < end + 1) && currentGenus == matchList[i].genusId) { - currentSpecies = matchList[i].speciesId; - // For current species - // Filter un-consecutive matches (probably random matches) - lastIn = false; - int distance = 0; - int diffPosCntOfCurrRange = 1; - int dnaDist = 0; - - // For the same species - while ((i + 1 < end + 1) && currentSpecies == matchList[i + 1].speciesId) { - distance = matchList[i + 1].qInfo.pos / 3 - matchList[i].qInfo.pos / 3; - dnaDist = matchList[i + 1].qInfo.pos - matchList[i].qInfo.pos; - if (distance == 0) { // At the same position - tempMatchContainer.push_back(matchList + i); - } else if (dnaDist < (8 + spaceNum + maxGap) * 3) { // Overlapping - lastIn = true; - tempMatchContainer.push_back(matchList + i); - diffPosCntOfCurrRange++; - } else { // Not consecutive --> End range - if (lastIn) { - tempMatchContainer.push_back(matchList + i); - if (diffPosCntOfCurrRange >= minCoveredPos) { - filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), - tempMatchContainer.end()); - } - } - lastIn = false; - // Initialize range info - tempMatchContainer.clear(); - diffPosCntOfCurrRange = 1; - } - i++; - } - - // Met next species - if (lastIn) { - tempMatchContainer.push_back(matchList + i); - if (diffPosCntOfCurrRange >= minCoveredPos) { - filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), - tempMatchContainer.end()); - } - } - tempMatchContainer.clear(); - i++; - } - - // Construct a match combination using filtered matches of current genus - // so that it can best cover the query, and score the combination - if (!filteredMatches.empty()) { - genusScores.push_back(scoreGenus(filteredMatches, readLength)); - } - filteredMatches.clear(); - } - - // If there are no meaningful genus - if (genusScores.empty()) { - bestScore.score = 0; - return bestScore; - } - - TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), - [](const TaxonScore &a, const TaxonScore &b) { return a.score < b.score; }); - - vector maxIdx; - for (size_t g = 0; g < genusScores.size(); g++) { - if (genusScores[g].score > maxScore.score * 0.95f) { - maxIdx.push_back(g); - } - } - bestScore = maxScore; - - for (unsigned long g: maxIdx) { - genusMatches.insert(genusMatches.end(), - matchesForEachGenus[g].begin(), - matchesForEachGenus[g].end()); - } - - // More than one genus - if (maxIdx.size() > 1) { - bestScore.taxId = 0; - return bestScore; - } - return bestScore; - - //Three cases - //1. one genus - //2. more than one genus - //4. 
no genus -} +// TaxonScore Taxonomer::getBestGenusMatches_spaced(vector &genusMatches, const Match *matchList, size_t end, +// size_t offset, int readLength) { +// TaxID currentGenus; +// TaxID currentSpecies; + +// vector tempMatchContainer; +// vector filteredMatches; +// vector> matchesForEachGenus; +// vector conservedWithinGenus; +// vector genusScores; +// TaxonScore bestScore; +// size_t i = offset; +// bool lastIn; +// size_t speciesMatchCnt; +// while (i + 1 < end + 1) { +// currentGenus = matchList[i].genusId; +// // For current genus +// while ((i + 1 < end + 1) && currentGenus == matchList[i].genusId) { +// currentSpecies = matchList[i].speciesId; +// // For current species +// // Filter un-consecutive matches (probably random matches) +// lastIn = false; +// int distance = 0; +// int diffPosCntOfCurrRange = 1; +// int dnaDist = 0; + +// // For the same species +// while ((i + 1 < end + 1) && currentSpecies == matchList[i + 1].speciesId) { +// distance = matchList[i + 1].qInfo.pos / 3 - matchList[i].qInfo.pos / 3; +// dnaDist = matchList[i + 1].qInfo.pos - matchList[i].qInfo.pos; +// if (distance == 0) { // At the same position +// tempMatchContainer.push_back(matchList + i); +// } else if (dnaDist < (8 + spaceNum + maxGap) * 3) { // Overlapping +// lastIn = true; +// tempMatchContainer.push_back(matchList + i); +// diffPosCntOfCurrRange++; +// } else { // Not consecutive --> End range +// if (lastIn) { +// tempMatchContainer.push_back(matchList + i); +// if (diffPosCntOfCurrRange >= minCoveredPos) { +// filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), +// tempMatchContainer.end()); +// } +// } +// lastIn = false; +// // Initialize range info +// tempMatchContainer.clear(); +// diffPosCntOfCurrRange = 1; +// } +// i++; +// } + +// // Met next species +// if (lastIn) { +// tempMatchContainer.push_back(matchList + i); +// if (diffPosCntOfCurrRange >= minCoveredPos) { +// filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), +// tempMatchContainer.end()); +// } +// } +// tempMatchContainer.clear(); +// i++; +// } -TaxonScore Taxonomer::scoreGenus(vector &filteredMatches, - int queryLength) { +// // Construct a match combination using filtered matches of current genus +// // so that it can best cover the query, and score the combination +// if (!filteredMatches.empty()) { +// genusScores.push_back(scoreTaxon(filteredMatches, readLength)); +// } +// filteredMatches.clear(); +// } + +// // If there are no meaningful genus +// if (genusScores.empty()) { +// bestScore.score = 0; +// return bestScore; +// } + +// TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), +// [](const TaxonScore &a, const TaxonScore &b) { return a.score < b.score; }); + +// vector maxIdx; +// for (size_t g = 0; g < genusScores.size(); g++) { +// if (genusScores[g].score > maxScore.score * 0.95f) { +// maxIdx.push_back(g); +// } +// } +// bestScore = maxScore; + +// for (unsigned long g: maxIdx) { +// genusMatches.insert(genusMatches.end(), +// matchesForEachGenus[g].begin(), +// matchesForEachGenus[g].end()); +// } + +// // More than one genus +// if (maxIdx.size() > 1) { +// bestScore.taxId = 0; +// return bestScore; +// } +// return bestScore; + +// //Three cases +// //1. one genus +// //2. more than one genus +// //4. 
no genus +// } + +TaxonScore Taxonomer::scoreTaxon(vector &filteredMatches, + TaxID taxId, + int queryLength) { // Calculate Hamming distance & covered length int coveredPosCnt = 0; uint16_t currHammings; @@ -948,12 +1155,13 @@ TaxonScore Taxonomer::scoreGenus(vector &filteredMatches, float score = ((float) coveredLength - hammingSum) / (float) queryLength; float coverage = (float) (coveredLength) / (float) (queryLength); - return {filteredMatches[0]->genusId, score, coverage, (int) hammingSum}; + return {taxId, score, coverage, (int) hammingSum}; } -TaxonScore Taxonomer::scoreGenus(vector &filteredMatches, - int readLength1, - int readLength2) { +TaxonScore Taxonomer::scoreTaxon(vector &filteredMatches, + TaxID taxId, + int readLength1, + int readLength2) { // Calculate Hamming distance & covered length uint16_t currHammings; @@ -1024,7 +1232,7 @@ TaxonScore Taxonomer::scoreGenus(vector &filteredMatches, float coverage = (float) (coveredLength_read1 + coveredLength_read2) / (float) (readLength1 + readLength2); // matchesForEachGenus.push_back(move(filteredMatches)); - return {filteredMatches[0]->genusId, score, coverage, (int) hammingSum}; + return {taxId, score, coverage, (int) hammingSum}; } TaxonScore Taxonomer::chooseSpecies(const vector &matches, diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h index 8a1e0999..a756db73 100644 --- a/src/commons/Taxonomer.h +++ b/src/commons/Taxonomer.h @@ -32,6 +32,9 @@ class Taxonomer { int minCoveredPos; int accessionLevel; int minSSMatch; + int minConsCnt; + int minConsCntEuk; + int eukaryotaTaxId; struct MatchBlock { MatchBlock(size_t start, size_t end, int id) : start(start), end(end), id(id) {} @@ -72,8 +75,7 @@ class Taxonomer { void remainConsecutiveMatches(vector & curFrameMatches, vector & filteredMatches, - TaxID genusId, - const LocalParameters & par); + TaxID genusId); size_t DFS(size_t curMatchIdx, const map>& linkedMatches, vector& fiteredMatchIdx, size_t depth, size_t MIN_DEPTH, unordered_set& used, @@ -82,23 +84,28 @@ class Taxonomer { static bool isConsecutive(const Match * match1, const Match * match2); TaxonScore getBestGenusMatches(vector &matchesForMajorityLCA, const Match *matchList, size_t end, - size_t offset, int queryLength, const LocalParameters &par); + size_t offset, int queryLength); TaxonScore getBestGenusMatches(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, - int readLength1, int readLength2, const LocalParameters &par); + int readLength1, int readLength2); TaxonScore getBestSpeciesMatches(vector &matchesForMajorityLCA, const Match *matchList, size_t end, - size_t offset, int queryLength, const LocalParameters &par); + size_t offset, int queryLength); + + TaxonScore getBestSpeciesMatches(vector &matchesForMajorityLCA, const Match *matchList, size_t end, + size_t offset, int readLength1, int readLength2); - TaxonScore getBestGenusMatches_spaced(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, - int readLength1, int readLength2); - TaxonScore getBestGenusMatches_spaced(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, - int readLength1); + // TaxonScore getBestGenusMatches_spaced(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, + // int readLength1, int readLength2); + // TaxonScore getBestGenusMatches_spaced(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, + // int readLength1); - TaxonScore scoreGenus(vector &filteredMatches, + TaxonScore 
scoreTaxon(vector &filteredMatches, + TaxID taxId, int queryLength); - TaxonScore scoreGenus(vector &filteredMatches, + TaxonScore scoreTaxon(vector &filteredMatches, + TaxID taxId, int readLength1, int readLength2); @@ -129,7 +136,7 @@ class Taxonomer { int queryLength, int queryLength2); - TaxID lowerRankClassification(vector &matches, pair &matchRange, TaxID speciesID); + TaxID lowerRankClassification(vector &matches, TaxID speciesID); void getSpeciesCladeCounts(const unordered_map & taxCnt, unordered_map & cladeCnt, From 4ce6f0aecaf664395e814482ed4643a64cc2e56c Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Tue, 31 Oct 2023 17:31:05 +0900 Subject: [PATCH 51/65] use minimum Hamming dist. instead of maximum --- src/commons/Taxonomer.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index b52f4447..8d4bf351 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -362,7 +362,7 @@ TaxID Taxonomer::lowerRankClassification(vector &matches, TaxID spTaxId) for (size_t i = 0; i < matchNum; i++) { size_t currQuotient = matches[i].qInfo.pos / 3; - uint8_t minHamming = matches[i].hamming; + uint8_t minHamming = 0; //matches[i].hamming; Match * minHammingMatch = & matches[i]; TaxID minHammingTaxId = minHammingMatch->targetId; while ((i < matchNum) && (currQuotient == matches[i].qInfo.pos / 3)) { @@ -1114,7 +1114,7 @@ TaxonScore Taxonomer::scoreTaxon(vector &filteredMatches, // Get the largest hamming distance at each position of query auto *hammingsAtEachPos = new signed char[aminoAcidNum + 1]; - memset(hammingsAtEachPos, -1, (aminoAcidNum + 1)); + memset(hammingsAtEachPos, 24, (aminoAcidNum + 1)); while (f < matchNum) { currPos = filteredMatches[f]->qInfo.pos / 3; currHammings = filteredMatches[f]->rightEndHamming; @@ -1171,27 +1171,27 @@ TaxonScore Taxonomer::scoreTaxon(vector &filteredMatches, size_t matchNum = filteredMatches.size(); size_t f = 0; - // Get the largest hamming distance at each position of query + // Get the smallest hamming distance at each position of query auto *hammingsAtEachPos = new signed char[aminoAcidNum_total + 3]; - memset(hammingsAtEachPos, -1, (aminoAcidNum_total + 3)); + memset(hammingsAtEachPos, 24, (aminoAcidNum_total + 3)); while (f < matchNum) { currPos = (int) filteredMatches[f]->qInfo.pos / 3; currHammings = filteredMatches[f]->rightEndHamming; - if (GET_2_BITS(currHammings) > hammingsAtEachPos[currPos + unmaskedPos[0]]) + if (GET_2_BITS(currHammings) < hammingsAtEachPos[currPos + unmaskedPos[0]]) hammingsAtEachPos[currPos + unmaskedPos[0]] = GET_2_BITS(currHammings); - if (GET_2_BITS(currHammings >> 2) > hammingsAtEachPos[currPos + unmaskedPos[1]]) + if (GET_2_BITS(currHammings >> 2) < hammingsAtEachPos[currPos + unmaskedPos[1]]) hammingsAtEachPos[currPos + unmaskedPos[1]] = GET_2_BITS(currHammings >> 2); - if (GET_2_BITS(currHammings >> 4) > hammingsAtEachPos[currPos + unmaskedPos[2]]) + if (GET_2_BITS(currHammings >> 4) < hammingsAtEachPos[currPos + unmaskedPos[2]]) hammingsAtEachPos[currPos + unmaskedPos[2]] = GET_2_BITS(currHammings >> 4); - if (GET_2_BITS(currHammings >> 6) > hammingsAtEachPos[currPos + unmaskedPos[3]]) + if (GET_2_BITS(currHammings >> 6) < hammingsAtEachPos[currPos + unmaskedPos[3]]) hammingsAtEachPos[currPos + unmaskedPos[3]] = GET_2_BITS(currHammings >> 6); - if (GET_2_BITS(currHammings >> 8) > hammingsAtEachPos[currPos + unmaskedPos[4]]) + if (GET_2_BITS(currHammings >> 8) < hammingsAtEachPos[currPos + 
unmaskedPos[4]])
             hammingsAtEachPos[currPos + unmaskedPos[4]] = GET_2_BITS(currHammings >> 8);
-        if (GET_2_BITS(currHammings >> 10) > hammingsAtEachPos[currPos + unmaskedPos[5]])
+        if (GET_2_BITS(currHammings >> 10) < hammingsAtEachPos[currPos + unmaskedPos[5]])
             hammingsAtEachPos[currPos + unmaskedPos[5]] = GET_2_BITS(currHammings >> 10);
-        if (GET_2_BITS(currHammings >> 12) > hammingsAtEachPos[currPos + unmaskedPos[6]])
+        if (GET_2_BITS(currHammings >> 12) < hammingsAtEachPos[currPos + unmaskedPos[6]])
             hammingsAtEachPos[currPos + unmaskedPos[6]] = GET_2_BITS(currHammings >> 12);
-        if (GET_2_BITS(currHammings >> 14) > hammingsAtEachPos[currPos + unmaskedPos[7]])
+        if (GET_2_BITS(currHammings >> 14) < hammingsAtEachPos[currPos + unmaskedPos[7]])
             hammingsAtEachPos[currPos + unmaskedPos[7]] = GET_2_BITS(currHammings >> 14);
         f++;
     }

From cf407d729ba8c813ed56121de5a42c8528cfa515 Mon Sep 17 00:00:00 2001
From: Jaebeom Kim 
Date: Tue, 31 Oct 2023 22:01:20 +0900
Subject: [PATCH 52/65] util/mapping2taxon.cpp

---
 src/util/mapping2taxon.cpp | 103 +++++++++++++++++++++++++++++++++++++
 1 file changed, 103 insertions(+)
 create mode 100644 src/util/mapping2taxon.cpp

diff --git a/src/util/mapping2taxon.cpp b/src/util/mapping2taxon.cpp
new file mode 100644
index 00000000..d7aedfdd
--- /dev/null
+++ b/src/util/mapping2taxon.cpp
@@ -0,0 +1,103 @@
+#include <iostream>
+#include <string>
+#include <vector>
+#include <unordered_map>
+#include "Command.h"
+#include "LocalParameters.h"
+#include "NcbiTaxonomy.h"
+#include "common.h"
+#include "fstream"
+#include <sstream>
+#include <algorithm>
+
+using namespace std;
+
+struct read2taxon {
+    string read;
+    TaxID taxon;
+};
+
+int parseTaxId_metamaps(const string & mappingRes) {
+    vector<string> tokens = Util::split(mappingRes, " ");
+    return stoi(Util::split(tokens[5], "|")[2]);
+}
+
+// It takes a mapping result of MetaMaps.
+// The mapping result includes multiple mappings per read, each with a mapping score.
+// The function returns the taxon ID of the best-scoring mapping.
+// If multiple mappings share the best score, it returns their LCA.
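+//
+// A hypothetical invocation (the subcommand and file names below are
+// illustrative, not taken from this patch):
+//
+//     mapping2taxon sample.metamaps.mapping taxonomy/
+//
+// One "<readID>\t<taxID>" line per read is then written to
+// "sample.metamaps.mapping.reads2taxon".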
+int mapping2taxon(int argc, const char **argv, const Command &command) {
+    LocalParameters &par = LocalParameters::getLocalInstance();
+    par.parseParameters(argc, argv, command, false, Parameters::PARSE_ALLOW_EMPTY, 0);
+    string mappingFile = par.filenames[0];
+    string taxonomyDir = par.filenames[1];
+    string output = mappingFile + ".reads2taxon";
+    ofstream out(output);
+
+    vector<read2taxon> read2taxon;
+    NcbiTaxonomy *taxonomy = loadTaxonomy("", taxonomyDir);
+    cout << "Taxonomy loaded" << endl;
+
+    // Iterate through mapping file
+    ifstream mapping(mappingFile);
+    string line;
+    vector<TaxID> taxIds;
+    string previousRead = "";
+    double bestScore = -2;
+    TaxID bestTaxId = -1;
+    bool lastStored = false;
+
+    while (getline(mapping, line)) {
+        vector<string> tokens = Util::split(line, " ");
+        string currentRead = tokens[0];
+        if (currentRead == previousRead) { // Same read
+            // Get score
+            stringstream scoreString(tokens[13]);
+            double curScore = 0;
+            scoreString >> curScore;
+
+            if (curScore > bestScore) {
+                taxIds.clear();
+                bestScore = curScore;
+                bestTaxId = parseTaxId_metamaps(line);
+                taxIds.push_back(bestTaxId);
+            } else if (curScore == bestScore) {
+                taxIds.push_back(parseTaxId_metamaps(line));
+                bestTaxId = taxonomy->LCA(taxIds)->taxId;
+            }
+            lastStored = false;
+        } else { // New read
+            // Store results for previous read
+            // out << previousRead << "\t" << bestTaxId << endl;
+            read2taxon.push_back({previousRead, bestTaxId});
+            lastStored = true;
+
+            // Initialize variables
+            previousRead = currentRead;
+            taxIds.clear();
+
+            // Get score
+            stringstream scoreString(tokens[13]);
+            double curScore = 0;
+            scoreString >> curScore;
+
+            // Update variables
+            bestScore = curScore;
+            bestTaxId = parseTaxId_metamaps(line);
+            taxIds.push_back(bestTaxId);
+        }
+    }
+
+    if (!lastStored) {
+        // out << previousRead << "\t" << bestTaxId << endl;
+        read2taxon.push_back({previousRead, bestTaxId});
+    }
+
+    // Write to file
+    cout << "Writing to file" << endl;
+    for (size_t i = 1; i < read2taxon.size(); i++) {
+        out << read2taxon[i].read << "\t" << read2taxon[i].taxon << "\n";
+    }
+
+    return 0;
+}

From 3bb9c2c8de092252bb7e9c8ee67ae1c52db116ae Mon Sep 17 00:00:00 2001
From: Jaebeom Kim 
Date: Tue, 31 Oct 2023 23:13:41 +0900
Subject: [PATCH 53/65] max DNA Hamming dist. 
is set to 6

---
 src/commons/KmerMatcher.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp
index f99ccbd3..8a13ceb1 100644
--- a/src/commons/KmerMatcher.cpp
+++ b/src/commons/KmerMatcher.cpp
@@ -516,7 +516,7 @@ void KmerMatcher::compareDna(uint64_t query,
 
   // Select target k-mers that passed hamming criteria
   for (size_t h = 0; h < size; h++) {
-    if (hammingSums[h] <= minHammingSum + hammingMargin) {
+    if (hammingSums[h] <= 6) { // was: minHammingSum + hammingMargin
       selectedMatches.push_back(h);
       selectedHammingSum.push_back(hammingSums[h]);
       if (frame < 3) {

From 3533ff4493296b15f20debb829d6e0cc3fadd33a Mon Sep 17 00:00:00 2001
From: Jaebeom Kim 
Date: Tue, 31 Oct 2023 23:24:53 +0900
Subject: [PATCH 54/65] fix error in scoreTaxon: -1 --> 24

---
 src/commons/Taxonomer.cpp | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp
index 8d4bf351..2e28be47 100644
--- a/src/commons/Taxonomer.cpp
+++ b/src/commons/Taxonomer.cpp
@@ -356,7 +356,6 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery,
 }
 
 TaxID Taxonomer::lowerRankClassification(vector<Match> &matches, TaxID spTaxId) {
-    unordered_map<TaxID, unsigned int> taxCnt;
     size_t matchNum = matches.size();
 
@@ -1112,27 +1111,27 @@ TaxonScore Taxonomer::scoreTaxon(vector<const Match *> &filteredMatches,
     size_t matchNum = filteredMatches.size();
     size_t f = 0;
 
-    // Get the largest hamming distance at each position of query
+    // Get the smallest hamming distance at each position of query
     auto *hammingsAtEachPos = new signed char[aminoAcidNum + 1];
     memset(hammingsAtEachPos, 24, (aminoAcidNum + 1));
     while (f < matchNum) {
         currPos = filteredMatches[f]->qInfo.pos / 3;
         currHammings = filteredMatches[f]->rightEndHamming;
-        if (GET_2_BITS(currHammings) > hammingsAtEachPos[currPos + unmaskedPos[0]])
+        if (GET_2_BITS(currHammings) < hammingsAtEachPos[currPos + unmaskedPos[0]])
             hammingsAtEachPos[currPos + unmaskedPos[0]] = GET_2_BITS(currHammings);
-        if (GET_2_BITS(currHammings >> 2) > hammingsAtEachPos[currPos + unmaskedPos[1]])
+        if (GET_2_BITS(currHammings >> 2) < hammingsAtEachPos[currPos + unmaskedPos[1]])
             hammingsAtEachPos[currPos + unmaskedPos[1]] = GET_2_BITS(currHammings >> 2);
-        if (GET_2_BITS(currHammings >> 4) > hammingsAtEachPos[currPos + unmaskedPos[2]])
+        if (GET_2_BITS(currHammings >> 4) < hammingsAtEachPos[currPos + unmaskedPos[2]])
            hammingsAtEachPos[currPos + unmaskedPos[2]] = GET_2_BITS(currHammings >> 4);
-        if (GET_2_BITS(currHammings >> 6) > hammingsAtEachPos[currPos + unmaskedPos[3]])
+        if (GET_2_BITS(currHammings >> 6) < hammingsAtEachPos[currPos + unmaskedPos[3]])
             hammingsAtEachPos[currPos + unmaskedPos[3]] = GET_2_BITS(currHammings >> 6);
-        if (GET_2_BITS(currHammings >> 8) > hammingsAtEachPos[currPos + unmaskedPos[4]])
+        if (GET_2_BITS(currHammings >> 8) < hammingsAtEachPos[currPos + unmaskedPos[4]])
             hammingsAtEachPos[currPos + unmaskedPos[4]] = GET_2_BITS(currHammings >> 8);
-        if (GET_2_BITS(currHammings >> 10) > hammingsAtEachPos[currPos + unmaskedPos[5]])
+        if (GET_2_BITS(currHammings >> 10) < hammingsAtEachPos[currPos + unmaskedPos[5]])
            hammingsAtEachPos[currPos + unmaskedPos[5]] = GET_2_BITS(currHammings >> 10);
-        if (GET_2_BITS(currHammings >> 12) > hammingsAtEachPos[currPos + unmaskedPos[6]])
+        if (GET_2_BITS(currHammings >> 12) < hammingsAtEachPos[currPos + unmaskedPos[6]])
            hammingsAtEachPos[currPos + unmaskedPos[6]] = GET_2_BITS(currHammings >> 12);
-        if (GET_2_BITS(currHammings >> 14) > 
hammingsAtEachPos[currPos + unmaskedPos[7]]) + if (GET_2_BITS(currHammings >> 14) < hammingsAtEachPos[currPos + unmaskedPos[7]]) hammingsAtEachPos[currPos + unmaskedPos[7]] = GET_2_BITS(currHammings >> 14); f++; } @@ -1142,7 +1141,7 @@ TaxonScore Taxonomer::scoreTaxon(vector &filteredMatches, for (int h = 0; h < aminoAcidNum; h++) { if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist. coveredPosCnt++; - } else if (hammingsAtEachPos[h] != -1) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively + } else if (hammingsAtEachPos[h] != 24) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively hammingSum += 1.0f + (0.5f * hammingsAtEachPos[h]); coveredPosCnt++; } @@ -1205,7 +1204,7 @@ TaxonScore Taxonomer::scoreTaxon(vector &filteredMatches, if (h < aminoAcidNum_read1) { if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist. coveredPosCnt_read1++; - } else if (hammingsAtEachPos[h] != -1) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively + } else if (hammingsAtEachPos[h] != 24) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively hammingSum += 1.0f + (0.5f * (float) hammingsAtEachPos[h]); coveredPosCnt_read1++; } @@ -1214,7 +1213,7 @@ TaxonScore Taxonomer::scoreTaxon(vector &filteredMatches, else { if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist. coveredPosCnt_read2++; - } else if (hammingsAtEachPos[h] != -1) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively + } else if (hammingsAtEachPos[h] != 24) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively hammingSum += 1.0f + (0.5f * (float) hammingsAtEachPos[h]); coveredPosCnt_read2++; } From 0741eb4da5c73278e95e735e6fd403cf1a4bdf60 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Tue, 7 Nov 2023 19:51:21 +0900 Subject: [PATCH 55/65] first running version --- src/commons/Classifier.cpp | 3 +- src/commons/KmerExtractor.cpp | 2 +- src/commons/Match.h | 19 +- src/commons/Taxonomer.cpp | 1003 ++++++++++++++++++++------------- src/commons/Taxonomer.h | 53 +- 5 files changed, 682 insertions(+), 398 deletions(-) diff --git a/src/commons/Classifier.cpp b/src/commons/Classifier.cpp index 58130092..46eee5c6 100644 --- a/src/commons/Classifier.cpp +++ b/src/commons/Classifier.cpp @@ -102,7 +102,8 @@ void Classifier::startClassify(const LocalParameters &par) { kmerMatcher->matchKmers(&kmerBuffer, &matchBuffer); kmerMatcher->sortMatches(&matchBuffer); - // Classify queries based on the matches + // Classify queries based on the matches. 
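+        // (Uncommenting the next line forces single-threaded classification;
+        // presumably left here as a debugging aid.)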
+ // omp_set_num_threads(1); taxonomer->assignTaxonomy(matchBuffer.buffer, matchBuffer.startIndexOfReserve, queryList, par); processedSeqCnt += queryReadSplit[splitIdx].end - queryReadSplit[splitIdx].start; cout << "The number of processed sequences: " << processedSeqCnt << " (" << (double) processedSeqCnt / (double) numOfSeq << ")" << endl; diff --git a/src/commons/KmerExtractor.cpp b/src/commons/KmerExtractor.cpp index 232cc89a..30addeb5 100644 --- a/src/commons/KmerExtractor.cpp +++ b/src/commons/KmerExtractor.cpp @@ -196,7 +196,7 @@ void KmerExtractor::fillQueryKmerBufferParallel_paired(KSeqWrapper *kseq1, // Process Read 2 seqIterator2.sixFrameTranslation(maskedSeq2, (int) reads2[i].length()); seqIterator2.fillQueryKmerBuffer(maskedSeq2, (int) reads2[i].length(), kmerBuffer, posToWrite, - (uint32_t) queryIdx, queryList[queryIdx].queryLength); + (uint32_t) queryIdx, queryList[queryIdx].queryLength+3); if (maskMode) { delete[] maskedSeq1; diff --git a/src/commons/Match.h b/src/commons/Match.h index 436eb0bb..f47881ef 100644 --- a/src/commons/Match.h +++ b/src/commons/Match.h @@ -2,7 +2,9 @@ #define ADCLASSIFIER2_MATCH_H #include "Kmer.h" +#include #include +#include "BitManipulateMacros.h" struct Match { // 24 byte Match(){} @@ -26,7 +28,22 @@ struct Match { // 24 byte void printMatch() const { std::cout << qInfo.sequenceID << " " << qInfo.pos << " " << qInfo.frame << " " - << targetId << " " << genusId << " " << speciesId << " " << rightEndHamming << " " << (int)hamming << std::endl; + << targetId << " " << genusId << " " << speciesId << " " << rightEndHamming << " " << (int)hamming << " " << getScore() << std::endl; + } + + + float getScore(float score = 0.0f, int cnt = 0) const { + int currentHamming = GET_2_BITS(rightEndHamming >> cnt * 2); + if (currentHamming == 0) { + score += 3.0f; + } else { + score += 2.0f - 0.5f * currentHamming; + } + if (cnt == 7) { + return score; + } else { + return getScore(score, cnt + 1); + } } }; diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index 2e28be47..a3a3b3b5 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -1,6 +1,7 @@ #include "Taxonomer.h" #include "Match.h" #include "NcbiTaxonomy.h" +#include #include #include @@ -80,7 +81,6 @@ void Taxonomer::chooseBestTaxon2(uint32_t currentQuery, const Match *matchList, vector & queryList, const LocalParameters &par) { - TaxID selectedTaxon; // if (true) { // cout << "# " << currentQuery << " " << queryList[currentQuery].name << endl; @@ -92,7 +92,7 @@ void Taxonomer::chooseBestTaxon2(uint32_t currentQuery, // Get the best species for current query vector speciesMatches; speciesMatches.reserve(end - offset + 1); - TaxonScore speciesScore(0, 0, 0, 0); + TaxonScore speciesScore(0, 0, 0, 0, 0); if (par.seqMode == 2) { speciesScore = getBestSpeciesMatches(speciesMatches, matchList, end, offset, queryList[currentQuery].queryLength, @@ -121,22 +121,23 @@ void Taxonomer::chooseBestTaxon2(uint32_t currentQuery, return; } - // If there are two or more good genus level candidates, find the LCA. - if (speciesScore.taxId == 0) { - vector genusList; - genusList.reserve(speciesMatches.size()); - for (auto & genusMatch : speciesMatches) { - genusList.push_back(genusMatch.genusId); - } - selectedTaxon = taxonomy->LCA(genusList)->taxId; + // If there are two or more good species level candidates, find the LCA. 
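+    // (For example, if two species of the same genus tie for the best score,
+    // the read is assigned their lowest common ancestor, typically that genus.)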
+ if (speciesScore.LCA) { + // cout << "LCA" << endl; + // vector genusList; + // genusList.reserve(speciesMatches.size()); + // for (auto & genusMatch : speciesMatches) { + // genusList.push_back(genusMatch.genusId); + // } + // selectedTaxon = taxonomy->LCA(genusList)->taxId; queryList[currentQuery].isClassified = true; - queryList[currentQuery].classification = selectedTaxon; + queryList[currentQuery].classification = speciesScore.taxId; queryList[currentQuery].score = speciesScore.score; queryList[currentQuery].coverage = speciesScore.coverage; queryList[currentQuery].hammingDist = speciesScore.hammingDist; - for (auto & spMatch : speciesMatches) { - queryList[currentQuery].taxCnt[spMatch.targetId]++; - } + // for (auto & spMatch : speciesMatches) { + // queryList[currentQuery].taxCnt[spMatch.targetId]++; + // } return; } @@ -164,9 +165,10 @@ void Taxonomer::chooseBestTaxon2(uint32_t currentQuery, // return a.qInfo.position / 3 < b.qInfo.position / 3; // }); - sort(speciesMatches.begin(), speciesMatches.end(), - [](const Match & a, const Match & b) { return a.qInfo.pos < b.qInfo.pos; }); + // sort(speciesMatches.begin(), speciesMatches.end(), + // [](const Match & a, const Match & b) { return a.qInfo.pos < b.qInfo.pos; }); + cout << "7 " << currentQuery << endl; TaxID result = lowerRankClassification(speciesMatches, speciesScore.taxId); @@ -182,6 +184,7 @@ void Taxonomer::chooseBestTaxon2(uint32_t currentQuery, queryList[currentQuery].coverage = speciesScore.coverage; queryList[currentQuery].hammingDist = speciesScore.hammingDist; queryList[currentQuery].newSpecies = false; + cout << "8" << currentQuery << endl; // if (par.printLog) { // cout << "# " << currentQuery << endl; // for (size_t i = 0; i < genusMatches.size(); i++) { @@ -195,188 +198,192 @@ void Taxonomer::chooseBestTaxon2(uint32_t currentQuery, // } } -void Taxonomer::chooseBestTaxon(uint32_t currentQuery, - size_t offset, - size_t end, - const Match *matchList, - vector & queryList, - const LocalParameters &par) { - TaxID selectedTaxon; - -// if (true) { -// cout << "# " << currentQuery << " " << queryList[currentQuery].name << endl; -// for (size_t i = offset; i < end + 1; i++) { -// cout << matchList[i].targetId << " " << matchList[i].qInfo.frame << " " << matchList[i].qInfo.pos << " " << int(matchList[i].hamming) << " " << int(matchList[i].redundancy) << endl; -// } -// } +// void Taxonomer::chooseBestTaxon(uint32_t currentQuery, +// size_t offset, +// size_t end, +// const Match *matchList, +// vector & queryList, +// const LocalParameters &par) { +// TaxID selectedTaxon; + +// // if (true) { +// // cout << "# " << currentQuery << " " << queryList[currentQuery].name << endl; +// // for (size_t i = offset; i < end + 1; i++) { +// // cout << matchList[i].targetId << " " << matchList[i].qInfo.frame << " " << matchList[i].qInfo.pos << " " << int(matchList[i].hamming) << " " << int(matchList[i].redundancy) << endl; +// // } +// // } + +// // Get the best genus for current query +// vector genusMatches; +// genusMatches.reserve(end - offset + 1); +// TaxonScore genusScore(0, 0, 0, 0); +// if (par.seqMode == 2) { +// genusScore = getBestGenusMatches(genusMatches, matchList, end, offset, +// queryList[currentQuery].queryLength, +// queryList[currentQuery].queryLength2); +// } else { +// genusScore = getBestGenusMatches(genusMatches, matchList, end, offset, +// queryList[currentQuery].queryLength); +// } - // Get the best genus for current query - vector genusMatches; - genusMatches.reserve(end - offset + 1); - TaxonScore 
genusScore(0, 0, 0, 0); - if (par.seqMode == 2) { - genusScore = getBestGenusMatches(genusMatches, matchList, end, offset, - queryList[currentQuery].queryLength, - queryList[currentQuery].queryLength2); - } else { - genusScore = getBestGenusMatches(genusMatches, matchList, end, offset, - queryList[currentQuery].queryLength); - } +// // if (true) { +// // cout << "# " << currentQuery << " " << queryList[currentQuery].name << " filtered\n"; +// // for (size_t i = 0; i < genusMatches.size(); i++) { +// // cout << genusMatches[i].targetId << " " << genusMatches[i].qInfo.frame << " " << genusMatches[i].qInfo.pos << " " << int(genusMatches[i].hamming) << " " << int(genusMatches[i].redundancy) << endl; +// // } +// // cout << "Genus score: " << genusScore.score << "\n"; +// // } + +// // If there is no proper genus for current query, it is un-classified. +// if (genusScore.score == 0 || genusScore.coverage < par.minCoverage || genusScore.score < par.minScore) { +// queryList[currentQuery].isClassified = false; +// queryList[currentQuery].classification = 0; +// queryList[currentQuery].score = genusScore.score; +// queryList[currentQuery].coverage = genusScore.coverage; +// queryList[currentQuery].hammingDist = genusScore.hammingDist; +// queryList[currentQuery].newSpecies = false; +// return; +// } -// if (true) { -// cout << "# " << currentQuery << " " << queryList[currentQuery].name << " filtered\n"; -// for (size_t i = 0; i < genusMatches.size(); i++) { -// cout << genusMatches[i].targetId << " " << genusMatches[i].qInfo.frame << " " << genusMatches[i].qInfo.pos << " " << int(genusMatches[i].hamming) << " " << int(genusMatches[i].redundancy) << endl; +// // If there are two or more good genus level candidates, find the LCA. +// if (genusScore.taxId == 0) { +// vector genusList; +// genusList.reserve(genusMatches.size()); +// for (auto & genusMatch : genusMatches) { +// genusList.push_back(genusMatch.genusId); // } -// cout << "Genus score: " << genusScore.score << "\n"; -// } - - // If there is no proper genus for current query, it is un-classified. - if (genusScore.score == 0 || genusScore.coverage < par.minCoverage || genusScore.score < par.minScore) { - queryList[currentQuery].isClassified = false; - queryList[currentQuery].classification = 0; - queryList[currentQuery].score = genusScore.score; - queryList[currentQuery].coverage = genusScore.coverage; - queryList[currentQuery].hammingDist = genusScore.hammingDist; - queryList[currentQuery].newSpecies = false; - return; - } - - // If there are two or more good genus level candidates, find the LCA. 
- if (genusScore.taxId == 0) { - vector genusList; - genusList.reserve(genusMatches.size()); - for (auto & genusMatch : genusMatches) { - genusList.push_back(genusMatch.genusId); - } - selectedTaxon = taxonomy->LCA(genusList)->taxId; - queryList[currentQuery].isClassified = true; - queryList[currentQuery].classification = selectedTaxon; - queryList[currentQuery].score = genusScore.score; - queryList[currentQuery].coverage = genusScore.coverage; - queryList[currentQuery].hammingDist = genusScore.hammingDist; - for (auto & genusMatch : genusMatches) { - queryList[currentQuery].taxCnt[genusMatch.targetId]++; - } - return; - } +// selectedTaxon = taxonomy->LCA(genusList)->taxId; +// queryList[currentQuery].isClassified = true; +// queryList[currentQuery].classification = selectedTaxon; +// queryList[currentQuery].score = genusScore.score; +// queryList[currentQuery].coverage = genusScore.coverage; +// queryList[currentQuery].hammingDist = genusScore.hammingDist; +// for (auto & genusMatch : genusMatches) { +// queryList[currentQuery].taxCnt[genusMatch.targetId]++; +// } +// return; +// } - // Choose the species with the highest coverage. - TaxID selectedSpecies; - TaxonScore speciesScore; - vector species; - unordered_map> speciesMatchRange; - if (par.seqMode == 2) { - speciesScore = chooseSpecies(genusMatches, - queryList[currentQuery].queryLength, - queryList[currentQuery].queryLength2, - species, - speciesMatchRange); - } else { - speciesScore = chooseSpecies(genusMatches, - queryList[currentQuery].queryLength, - species, - speciesMatchRange); - } +// // Choose the species with the highest coverage. +// TaxID selectedSpecies; +// TaxonScore speciesScore; +// vector species; +// unordered_map> speciesMatchRange; +// if (par.seqMode == 2) { +// speciesScore = chooseSpecies(genusMatches, +// queryList[currentQuery].queryLength, +// queryList[currentQuery].queryLength2, +// species, +// speciesMatchRange); +// } else { +// speciesScore = chooseSpecies(genusMatches, +// queryList[currentQuery].queryLength, +// species, +// speciesMatchRange); +// } - // Classify to LCA if more than one species are selected - if (species.size() > 1) { - queryList[currentQuery].isClassified = true; - queryList[currentQuery].classification = taxonomy->LCA(species)->taxId; - queryList[currentQuery].score = genusScore.score; - queryList[currentQuery].coverage = genusScore.coverage; - queryList[currentQuery].hammingDist = genusScore.hammingDist; - for (auto & genusMatch : genusMatches) { - queryList[currentQuery].taxCnt[genusMatch.targetId]++; - } - return; - } +// // Classify to LCA if more than one species are selected +// if (species.size() > 1) { +// queryList[currentQuery].isClassified = true; +// queryList[currentQuery].classification = taxonomy->LCA(species)->taxId; +// queryList[currentQuery].score = genusScore.score; +// queryList[currentQuery].coverage = genusScore.coverage; +// queryList[currentQuery].hammingDist = genusScore.hammingDist; +// for (auto & genusMatch : genusMatches) { +// queryList[currentQuery].taxCnt[genusMatch.targetId]++; +// } +// return; +// } - // If score is not enough, classify to the parent of the selected species - if (speciesScore.score < par.minSpScore) { - queryList[currentQuery].isClassified = true; - queryList[currentQuery].classification = taxonomy->taxonNode( - taxonomy->getTaxIdAtRank(species[0], "species"))->parentTaxId; - queryList[currentQuery].score = genusScore.score; - queryList[currentQuery].coverage = genusScore.coverage; - queryList[currentQuery].hammingDist = 
genusScore.hammingDist; - for (auto & genusMatch : genusMatches) { - if(genusMatch.speciesId == species[0]){ - queryList[currentQuery].taxCnt[genusMatch.targetId]++; - } - } - return; - } +// // If score is not enough, classify to the parent of the selected species +// if (speciesScore.score < par.minSpScore) { +// queryList[currentQuery].isClassified = true; +// queryList[currentQuery].classification = taxonomy->taxonNode( +// taxonomy->getTaxIdAtRank(species[0], "species"))->parentTaxId; +// queryList[currentQuery].score = genusScore.score; +// queryList[currentQuery].coverage = genusScore.coverage; +// queryList[currentQuery].hammingDist = genusScore.hammingDist; +// for (auto & genusMatch : genusMatches) { +// if(genusMatch.speciesId == species[0]){ +// queryList[currentQuery].taxCnt[genusMatch.targetId]++; +// } +// } +// return; +// } - // Sort matches by the position of the query sequence - selectedSpecies = species[0]; -// sort(genusMatches.begin() + speciesMatchRange[selectedSpecies].first, -// genusMatches.begin() + speciesMatchRange[selectedSpecies].second, -// [](const Match & a, const Match & b) { -// if (a.qInfo.position / 3 == b.qInfo.position / 3) -// return a.hamming < b.hamming; -// else -// return a.qInfo.position / 3 < b.qInfo.position / 3; -// }); +// // Sort matches by the position of the query sequence +// selectedSpecies = species[0]; +// // sort(genusMatches.begin() + speciesMatchRange[selectedSpecies].first, +// // genusMatches.begin() + speciesMatchRange[selectedSpecies].second, +// // [](const Match & a, const Match & b) { +// // if (a.qInfo.position / 3 == b.qInfo.position / 3) +// // return a.hamming < b.hamming; +// // else +// // return a.qInfo.position / 3 < b.qInfo.position / 3; +// // }); - vector::const_iterator first = genusMatches.begin() + speciesMatchRange[selectedSpecies].first; - vector::const_iterator last = genusMatches.begin() + speciesMatchRange[selectedSpecies].second; - vector speciesMatches(first, last); +// vector::const_iterator first = genusMatches.begin() + speciesMatchRange[selectedSpecies].first; +// vector::const_iterator last = genusMatches.begin() + speciesMatchRange[selectedSpecies].second; +// vector speciesMatches(first, last); - sort(speciesMatches.begin(), speciesMatches.end(), - [](const Match & a, const Match & b) { return a.qInfo.pos < b.qInfo.pos; }); +// sort(speciesMatches.begin(), speciesMatches.end(), +// [](const Match & a, const Match & b) { return a.qInfo.pos < b.qInfo.pos; }); - TaxID result = lowerRankClassification(speciesMatches, selectedSpecies); +// TaxID result = lowerRankClassification(speciesMatches, selectedSpecies); - // Record matches of selected species - for (size_t i = speciesMatchRange[selectedSpecies].first; i < speciesMatchRange[selectedSpecies].second; i++) { - queryList[currentQuery].taxCnt[genusMatches[i].targetId]++; - } +// // Record matches of selected species +// for (size_t i = speciesMatchRange[selectedSpecies].first; i < speciesMatchRange[selectedSpecies].second; i++) { +// queryList[currentQuery].taxCnt[genusMatches[i].targetId]++; +// } - // Store classification results - queryList[currentQuery].isClassified = true; - queryList[currentQuery].classification = result; - queryList[currentQuery].score = speciesScore.score; - queryList[currentQuery].coverage = speciesScore.coverage; - queryList[currentQuery].hammingDist = speciesScore.hammingDist; - queryList[currentQuery].newSpecies = false; -// if (par.printLog) { -// cout << "# " << currentQuery << endl; -// for (size_t i = 0; i < 
genusMatches.size(); i++) { -// cout << i << " " << genusMatches[i].qInfo.pos << " " << -// genusMatches[i].targetId << " " << int(genusMatches[i].hamming) << endl; -// } -// cout << "Score: " << speciesScore.score << " " << selectedSpecies << " " -// << taxonomy->getString(taxonomy->taxonNode(selectedSpecies)->rankIdx) -// -// << endl; -// } -} +// // Store classification results +// queryList[currentQuery].isClassified = true; +// queryList[currentQuery].classification = result; +// queryList[currentQuery].score = speciesScore.score; +// queryList[currentQuery].coverage = speciesScore.coverage; +// queryList[currentQuery].hammingDist = speciesScore.hammingDist; +// queryList[currentQuery].newSpecies = false; +// // if (par.printLog) { +// // cout << "# " << currentQuery << endl; +// // for (size_t i = 0; i < genusMatches.size(); i++) { +// // cout << i << " " << genusMatches[i].qInfo.pos << " " << +// // genusMatches[i].targetId << " " << int(genusMatches[i].hamming) << endl; +// // } +// // cout << "Score: " << speciesScore.score << " " << selectedSpecies << " " +// // << taxonomy->getString(taxonomy->taxonNode(selectedSpecies)->rankIdx) +// // +// // << endl; +// // } +// } TaxID Taxonomer::lowerRankClassification(vector &matches, TaxID spTaxId) { unordered_map taxCnt; size_t matchNum = matches.size(); + // cout << spTaxId << endl; + for (size_t i = 0; i < matchNum; i++) { - size_t currQuotient = matches[i].qInfo.pos / 3; - uint8_t minHamming = 0; //matches[i].hamming; - Match * minHammingMatch = & matches[i]; - TaxID minHammingTaxId = minHammingMatch->targetId; - while ((i < matchNum) && (currQuotient == matches[i].qInfo.pos / 3)) { - if (matches[i].hamming < minHamming) { - minHamming = matches[i].hamming; - minHammingMatch = & matches[i]; - minHammingTaxId = minHammingMatch->targetId; - } else if (matches[i].hamming == minHamming) { - minHammingTaxId = taxonomy->LCA(minHammingTaxId, matches[i].targetId); - minHammingMatch->redundancy = true; - matches[i].redundancy = true; - } - i++; - } - taxCnt[minHammingTaxId]++; + // cout << matches[i].targetId << endl; + taxCnt[matches[i].targetId] ++; + // size_t currQuotient = matches[i].qInfo.pos / 3; + // uint8_t minHamming = 0; //matches[i].hamming; + // Match * minHammingMatch = & matches[i]; + // TaxID minHammingTaxId = minHammingMatch->targetId; + // while ((i < matchNum) && (currQuotient == matches[i].qInfo.pos / 3)) { + // if (matches[i].hamming < minHamming) { + // minHamming = matches[i].hamming; + // minHammingMatch = & matches[i]; + // minHammingTaxId = minHammingMatch->targetId; + // } else if (matches[i].hamming == minHamming) { + // minHammingTaxId = taxonomy->LCA(minHammingTaxId, matches[i].targetId); + // minHammingMatch->redundancy = true; + // matches[i].redundancy = true; + // } + // i++; + // } + // taxCnt[minHammingTaxId]++; } // int i = matchRange.second - 1; @@ -402,10 +409,16 @@ TaxID Taxonomer::lowerRankClassification(vector &matches, TaxID spTaxId) // } unordered_map cladeCnt; + // cout << "8" << endl; getSpeciesCladeCounts(taxCnt, cladeCnt, spTaxId); - + // // print cladeCnt + // for (auto it = cladeCnt.begin(); it != cladeCnt.end(); it++) { + // cout << it->first << " " << it->second.taxCount << " " << it->second.cladeCount << endl; + // } + // cout << "9" << endl; if (accessionLevel == 2) { // Don't do accession-level classification // Remove leaf nodes + // cout << "10" << endl; for (auto it = cladeCnt.begin(); it != cladeCnt.end(); it++) { TaxonNode const * taxon = taxonomy->taxonNode(it->first); if 
(strcmp(taxonomy->getString(taxon->rankIdx), "") == 0) { @@ -415,8 +428,10 @@ TaxID Taxonomer::lowerRankClassification(vector &matches, TaxID spTaxId) it->first)); } } + return BFS(cladeCnt, spTaxId); } else { + // cout << "10-2" << endl; return BFS(cladeCnt, spTaxId); } } @@ -479,6 +494,7 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, size_t i = offset; uint8_t curFrame; vector curFrameMatches; + vector matchPaths; while (i < end + 1) { currentSpecies = matchList[i].speciesId; @@ -492,7 +508,7 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, i ++; } if (curFrameMatches.size() > 1) { - remainConsecutiveMatches(curFrameMatches, filteredMatches, currentSpecies); + remainConsecutiveMatches(curFrameMatches, matchPaths, currentSpecies); } } // Construct a match combination using filtered matches of current species @@ -549,6 +565,10 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, size_t i = offset; uint8_t curFrame; vector curFrameMatches; + vector matchPaths; + unordered_map species2score; + unordered_map> species2matchPaths; + float bestSpScore = 0; while (i < end + 1) { currentSpecies = matchList[i].speciesId; @@ -562,139 +582,247 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, i ++; } if (curFrameMatches.size() > 1) { - remainConsecutiveMatches(curFrameMatches, filteredMatches, currentSpecies); + // cout << "1" << endl; + remainConsecutiveMatches(curFrameMatches, matchPaths, currentSpecies); } } - // Construct a match combination using filtered matches of current species + // Combine MatchPaths // so that it can best cover the query, and score the combination - if (!filteredMatches.empty()) { - matchesForEachSpecies.push_back(filteredMatches); - speciesScores.push_back(scoreTaxon(filteredMatches, currentSpecies, readLength1, readLength2)); + if (!matchPaths.empty()) { + // Initialize species2matchPaths + species2matchPaths[currentSpecies].emplace_back(0, 0, 0, 0); + // cout << "2" << endl; + cout << currentSpecies << endl; + float score = combineMatchPaths(matchPaths, species2matchPaths[currentSpecies], readLength1 + readLength2); + cout << endl; + species2score[currentSpecies] = score; + if (score > bestSpScore) { + bestSpScore = score; + } } - filteredMatches.clear(); + matchPaths.clear(); } // If there are no meaningful species - if (speciesScores.empty()) { + if (species2score.empty()) { bestScore.score = 0; return bestScore; } - - TaxonScore maxScore = *max_element(speciesScores.begin(), speciesScores.end(), - [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); - - vector maxIdx; - for (size_t g = 0; g < speciesScores.size(); g++) { - if (speciesScores[g].score == maxScore.score) { - maxIdx.push_back(g); + // cout << "4" << endl; + vector maxSpecies; + for (auto & spScore : species2score) { + cout << spScore.first << " " << spScore.second << endl; + if (spScore.second == bestSpScore) { + maxSpecies.push_back(spScore.first); } } - bestScore = maxScore; - - for (unsigned long g : maxIdx) { - for (const Match * m : matchesForEachSpecies[g]) { - speciesMatches.push_back(*m); + // cout << "5" << endl; + // More than one species --> LCA + if (maxSpecies.size() > 1) { + bestScore.LCA = true; + bestScore.taxId = taxonomy->LCA(maxSpecies)->taxId; + for (auto & sp : maxSpecies) { + bestScore.score += species2score[sp]; } + bestScore.score /= maxSpecies.size(); + return bestScore; } - - // More than one species - if (maxIdx.size() > 1) { - bestScore.taxId = 0; + + // One species + 
bestScore.taxId = maxSpecies[0];
+    bestScore.score = species2score[maxSpecies[0]];
+    float coveredLength = 0.f;
+    int hammingDist = 0;
+    for (auto & matchPath : species2matchPaths[maxSpecies[0]]) {
+        // cout << "here" << endl;
+        coveredLength += matchPath.end - matchPath.start + 1;
+        hammingDist += matchPath.hammingDist;
+        for (auto match : matchPath.matches) {
+            // cout << match->targetId << endl;
+            // match->printMatch();
+            speciesMatches.push_back(*match);
+            // speciesMatches.back().printMatch();
+        }
     }
+    bestScore.coverage = coveredLength / (readLength1 + readLength2);
+    bestScore.hammingDist = hammingDist;
+//    cout << "6" << endl;
     return bestScore;
 }
 
-TaxonScore Taxonomer::getBestGenusMatches(vector<Match> &genusMatches, const Match *matchList, size_t end,
-                                          size_t offset, int readLength1, int readLength2) {
-    TaxID currentGenus;
-    TaxID currentSpecies;
-
-    vector<const Match *> filteredMatches;
-    vector<vector<const Match *>> matchesForEachGenus;
-    vector<TaxonScore> genusScores;
-    TaxonScore bestScore;
-    size_t i = offset;
-    uint8_t curFrame;
-    vector<const Match *> curFrameMatches;
-    while (i < end + 1) {
-//        currentGenus = taxId2genusId[matchList[i].targetId];
-        currentGenus = matchList[i].genusId;
-        // For current genus
-        while ((i < end + 1) && currentGenus == matchList[i].genusId) {
-//            currentSpecies = taxId2speciesId[matchList[i].targetId];
-            currentSpecies = matchList[i].speciesId;
-//            if (par.printLog) {
-//                cout << currentGenus << " " << currentSpecies << endl;
-//            }
-            // For current species
-            while ((i < end + 1) && currentSpecies == matchList[i].speciesId) {
-                curFrame = matchList[i].qInfo.frame;
-                curFrameMatches.clear();
-
-                // For current frame
-                while ((i < end + 1) && currentSpecies == matchList[i].speciesId
-                       && curFrame == matchList[i].qInfo.frame) {
-                    curFrameMatches.push_back(&matchList[i]);
-                    i ++;
-                }
-                if (curFrameMatches.size() > 1) {
-                    remainConsecutiveMatches(curFrameMatches, filteredMatches, currentGenus);
+float Taxonomer::combineMatchPaths(vector<MatchPath> & matchPaths,
+                                   vector<MatchPath> & combinedMatchPaths,
+                                   int readLength) {
+    combinedMatchPaths.clear();
+    // Sort matchPaths by their score
+    sort(matchPaths.begin(), matchPaths.end(),
+         [](const MatchPath & a, const MatchPath & b) { return a.score > b.score;});
+
+    // Combine matchPaths
+    // 1. Add the matchPath with the highest score to combinedMatchPaths
+    // 2. Add the matchPath with the highest score that is not overlapped with the matchPath in combinedMatchPaths
+    // 3. Repeat 2 until no matchPath can be added
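+    // For intuition, a toy run of this greedy selection (illustrative values only):
+    // sorted paths A[0, 104] score 30, B[84, 188] score 27, C[120, 200] score 25.
+    // A is added first; B overlaps A and is skipped; C starts after A ends and is
+    // added, so the returned score is (30 + 25) / readLength.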
+    for (size_t i = 0; i < matchPaths.size(); i++) {
+        cout << matchPaths[i].start << " " << matchPaths[i].end << " " << matchPaths[i].score << " " << matchPaths[i].matches.back()->targetId << " " << matchPaths[i].matches.back()->qInfo.frame << endl;
+        if (combinedMatchPaths.empty()) {
+            combinedMatchPaths.push_back(matchPaths[i]);
+            // for (auto & match : matchPaths[i].matches) {
+            //     match->printMatch();
+            // }
+        } else {
+            bool isOverlapped = false;
+            for (size_t j = 0; j < combinedMatchPaths.size(); j++) {
+                if (!isMatchPathNotOverlapped(matchPaths[i], combinedMatchPaths[j])) {
+                    isOverlapped = true;
+                    break;
+                } else {
+                    // cout << matchPaths[i].start << " " << matchPaths[i].end << endl;
+                    // cout << combinedMatchPaths[j].start << " " << combinedMatchPaths[j].end << endl << endl;;
+                }
+            }
+            if (!isOverlapped) {
+                combinedMatchPaths.push_back(matchPaths[i]);
+                combinedMatchPaths.back().matches = matchPaths[i].matches;
+                // cout << matchPaths[i].start << " " << matchPaths[i].end << " " << matchPaths[i].score << endl;
+                // for (auto & match : matchPaths[i].matches) {
+                //     match->printMatch();
+                // }
+            }
+        }
+    }
-
-    // Construct a match combination using filtered matches of current genus
-    // so that it can best cover the query, and score the combination
-    if (!filteredMatches.empty()) {
-        matchesForEachGenus.push_back(filteredMatches);
-        genusScores.push_back(scoreTaxon(filteredMatches, currentGenus, readLength1, readLength2));
-    }
-    filteredMatches.clear();
-    }
+    // cout << endl;

-    // If there are no meaningful genus
-    if (genusScores.empty()) {
-        bestScore.score = 0;
-        return bestScore;
-    }
-    TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(),
-                                       [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; });
-    vector<size_t> maxIdx;
-    for (size_t g = 0; g < genusScores.size(); g++) {
-        if (genusScores[g].score > maxScore.score * 0.95f) {
-            maxIdx.push_back(g);
-        }
-    }
-    bestScore = maxScore;
+    // Calculate the score of combinedMatchPaths
+    float score = 0;
+    for (auto & matchPath : combinedMatchPaths) {
+        score += matchPath.score;
+    }
+    return score / readLength;
+}

-    for (unsigned long g : maxIdx) {
-        for (const Match * m : matchesForEachGenus[g]) {
-            genusMatches.push_back(*m);
-        }
-    }
+bool Taxonomer::isMatchPathNotOverlapped(const MatchPath & matchPath1,
+                                         const MatchPath & matchPath2) {
+    return (matchPath1.end < matchPath2.start) || (matchPath2.end < matchPath1.start);
+}
\
+// if (matchPath1.start > matchPath2.start) {
+//     return isMatchPathOverlapped(matchPath2, matchPath1, readLength);
+// }
+// if (matchPath1.end < matchPath2.start) {
+//     return false;
+// }
+// if (matchPath1.endPos >= matchPath2.startPos) {
+//     if (matchPath1.endPos <= matchPath2.endPos) {
+//         return true;
+//     } else {
+//         if (matchPath1.startPos + readLength - 1 >= matchPath2.startPos) {
+//             return true;
+//         } else {
+//             return false;
+//         }
+//     }
+// }
+// return false;
+// }

-    // More than one genus
-    if (maxIdx.size() > 1) {
-        bestScore.taxId = 0;
-        return bestScore;
-    }
-    return bestScore;

-    //Three cases
-    //1. one genus
-    //2. more than one genus
-    //4.
no genus -} +// vector filteredMatches; +// vector> matchesForEachGenus; +// vector genusScores; +// TaxonScore bestScore; +// size_t i = offset; +// uint8_t curFrame; +// vector curFrameMatches; +// while (i < end + 1) { +// // currentGenus = taxId2genusId[matchList[i].targetId]; +// currentGenus = matchList[i].genusId; +// // For current genus +// while ((i < end + 1) && currentGenus == matchList[i].genusId) { +// // currentSpecies = taxId2speciesId[matchList[i].targetId]; +// currentSpecies = matchList[i].speciesId; +// // if (par.printLog) { +// // cout << currentGenus << " " << currentSpecies << endl; +// // } +// // For current species +// while ((i < end + 1) && currentSpecies == matchList[i].speciesId) { +// curFrame = matchList[i].qInfo.frame; +// curFrameMatches.clear(); + +// // For current frame +// while ((i < end + 1) && currentSpecies == matchList[i].speciesId +// && curFrame == matchList[i].qInfo.frame) { +// curFrameMatches.push_back(&matchList[i]); +// i ++; +// } +// if (curFrameMatches.size() > 1) { +// remainConsecutiveMatches(curFrameMatches, filteredMatches, currentGenus); +// } +// } +// } -void Taxonomer::remainConsecutiveMatches(vector & curFrameMatches, - vector & filteredMatches, - TaxID genusId) { +// // Construct a match combination using filtered matches of current genus +// // so that it can best cover the query, and score the combination +// if (!filteredMatches.empty()) { +// matchesForEachGenus.push_back(filteredMatches); +// genusScores.push_back(scoreTaxon(filteredMatches, currentGenus, readLength1, readLength2)); +// } +// filteredMatches.clear(); +// } + +// // If there are no meaningful genus +// if (genusScores.empty()) { +// bestScore.score = 0; +// return bestScore; +// } + +// TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), +// [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); + +// vector maxIdx; +// for (size_t g = 0; g < genusScores.size(); g++) { +// if (genusScores[g].score > maxScore.score * 0.95f) { +// maxIdx.push_back(g); +// } +// } +// bestScore = maxScore; + +// for (unsigned long g : maxIdx) { +// for (const Match * m : matchesForEachGenus[g]) { +// genusMatches.push_back(*m); +// } +// } + + + +// // More than one genus +// if (maxIdx.size() > 1) { +// bestScore.taxId = 0; +// return bestScore; +// } + +// return bestScore; + +// //Three cases +// //1. one genus +// //2. more than one genus +// //4. 
no genus +// } + +void Taxonomer::remainConsecutiveMatches(const vector & curFrameMatches, + vector & matchPaths, + TaxID genusId) { size_t i = 0; size_t end = curFrameMatches.size(); vector> curPosMatches; // @@ -741,18 +869,61 @@ void Taxonomer::remainConsecutiveMatches(vector & curFrameMatches // } // } - // Iterate linkedMatches to get filteredMatches - int MIN_DEPTH = minConsCnt - 1; + // Iterate linkedMatches to get filteredMatches + //(ignore matches not enoughly consecutive) + size_t MIN_DEPTH = minConsCnt - 1; if (taxonomy->IsAncestor(eukaryotaTaxId, genusId)) { MIN_DEPTH = minConsCntEuk - 1; } unordered_set used; - vector filteredMatchIdx; - unordered_map idx2depth; + unordered_map idx2depthScore; + // unordered_map edges; + unordered_map edges; + for (const auto& entry : linkedMatches) { if (!used.count(entry.first)) { used.insert(entry.first); - DFS(entry.first, linkedMatches, filteredMatchIdx, 0, MIN_DEPTH, used, idx2depth); + depthScore bestPath{}; + size_t bestNextIdx = 0; + float curScore = curFrameMatches[entry.first]->getScore(); + // cout << curFrameMatches[entry.first] + for (size_t j = 0; j < entry.second.size(); j++) { + used.insert(entry.second[j]); + depthScore curPath = DFS(curFrameMatches, + entry.second[j], + linkedMatches, + 1, + MIN_DEPTH, + used, + idx2depthScore, + edges, + curScore, 0); + if (curPath.score > bestPath.score && curPath.depth > MIN_DEPTH) { + bestNextIdx = entry.second[j]; + bestPath = curPath; + } + } + // Store the best path + if (bestPath.depth > MIN_DEPTH) { + // cout << entry.first << endl; + // curFrameMatches[entry.first]->printMatch(); + matchPaths.emplace_back(curFrameMatches[entry.first]->qInfo.pos, // start coordinate on query + curFrameMatches[entry.first]->qInfo.pos + bestPath.depth * 3 + 20, // end coordinate on query + bestPath.score, bestPath.hammingDist); + const Match * curMatch = curFrameMatches[entry.first]; + edges[curMatch] = curFrameMatches[bestNextIdx]; + matchPaths.back().matches.push_back(curMatch); + // curMatch = edges[curMatch]; + // edges2[curFrameMatches[entry.first]] = curFrameMatches[bestNextIdx]; + // Retrieve the best path + // cout << bestPath.depth << endl; + while (edges.find(curMatch) != edges.end()) { + // cout << curMatch << " "; + matchPaths.back().matches.push_back(edges[curMatch]); + curMatch = edges[curMatch]; + } + // cout << endl; + } } } @@ -763,43 +934,92 @@ void Taxonomer::remainConsecutiveMatches(vector & curFrameMatches // } // cout << endl; // } - - for (auto &idx: filteredMatchIdx) { - filteredMatches.push_back(curFrameMatches[idx]); - } } +// size_t Taxonomer::DFS(size_t curMatchIdx, const map> & linkedMatches, +// vector& filteredMatches, size_t depth, size_t MIN_DEPTH, unordered_set& used, +// unordered_map & idx2depth) { +// depth++; +// size_t maxDepth = 0; +// size_t returnDepth = 0; +// if (linkedMatches.find(curMatchIdx) == linkedMatches.end()) { +// // reached a leaf node +// idx2depth[curMatchIdx] = depth; +// if (depth > MIN_DEPTH) { +// filteredMatches.push_back(curMatchIdx); +// } +// return depth; +// } else { // not a leaf node +// for (auto &nextMatchIdx: linkedMatches.at(curMatchIdx)) { +// used.insert(nextMatchIdx); +// if (idx2depth.find(nextMatchIdx) != idx2depth.end()) { +// returnDepth = idx2depth[nextMatchIdx]; +// maxDepth = max(maxDepth, returnDepth); +// continue; +// } +// returnDepth = DFS(nextMatchIdx, linkedMatches, filteredMatches, depth, MIN_DEPTH, used, idx2depth); +// maxDepth = max(maxDepth, returnDepth); +// } +// if (maxDepth > MIN_DEPTH) { +// 
filteredMatches.push_back(curMatchIdx); +// idx2depth[curMatchIdx] = maxDepth; +// } +// } +// return maxDepth; +// } -size_t Taxonomer::DFS(size_t curMatchIdx, const map> & linkedMatches, - vector& filteredMatches, size_t depth, size_t MIN_DEPTH, unordered_set& used, - unordered_map & idx2depth) { +// return: end +depthScore Taxonomer::DFS(const vector &matches, + size_t curMatchIdx, + const map> &linkedMatches, + size_t depth, size_t MIN_DEPTH, + unordered_set &used, + unordered_map &idx2depthScore, + unordered_map & edges, float score, int hammingDist) { depth++; - size_t maxDepth = 0; - size_t returnDepth = 0; - if (linkedMatches.find(curMatchIdx) == linkedMatches.end()) { - // reached a leaf node - idx2depth[curMatchIdx] = depth; - if (depth > MIN_DEPTH) { - filteredMatches.push_back(curMatchIdx); + depthScore bestDepthScore = depthScore(0, 0, 0); + depthScore returnDepthScore; + if (linkedMatches.find(curMatchIdx) == linkedMatches.end()) { // reached a leaf node + uint8_t lastEndHamming = (matches[curMatchIdx]->rightEndHamming >> 14); + if (lastEndHamming == 0) { + score += 3.0f; + } else { + score += 2.0f - 0.5f * lastEndHamming; } - return depth; + idx2depthScore[curMatchIdx] = depthScore(depth, score, hammingDist + lastEndHamming); + return depthScore(depth, score, hammingDist + lastEndHamming); } else { // not a leaf node + uint8_t lastEndHamming = (matches[curMatchIdx]->rightEndHamming >> 14); + if (lastEndHamming == 0) { + score += 3.0f; + } else { + score += 2.0f - 0.5f * lastEndHamming; + } for (auto &nextMatchIdx: linkedMatches.at(curMatchIdx)) { used.insert(nextMatchIdx); - if (idx2depth.find(nextMatchIdx) != idx2depth.end()) { - returnDepth = idx2depth[nextMatchIdx]; - maxDepth = max(maxDepth, returnDepth); + + // Reuse the depth score of nextMatchIdx if it has been calculated + if (idx2depthScore.find(nextMatchIdx) != idx2depthScore.end()) { + returnDepthScore = idx2depthScore[nextMatchIdx]; + if (returnDepthScore.score > bestDepthScore.score + && returnDepthScore.depth > MIN_DEPTH) { + bestDepthScore = returnDepthScore; + edges[matches[curMatchIdx]] = matches[nextMatchIdx]; + } continue; } - returnDepth = DFS(nextMatchIdx, linkedMatches, filteredMatches, depth, MIN_DEPTH, used, idx2depth); - maxDepth = max(maxDepth, returnDepth); - } - if (maxDepth > MIN_DEPTH) { - filteredMatches.push_back(curMatchIdx); - idx2depth[curMatchIdx] = maxDepth; + returnDepthScore = DFS(matches, nextMatchIdx, linkedMatches, depth, MIN_DEPTH, used, idx2depthScore, edges, score, hammingDist + lastEndHamming); + if (returnDepthScore.score > bestDepthScore.score + && returnDepthScore.depth > MIN_DEPTH) { + bestDepthScore = returnDepthScore; + edges[matches[curMatchIdx]] = matches[nextMatchIdx]; + } + } + if (bestDepthScore.depth > MIN_DEPTH) { + idx2depthScore[curMatchIdx] = bestDepthScore; } } - return maxDepth; + return bestDepthScore; } // TaxonScore Taxonomer::getBestGenusMatches_spaced(vector &genusMatches, const Match *matchList, size_t end, @@ -910,87 +1130,87 @@ size_t Taxonomer::DFS(size_t curMatchIdx, const map> & li // //4. 
no genus // } -TaxonScore Taxonomer::getBestGenusMatches(vector &genusMatches, const Match *matchList, size_t end, - size_t offset, int queryLength) { - TaxID currentGenus; - TaxID currentSpecies; +// TaxonScore Taxonomer::getBestGenusMatches(vector &genusMatches, const Match *matchList, size_t end, +// size_t offset, int queryLength) { +// TaxID currentGenus; +// TaxID currentSpecies; - vector filteredMatches; - vector> matchesForEachGenus; - vector genusScores; - TaxonScore bestScore; - size_t i = offset; - uint8_t curFrame; - vector curFrameMatches; - while (i < end + 1) { - currentGenus = matchList[i].genusId; - // For current genus - while ((i < end + 1) && currentGenus == matchList[i].genusId) { - currentSpecies = matchList[i].speciesId; - - // For current species - while ((i < end + 1) && currentSpecies == matchList[i].speciesId) { - curFrame = matchList[i].qInfo.frame; - curFrameMatches.clear(); - - // For current frame - while ((i < end + 1) && currentSpecies == matchList[i].speciesId - && curFrame == matchList[i].qInfo.frame) { - curFrameMatches.push_back(&matchList[i]); - i ++; - } - if (curFrameMatches.size() > 1) { - remainConsecutiveMatches(curFrameMatches, filteredMatches, currentGenus); - } - } - } +// vector filteredMatches; +// vector> matchesForEachGenus; +// vector genusScores; +// TaxonScore bestScore; +// size_t i = offset; +// uint8_t curFrame; +// vector curFrameMatches; +// while (i < end + 1) { +// currentGenus = matchList[i].genusId; +// // For current genus +// while ((i < end + 1) && currentGenus == matchList[i].genusId) { +// currentSpecies = matchList[i].speciesId; - // Construct a match combination using filtered matches of current genus - // so that it can best cover the query, and score the combination +// // For current species +// while ((i < end + 1) && currentSpecies == matchList[i].speciesId) { +// curFrame = matchList[i].qInfo.frame; +// curFrameMatches.clear(); + +// // For current frame +// while ((i < end + 1) && currentSpecies == matchList[i].speciesId +// && curFrame == matchList[i].qInfo.frame) { +// curFrameMatches.push_back(&matchList[i]); +// i ++; +// } +// if (curFrameMatches.size() > 1) { +// remainConsecutiveMatches(curFrameMatches, filteredMatches, currentGenus); +// } +// } +// } - if (!filteredMatches.empty()) { - matchesForEachGenus.push_back(filteredMatches); - genusScores.push_back(scoreTaxon(filteredMatches, currentGenus, queryLength)); - } - filteredMatches.clear(); - } +// // Construct a match combination using filtered matches of current genus +// // so that it can best cover the query, and score the combination - // If there are no meaningful genus - if (genusScores.empty()) { - bestScore.score = 0; - return bestScore; - } +// if (!filteredMatches.empty()) { +// matchesForEachGenus.push_back(filteredMatches); +// genusScores.push_back(scoreTaxon(filteredMatches, currentGenus, queryLength)); +// } +// filteredMatches.clear(); +// } - TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), - [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); +// // If there are no meaningful genus +// if (genusScores.empty()) { +// bestScore.score = 0; +// return bestScore; +// } - vector maxIdx; - for (size_t g = 0; g < genusScores.size(); g++) { - if (genusScores[g].score > maxScore.score * 0.95f) { - maxIdx.push_back(g); - } - } +// TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), +// [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); - 
bestScore = maxScore; +// vector maxIdx; +// for (size_t g = 0; g < genusScores.size(); g++) { +// if (genusScores[g].score > maxScore.score * 0.95f) { +// maxIdx.push_back(g); +// } +// } - for (unsigned long g : maxIdx) { - for (const Match * m : matchesForEachGenus[g]) { - genusMatches.push_back(*m); - } - } +// bestScore = maxScore; - // More than one genus - if (maxIdx.size() > 1) { - bestScore.taxId = 0; - return bestScore; - } - return bestScore; +// for (unsigned long g : maxIdx) { +// for (const Match * m : matchesForEachGenus[g]) { +// genusMatches.push_back(*m); +// } +// } - //Three cases - //1. one genus - //2. more than one genus - //4. no genus -} +// // More than one genus +// if (maxIdx.size() > 1) { +// bestScore.taxId = 0; +// return bestScore; +// } +// return bestScore; + +// //Three cases +// //1. one genus +// //2. more than one genus +// //4. no genus +// } // TaxonScore Taxonomer::getBestGenusMatches_spaced(vector &genusMatches, const Match *matchList, size_t end, // size_t offset, int readLength) { @@ -1154,7 +1374,7 @@ TaxonScore Taxonomer::scoreTaxon(vector &filteredMatches, float score = ((float) coveredLength - hammingSum) / (float) queryLength; float coverage = (float) (coveredLength) / (float) (queryLength); - return {taxId, score, coverage, (int) hammingSum}; + return {taxId, score, coverage, (int) hammingSum, 0}; } TaxonScore Taxonomer::scoreTaxon(vector &filteredMatches, @@ -1163,7 +1383,6 @@ TaxonScore Taxonomer::scoreTaxon(vector &filteredMatches, int readLength2) { // Calculate Hamming distance & covered length - uint16_t currHammings; int aminoAcidNum_total = ((int) readLength1 / 3) + ((int) readLength2 / 3); int aminoAcidNum_read1 = ((int) readLength1 / 3); int currPos; @@ -1174,8 +1393,18 @@ TaxonScore Taxonomer::scoreTaxon(vector &filteredMatches, auto *hammingsAtEachPos = new signed char[aminoAcidNum_total + 3]; memset(hammingsAtEachPos, 24, (aminoAcidNum_total + 3)); while (f < matchNum) { + uint8_t minHammingDist = 24; + uint16_t currHammings = 0; currPos = (int) filteredMatches[f]->qInfo.pos / 3; - currHammings = filteredMatches[f]->rightEndHamming; + // Find the closest match at current position + while ((f < matchNum) && currPos == (int) filteredMatches[f]->qInfo.pos / 3) { + if (filteredMatches[f]->hamming < minHammingDist) { + minHammingDist = filteredMatches[f]->hamming; + currHammings = filteredMatches[f]->rightEndHamming; + } + f++; + } + // Update hamming distance at each position if (GET_2_BITS(currHammings) < hammingsAtEachPos[currPos + unmaskedPos[0]]) hammingsAtEachPos[currPos + unmaskedPos[0]] = GET_2_BITS(currHammings); if (GET_2_BITS(currHammings >> 2) < hammingsAtEachPos[currPos + unmaskedPos[1]]) @@ -1231,7 +1460,7 @@ TaxonScore Taxonomer::scoreTaxon(vector &filteredMatches, float coverage = (float) (coveredLength_read1 + coveredLength_read2) / (float) (readLength1 + readLength2); // matchesForEachGenus.push_back(move(filteredMatches)); - return {taxId, score, coverage, (int) hammingSum}; + return {taxId, score, coverage, (int) hammingSum, 0}; } TaxonScore Taxonomer::chooseSpecies(const vector &matches, @@ -1363,7 +1592,7 @@ TaxonScore Taxonomer::scoreSpecies(const vector &matches, float score = ((float)coveredLength - hammingSum) / (float) queryLength; float coverage = (float) coveredLength / (float) (queryLength); - return {0, score, coverage, hammingDist}; + return {0, score, coverage, hammingDist, 0}; } TaxonScore Taxonomer::scoreSpecies(const vector &matches, @@ -1442,9 +1671,11 @@ TaxonScore Taxonomer::scoreSpecies(const 
vector &matches, float score = ((float) (coveredLength_read1 + coveredLength_read2) - hammingSum) / (float) (queryLength + queryLength2); float coverage = (float) (coveredLength_read1 + coveredLength_read2) / (float) (queryLength + queryLength2); - return {0, score, coverage, hammingDist}; + return {0, score, coverage, hammingDist, 0}; } bool Taxonomer::isConsecutive(const Match * match1, const Match * match2) { + // match1 87654321 -> 08765432 + // match2 98765432 -> 08765432 return (match1->rightEndHamming >> 2) == (match2->rightEndHamming & 0x3FFF); } \ No newline at end of file diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h index a756db73..95365b9b 100644 --- a/src/commons/Taxonomer.h +++ b/src/commons/Taxonomer.h @@ -14,11 +14,31 @@ struct TaxonScore { float score; float coverage; int hammingDist; - TaxonScore(TaxID taxId, float score, float coverage, int hammingDist) : - taxId(taxId), score(score), coverage(coverage), hammingDist(hammingDist) {} - TaxonScore() : taxId(0), score(0.0f), coverage(0.0f), hammingDist(0) {} + bool LCA; + TaxonScore(TaxID taxId, float score, float coverage, int hammingDist, bool LCA) : + taxId(taxId), score(score), coverage(coverage), hammingDist(hammingDist), LCA(LCA) {} + TaxonScore() : taxId(0), score(0.0f), coverage(0.0f), hammingDist(0), LCA(false) {} }; +struct depthScore { + depthScore(size_t depth, float score, int hammingDist) : depth(depth), score(score), hammingDist(hammingDist) {} + depthScore() : depth(0), score(0.f), hammingDist(0) {} + size_t depth; + float score; + int hammingDist; +}; + +struct MatchPath { + MatchPath(size_t start, size_t end, float score, int hammingDist) : start(start), end(end), score(score), hammingDist(hammingDist) {} + MatchPath() : start(0), end(0), score(0.f), hammingDist(0) {} + size_t start; + size_t end; + float score; + int hammingDist; + vector matches; +}; + + class Taxonomer { private: NcbiTaxonomy * taxonomy; @@ -73,13 +93,28 @@ class Taxonomer { vector & queryList, const LocalParameters &par); - void remainConsecutiveMatches(vector & curFrameMatches, - vector & filteredMatches, + void remainConsecutiveMatches(const vector & curFrameMatches, + vector & matchPaths, TaxID genusId); - - size_t DFS(size_t curMatchIdx, const map>& linkedMatches, - vector& fiteredMatchIdx, size_t depth, size_t MIN_DEPTH, unordered_set& used, - unordered_map & idx2depth); + + float combineMatchPaths(vector & matchPaths, + vector & combinedMatchPaths, + int readLength); + + bool isMatchPathNotOverlapped(const MatchPath & matchPath1, + const MatchPath & matchPath2); + + depthScore DFS(const vector &matches, size_t curMatchIdx, + const map> &linkedMatches, + size_t depth, size_t MIN_DEPTH, unordered_set &used, + unordered_map &idx2depthScore, + unordered_map & edges, float score, int hammingDist); + // depthScore DFS(const vector & curFrameMatches, + // size_t curMatchIdx, + // const map>& linkedMatches, + // size_t depth, size_t MIN_DEPTH, unordered_set& used, + // unordered_map & idx2depth, + // size_t startPos, vector & matchPaths); static bool isConsecutive(const Match * match1, const Match * match2); From 645a212c1bcfca1ebc3e99f1b91beb792ffb51b7 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Tue, 7 Nov 2023 22:59:38 +0900 Subject: [PATCH 56/65] don't extract k-mers span different orfs --- src/commons/SeqIterator.cpp | 22 +++++++++++----------- src/commons/Taxonomer.cpp | 16 ++++++++-------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/commons/SeqIterator.cpp b/src/commons/SeqIterator.cpp 
index dc49b367..46cf14a8 100644 --- a/src/commons/SeqIterator.cpp +++ b/src/commons/SeqIterator.cpp @@ -539,11 +539,11 @@ void SeqIterator::getExtendedORFs(struct _gene *genes, struct _node *nodes, vect frame = (genes[0].begin - 1) % 3; leftEnd = 0; while (leftEnd % 3 != frame) leftEnd++; - blocks.emplace_back(leftEnd, genes[1].begin - 1 + 22, 1); + blocks.emplace_back(leftEnd, genes[1].begin - 2, 1); blockIdx++; } else { frame = (genes[0].end - 1) % 3; - rightEnd = genes[1].begin - 1 + 22; + rightEnd = genes[1].begin - 2; while (rightEnd % 3 != frame) rightEnd--; blocks.emplace_back(0, rightEnd, -1); blockIdx++; @@ -583,12 +583,12 @@ void SeqIterator::getExtendedORFs(struct _gene *genes, struct _node *nodes, vect } else { if (!isReverse) { //forward frame = (genes[geneIdx].begin - 1) % 3; - leftEnd = genes[geneIdx - 1].end - 1 - 22; + leftEnd = genes[geneIdx - 1].end; while (leftEnd % 3 != frame) leftEnd++; blocks.emplace_back(leftEnd, genes[geneIdx].end - 1, 1); blockIdx++; } else { // reverse - blocks.emplace_back(genes[geneIdx - 1].end - 22 - 1, genes[geneIdx].end - 1, -1); + blocks.emplace_back(genes[geneIdx - 1].end, genes[geneIdx].end - 1, -1); blockIdx++; } } @@ -597,24 +597,24 @@ void SeqIterator::getExtendedORFs(struct _gene *genes, struct _node *nodes, vect if (hasBeenExtendedToLeft) { if (!isReverse) { //forward frame = (genes[geneIdx].begin - 1) % 3; - leftEnd = genes[geneIdx - 1].end - 1 - 22; + leftEnd = genes[geneIdx - 1].end; while (leftEnd % 3 != frame) leftEnd++; - blocks.emplace_back(leftEnd, genes[geneIdx + 1].begin - 1 + 22, 1); + blocks.emplace_back(leftEnd, genes[geneIdx + 1].begin - 2, 1); blockIdx++; } else { frame = (genes[geneIdx].end - 1) % 3; - rightEnd = genes[geneIdx + 1].begin - 1 + 22; + rightEnd = genes[geneIdx + 1].begin - 2; while (rightEnd % 3 != frame) rightEnd--; - blocks.emplace_back(genes[geneIdx - 1].end - 1 - 22, rightEnd, -1); + blocks.emplace_back(genes[geneIdx - 1].end, rightEnd, -1); blockIdx++; } } else { if (!isReverse) { //forward - blocks.emplace_back(genes[geneIdx].begin - 1, genes[geneIdx + 1].begin - 1 + 22, 1); + blocks.emplace_back(genes[geneIdx].begin - 1, genes[geneIdx + 1].begin - 2, 1); blockIdx++; } else { frame = (genes[geneIdx].end - 1) % 3; - rightEnd = genes[geneIdx + 1].begin - 1 + 22; + rightEnd = genes[geneIdx + 1].begin - 2; while (rightEnd % 3 != frame) rightEnd--; blocks.emplace_back(genes[geneIdx].begin - 1, rightEnd, -1); blockIdx++; @@ -639,7 +639,7 @@ void SeqIterator::getExtendedORFs(struct _gene *genes, struct _node *nodes, vect // If left region is not covered, cover it. 
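+        // e.g. (illustrative) if the previous gene ends at base 400 and the frame is 1,
+        // the old boundary 400 - 1 - 22 = 377 reached back into that gene, whereas the
+        // new boundary starts at 400 and is only advanced until leftEnd % 3 == frame,
+        // so extracted k-mers no longer span two ORFs.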
leftEnd = genes[numOfGene - 1].begin - 1; if (hasBeenExtendedToLeft) { - leftEnd = genes[numOfGene - 2].end - 1 - 22; + leftEnd = genes[numOfGene - 2].end; if (!isReverse) { frame = (genes[numOfGene - 1].begin - 1) % 3; while (leftEnd % 3 != frame) leftEnd++; diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index a3a3b3b5..6c5fb707 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -168,7 +168,7 @@ void Taxonomer::chooseBestTaxon2(uint32_t currentQuery, // sort(speciesMatches.begin(), speciesMatches.end(), // [](const Match & a, const Match & b) { return a.qInfo.pos < b.qInfo.pos; }); - cout << "7 " << currentQuery << endl; + // cout << "7 " << currentQuery << endl; TaxID result = lowerRankClassification(speciesMatches, speciesScore.taxId); @@ -184,7 +184,7 @@ void Taxonomer::chooseBestTaxon2(uint32_t currentQuery, queryList[currentQuery].coverage = speciesScore.coverage; queryList[currentQuery].hammingDist = speciesScore.hammingDist; queryList[currentQuery].newSpecies = false; - cout << "8" << currentQuery << endl; + // cout << "8" << currentQuery << endl; // if (par.printLog) { // cout << "# " << currentQuery << endl; // for (size_t i = 0; i < genusMatches.size(); i++) { @@ -592,9 +592,9 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, // Initialize species2matchPaths species2matchPaths[currentSpecies].emplace_back(0, 0, 0, 0); // cout << "2" << endl; - cout << currentSpecies << endl; + // cout << currentSpecies << endl; float score = combineMatchPaths(matchPaths, species2matchPaths[currentSpecies], readLength1 + readLength2); - cout << endl; + // cout << endl; species2score[currentSpecies] = score; if (score > bestSpScore) { bestSpScore = score; @@ -611,7 +611,7 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, // cout << "4" << endl; vector maxSpecies; for (auto & spScore : species2score) { - cout << spScore.first << " " << spScore.second << endl; + // cout << spScore.first << " " << spScore.second << endl; if (spScore.second == bestSpScore) { maxSpecies.push_back(spScore.first); } @@ -664,10 +664,10 @@ float Taxonomer::combineMatchPaths(vector & matchPaths, // 2. Add the matchPath with the highest score that is not overlapped with the matchPath in combinedMatchPaths // 3. 
Repeat 2 until no matchPath can be added for (size_t i = 0; i < matchPaths.size(); i++) { - cout << matchPaths[i].start << " " << matchPaths[i].end << " " << matchPaths[i].score << " " << matchPaths[i].matches.back()->targetId << " " << matchPaths[i].matches.back()->qInfo.frame <targetId << " " << matchPaths[i].matches.back()->qInfo.frame <printMatch(); @@ -685,7 +685,7 @@ float Taxonomer::combineMatchPaths(vector & matchPaths, } if (!isOverlapped) { combinedMatchPaths.push_back(matchPaths[i]); - combinedMatchPaths.back().matches = matchPaths[i].matches; + // combinedMatchPaths.back().matches = matchPaths[i].matches; // cout << matchPaths[i].start << " " << matchPaths[i].end << " " << matchPaths[i].score << endl; // for (auto & match : matchPaths[i].matches) { // match->printMatch(); From ce5b4324ac0cb26cc98540508cf97375f20353eb Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Wed, 8 Nov 2023 23:40:53 +0900 Subject: [PATCH 57/65] merge two linked match paths --- src/commons/Taxonomer.cpp | 87 +++++++++++++++++++++++++------ src/commons/Taxonomer.h | 11 ++-- src/util/mapping2taxon.cpp | 103 +++++++++++++++++++++++++++++++++++++ 3 files changed, 182 insertions(+), 19 deletions(-) create mode 100644 src/util/mapping2taxon.cpp diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index 6c5fb707..e3c49b2b 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -1,4 +1,5 @@ #include "Taxonomer.h" +#include "BitManipulateMacros.h" #include "Match.h" #include "NcbiTaxonomy.h" #include @@ -593,7 +594,7 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, species2matchPaths[currentSpecies].emplace_back(0, 0, 0, 0); // cout << "2" << endl; // cout << currentSpecies << endl; - float score = combineMatchPaths(matchPaths, species2matchPaths[currentSpecies], readLength1 + readLength2); + float score = combineMatchPaths(matchPaths, species2matchPaths[currentSpecies], readLength1 + readLength2, matchList); // cout << endl; species2score[currentSpecies] = score; if (score > bestSpScore) { @@ -653,7 +654,7 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, float Taxonomer::combineMatchPaths(vector & matchPaths, vector & combinedMatchPaths, - int readLength) { + int readLength, const Match * matchList) { combinedMatchPaths.clear(); // Sort matchPaths by the their score sort(matchPaths.begin(), matchPaths.end(), @@ -675,13 +676,16 @@ float Taxonomer::combineMatchPaths(vector & matchPaths, } else { bool isOverlapped = false; for (size_t j = 0; j < combinedMatchPaths.size(); j++) { - if (!isMatchPathNotOverlapped(matchPaths[i], combinedMatchPaths[j])) { + if (isMatchPathOverlapped(matchPaths[i], combinedMatchPaths[j])) { // overlap! 
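+                // e.g. (toy coordinates) paths [0, 104] and [84, 188] overlap by
+                // min(104, 188) - max(0, 84) + 1 = 21 bases, i.e. the last metamer of
+                // the earlier path and the first of the later one are a single codon
+                // apart; isMatchPathLinked() below then decides whether to merge them.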
+ if (isMatchPathLinked(matchPaths[i], combinedMatchPaths[j])) { + // merge two linked matchPaths by editing the combinedMatchPaths[j] + mergeMatchPaths(matchPaths[i], combinedMatchPaths[j]); + break; + } else { + break; + } isOverlapped = true; - break; - } else { - // cout << matchPaths[i].start << " " << matchPaths[i].end << endl; - // cout << combinedMatchPaths[j].start << " " << combinedMatchPaths[j].end << endl << endl;; - } + } } if (!isOverlapped) { combinedMatchPaths.push_back(matchPaths[i]); @@ -693,10 +697,6 @@ float Taxonomer::combineMatchPaths(vector & matchPaths, } } } - // cout << endl; - - - // Calculate the score of combinedMatchPaths float score = 0; for (auto & matchPath : combinedMatchPaths) { @@ -705,12 +705,59 @@ float Taxonomer::combineMatchPaths(vector & matchPaths, return score / readLength; } -bool Taxonomer::isMatchPathNotOverlapped(const MatchPath & matchPath1, - const MatchPath & matchPath2) { - return (matchPath1.end < matchPath2.start) || (matchPath2.end < matchPath1.start); +bool Taxonomer::isMatchPathLinked(const MatchPath & matchPath1, const MatchPath & matchPath2) { + int overlappedLength = min(matchPath1.end, matchPath2.end) - max(matchPath1.start, matchPath2.start) + 1; + if (!(20 < overlappedLength && overlappedLength < 24)) { + return false; + } + const Match * last; + const Match * first; + if (matchPath1.start < matchPath2.start) { + last = matchPath1.matches.back(); + first = matchPath2.matches.front(); + } else { + last = matchPath2.matches.back(); + first = matchPath1.matches.front(); + } + if (overlappedLength == 21) { + return isConsecutive(last, first); + } else { + return isConsecutive_diffFrame(last, first); + } + return false; +} + +bool Taxonomer::isMatchPathOverlapped(const MatchPath & matchPath1, + const MatchPath & matchPath2) { + return !((matchPath1.end < matchPath2.start) || (matchPath2.end < matchPath1.start)); +} +// 87654321 +void Taxonomer::mergeMatchPaths(const MatchPath & source, MatchPath & target) { + if (source.start < target.start) { + target.start = source.start; + uint8_t lastEndHamming = GET_2_BITS(target.matches.front()->rightEndHamming); + target.hammingDist += source.hammingDist - (source.matches.back()->hamming - lastEndHamming); + target.score += source.score - source.matches.back()->getScore(); + if (lastEndHamming == 0) { + target.score += 3.0f; + } else { + target.score += 2.0f - 0.5f * lastEndHamming; + } + target.matches.insert(target.matches.begin(), source.matches.begin(), source.matches.end() - 1); + } else { + target.end = source.end; + uint8_t lastEndHamming = GET_2_BITS(source.matches.front()->rightEndHamming >> 14); + target.hammingDist += source.hammingDist - (source.matches.front()->hamming - lastEndHamming); + target.score += source.score - source.matches.front()->getScore(); + if (lastEndHamming == 0) { + target.score += 3.0f; + } else { + target.score += 2.0f - 0.5f * lastEndHamming; + } + target.matches.insert(target.matches.end(), source.matches.begin() + 1, source.matches.end()); + } } -\ // if (matchPath1.start > matchPath2.start) { // return isMatchPathOverlapped(matchPath2, matchPath1, readLength); @@ -1678,4 +1725,12 @@ bool Taxonomer::isConsecutive(const Match * match1, const Match * match2) { // match1 87654321 -> 08765432 // match2 98765432 -> 08765432 return (match1->rightEndHamming >> 2) == (match2->rightEndHamming & 0x3FFF); +} + +bool Taxonomer::isConsecutive_diffFrame(const Match * match1, const Match * match2) { + // int hamming1 = match1->hamming - 
GET_2_BITS(match1->rightEndHamming);
+    // int hamming2 = match2->hamming - GET_2_BITS(match2->rightEndHamming >> 14);
+    // match1 87654321 -> 08765432
+    // match2 98765432 -> 08765432
+    return (match1->hamming - GET_2_BITS(match1->rightEndHamming)) == (match2->hamming - GET_2_BITS(match2->rightEndHamming >> 14));
 }
\ No newline at end of file
diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h
index 95365b9b..722ced8b 100644
--- a/src/commons/Taxonomer.h
+++ b/src/commons/Taxonomer.h
@@ -99,10 +99,13 @@ class Taxonomer {
 
     float combineMatchPaths(vector<MatchPath> & matchPaths,
                             vector<MatchPath> & combinedMatchPaths,
-                            int readLength);
+                            int readLength, const Match * matchList);
 
-    bool isMatchPathNotOverlapped(const MatchPath & matchPath1,
-                                  const MatchPath & matchPath2);
+    bool isMatchPathOverlapped(const MatchPath & matchPath1, const MatchPath & matchPath2);
+
+    bool isMatchPathLinked(const MatchPath & matchPath1, const MatchPath & matchPath2);
+
+    void mergeMatchPaths(const MatchPath & source, MatchPath & target);
 
     depthScore DFS(const vector<const Match *> &matches, size_t curMatchIdx,
                    const map<size_t, vector<size_t>> &linkedMatches,
                    size_t depth, size_t MIN_DEPTH, unordered_set<size_t> &used,
                    unordered_map<size_t, depthScore> &idx2depthScore,
                    unordered_map<const Match *, const Match *> & edges, float score, int hammingDist);
@@ -118,6 +121,8 @@ class Taxonomer {
 
     static bool isConsecutive(const Match * match1, const Match * match2);
 
+    static bool isConsecutive_diffFrame(const Match * match1, const Match * match2);
+
     TaxonScore getBestGenusMatches(vector<Match> &matchesForMajorityLCA, const Match *matchList, size_t end,
                                    size_t offset, int queryLength);
 
diff --git a/src/util/mapping2taxon.cpp b/src/util/mapping2taxon.cpp
new file mode 100644
index 00000000..e27928c9
--- /dev/null
+++ b/src/util/mapping2taxon.cpp
@@ -0,0 +1,103 @@
+#include <iostream>
+#include <string>
+#include <vector>
+#include <unordered_map>
+#include "Command.h"
+#include "LocalParameters.h"
+#include "NcbiTaxonomy.h"
+#include "common.h"
+#include "fstream"
+#include <sstream>
+#include <algorithm>
+
+using namespace std;
+
+struct read2taxon {
+    string read;
+    TaxID taxon;
+};
+
+int parseTaxId_metamaps(const string & mappingRes) {
+    vector<string> tokens = Util::split(mappingRes, " ");
+    return stoi(Util::split(tokens[5], "|")[2]);
+}
+
+// It takes a mapping result of Metamaps.
+// The mapping result includes multiple mappings for a read, which have mapping scores.
+// The function returns the taxon ID of the best mapping.
+// If there are multiple mappings with the same best score, it returns the LCA of them.
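+// For a hypothetical input line (whitespace-separated; per the parsing above, the 6th
+// field carries the taxon as 'name|x|taxid' and the 14th the mapping score), e.g.:
+//   read1 0 1500 0 + ref|x|1280 2800000 100 900 1400 60 255 40 0.97
+// parseTaxId_metamaps() returns 1280, and the loop below keeps, per read, the taxon
+// of the highest-scoring mapping, LCA-merging equal-scoring candidates.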
+int mapping2taxon(int argc, const char **argv, const Command &command) { + LocalParameters &par = LocalParameters::getLocalInstance(); + par.parseParameters(argc, argv, command, false, Parameters::PARSE_ALLOW_EMPTY, 0); + string mappingFile = par.filenames[0]; + string taxonomyDir = par.filenames[1]; + string output = mappingFile + ".reads2taxon"; + ofstream out(output); + + vector read2taxon; + NcbiTaxonomy *taxonomy = loadTaxonomy("", taxonomyDir); + cout << "Taxonomy loaded" << endl; + + // Iterate through mapping file + ifstream mapping(mappingFile); + string line; + vector taxIds; + string previousRead = ""; + double bestScore = -2; + TaxID bestTaxId = -1; + bool lastStored = false; + + while (getline(mapping, line)) { + vector tokens = Util::split(line, " "); + string currentRead = tokens[0]; + if (currentRead == previousRead) { // Same read + // Get score + stringstream scoreString(tokens[13]); + double curScore = 0; + scoreString >> curScore; + + if (curScore > bestScore) { + taxIds.clear(); + bestScore = curScore; + bestTaxId = parseTaxId_metamaps(line); + taxIds.push_back(bestTaxId); + } else if (curScore == bestScore) { + taxIds.push_back(parseTaxId_metamaps(line)); + bestTaxId = taxonomy->LCA(taxIds)->taxId; + } + lastStored = false; + } else { // New read + // Store results for previous read + // out << previousRead << "\t" << bestTaxId << endl; + read2taxon.push_back({previousRead, bestTaxId}); + lastStored = true; + + // Initialize variables + previousRead = currentRead; + taxIds.clear(); + + // Get score + stringstream scoreString(tokens[13]); + double curScore = 0; + scoreString >> curScore; + + // Update variables + bestScore = curScore; + bestTaxId = parseTaxId_metamaps(line); + taxIds.push_back(bestTaxId); + } + } + + if (!lastStored) { + // out << previousRead << "\t" << bestTaxId << endl; + read2taxon.push_back({previousRead, bestTaxId}); + } + + // Write to file + cout << "Writing to file" << endl; + for (size_t i = 1; i < read2taxon.size(); i++) { + out << read2taxon[i].read << "\t" << read2taxon[i].taxon << "\n"; + } + + return 0; +} \ No newline at end of file From 61c9fff1d75a769818e2e82eb38d0d2ae3fe7fb0 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Thu, 9 Nov 2023 15:36:02 +0900 Subject: [PATCH 58/65] 1. Trimming overlapping match paths 2. Gather all the matches of selected species for lower rank classification 3. 
hamming <= min(minHamming, 6) --- src/commons/KmerMatcher.cpp | 8 +- src/commons/Match.h | 9 +- src/commons/Taxonomer.cpp | 887 ++++++------------------------------ src/commons/Taxonomer.h | 11 +- 4 files changed, 135 insertions(+), 780 deletions(-) diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp index 8a13ceb1..084b919b 100644 --- a/src/commons/KmerMatcher.cpp +++ b/src/commons/KmerMatcher.cpp @@ -292,7 +292,6 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI // } matches[matchCnt] = {queryKmerList[j].info, candidateKmerInfos[idx].sequenceID, - taxId2genusId[candidateKmerInfos[idx].sequenceID], taxId2speciesId[candidateKmerInfos[idx].sequenceID], selectedHammings[k], selectedHammingSum[k], @@ -334,7 +333,6 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI // } matches[matchCnt] = {queryKmerList[j].info, candidateKmerInfos[idx].sequenceID, - taxId2genusId[candidateKmerInfos[idx].sequenceID], taxId2speciesId[candidateKmerInfos[idx].sequenceID], selectedHammings[k], selectedHammingSum[k], @@ -430,7 +428,6 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI // } matches[matchCnt] = {queryKmerList[j].info, candidateKmerInfos[idx].sequenceID, - taxId2genusId[candidateKmerInfos[idx].sequenceID], taxId2speciesId[candidateKmerInfos[idx].sequenceID], selectedHammings[k], selectedHammingSum[k], @@ -516,7 +513,7 @@ void KmerMatcher::compareDna(uint64_t query, // Select target k-mers that passed hamming criteria for (size_t h = 0; h < size; h++) { - if (hammingSums[h] <= 6) {// minHammingSum + hammingMargin) { + if (hammingSums[h] <= min(minHammingSum * 2, 6)) { selectedMatches.push_back(h); selectedHammingSum.push_back(hammingSums[h]); if (frame < 3) { @@ -534,9 +531,6 @@ bool KmerMatcher::compareMatches(const Match& a, const Match& b) { if (a.qInfo.sequenceID != b.qInfo.sequenceID) return a.qInfo.sequenceID < b.qInfo.sequenceID; - if (a.genusId != b.genusId) - return a.genusId < b.genusId; - if (a.speciesId != b.speciesId) return a.speciesId < b.speciesId; diff --git a/src/commons/Match.h b/src/commons/Match.h index f47881ef..9bc1fb44 100644 --- a/src/commons/Match.h +++ b/src/commons/Match.h @@ -6,21 +6,19 @@ #include #include "BitManipulateMacros.h" -struct Match { // 24 byte +struct Match { // 20 byte Match(){} Match(QueryKmerInfo qInfo, int targetId, - TaxID genusId, TaxID speciesId, uint16_t eachHamming, uint8_t hamming, bool redundancy): - qInfo(qInfo), targetId(targetId), genusId(genusId), speciesId(speciesId), + qInfo(qInfo), targetId(targetId), speciesId(speciesId), rightEndHamming(eachHamming), hamming(hamming), redundancy(redundancy) { } QueryKmerInfo qInfo; // 8 TaxID targetId; // 4 taxonomy id infact - TaxID genusId; // 4 TaxID speciesId; // 4 uint16_t rightEndHamming; // 2 uint8_t hamming; // 1 @@ -28,10 +26,9 @@ struct Match { // 24 byte void printMatch() const { std::cout << qInfo.sequenceID << " " << qInfo.pos << " " << qInfo.frame << " " - << targetId << " " << genusId << " " << speciesId << " " << rightEndHamming << " " << (int)hamming << " " << getScore() << std::endl; + << targetId << " " << speciesId << " " << rightEndHamming << " " << (int)hamming << " " << getScore() << std::endl; } - float getScore(float score = 0.0f, int cnt = 0) const { int currentHamming = GET_2_BITS(rightEndHamming >> cnt * 2); if (currentHamming == 0) { diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index e3c49b2b..c0760d8c 100644 --- a/src/commons/Taxonomer.cpp 
+++ b/src/commons/Taxonomer.cpp @@ -2,6 +2,7 @@ #include "BitManipulateMacros.h" #include "Match.h" #include "NcbiTaxonomy.h" +#include "printBinary.h" #include #include #include @@ -59,7 +60,7 @@ void Taxonomer::assignTaxonomy(const Match *matchList, { #pragma omp for schedule(dynamic, 1) for (size_t i = 0; i < blockIdx; ++i) { - chooseBestTaxon2(matchBlocks[i].id, + chooseBestTaxon(matchBlocks[i].id, matchBlocks[i].start, matchBlocks[i].end, matchList, @@ -76,12 +77,12 @@ void Taxonomer::assignTaxonomy(const Match *matchList, } -void Taxonomer::chooseBestTaxon2(uint32_t currentQuery, - size_t offset, - size_t end, - const Match *matchList, - vector & queryList, - const LocalParameters &par) { +void Taxonomer::chooseBestTaxon(uint32_t currentQuery, + size_t offset, + size_t end, + const Match *matchList, + vector & queryList, + const LocalParameters &par) { // if (true) { // cout << "# " << currentQuery << " " << queryList[currentQuery].name << endl; @@ -89,7 +90,6 @@ void Taxonomer::chooseBestTaxon2(uint32_t currentQuery, // cout << matchList[i].targetId << " " << matchList[i].qInfo.frame << " " << matchList[i].qInfo.pos << " " << int(matchList[i].hamming) << " " << int(matchList[i].redundancy) << endl; // } // } - // Get the best species for current query vector speciesMatches; speciesMatches.reserve(end - offset + 1); @@ -124,21 +124,11 @@ void Taxonomer::chooseBestTaxon2(uint32_t currentQuery, // If there are two or more good species level candidates, find the LCA. if (speciesScore.LCA) { - // cout << "LCA" << endl; - // vector genusList; - // genusList.reserve(speciesMatches.size()); - // for (auto & genusMatch : speciesMatches) { - // genusList.push_back(genusMatch.genusId); - // } - // selectedTaxon = taxonomy->LCA(genusList)->taxId; queryList[currentQuery].isClassified = true; queryList[currentQuery].classification = speciesScore.taxId; queryList[currentQuery].score = speciesScore.score; queryList[currentQuery].coverage = speciesScore.coverage; queryList[currentQuery].hammingDist = speciesScore.hammingDist; - // for (auto & spMatch : speciesMatches) { - // queryList[currentQuery].taxCnt[spMatch.targetId]++; - // } return; } @@ -156,20 +146,9 @@ void Taxonomer::chooseBestTaxon2(uint32_t currentQuery, return; } - // Sort matches by the position of the query sequence -// sort(genusMatches.begin() + speciesMatchRange[selectedSpecies].first, -// genusMatches.begin() + speciesMatchRange[selectedSpecies].second, -// [](const Match & a, const Match & b) { -// if (a.qInfo.position / 3 == b.qInfo.position / 3) -// return a.hamming < b.hamming; -// else -// return a.qInfo.position / 3 < b.qInfo.position / 3; -// }); - - // sort(speciesMatches.begin(), speciesMatches.end(), - // [](const Match & a, const Match & b) { return a.qInfo.pos < b.qInfo.pos; }); - - // cout << "7 " << currentQuery << endl; + // Sort matches by the coordinate of the query + sort(speciesMatches.begin(), speciesMatches.end(), + [](const Match & a, const Match & b) { return a.qInfo.pos < b.qInfo.pos; }); TaxID result = lowerRankClassification(speciesMatches, speciesScore.taxId); @@ -185,7 +164,6 @@ void Taxonomer::chooseBestTaxon2(uint32_t currentQuery, queryList[currentQuery].coverage = speciesScore.coverage; queryList[currentQuery].hammingDist = speciesScore.hammingDist; queryList[currentQuery].newSpecies = false; - // cout << "8" << currentQuery << endl; // if (par.printLog) { // cout << "# " << currentQuery << endl; // for (size_t i = 0; i < genusMatches.size(); i++) { @@ -199,227 +177,36 @@ void 
Taxonomer::chooseBestTaxon2(uint32_t currentQuery, // } } -// void Taxonomer::chooseBestTaxon(uint32_t currentQuery, -// size_t offset, -// size_t end, -// const Match *matchList, -// vector & queryList, -// const LocalParameters &par) { -// TaxID selectedTaxon; - -// // if (true) { -// // cout << "# " << currentQuery << " " << queryList[currentQuery].name << endl; -// // for (size_t i = offset; i < end + 1; i++) { -// // cout << matchList[i].targetId << " " << matchList[i].qInfo.frame << " " << matchList[i].qInfo.pos << " " << int(matchList[i].hamming) << " " << int(matchList[i].redundancy) << endl; -// // } -// // } - -// // Get the best genus for current query -// vector genusMatches; -// genusMatches.reserve(end - offset + 1); -// TaxonScore genusScore(0, 0, 0, 0); -// if (par.seqMode == 2) { -// genusScore = getBestGenusMatches(genusMatches, matchList, end, offset, -// queryList[currentQuery].queryLength, -// queryList[currentQuery].queryLength2); -// } else { -// genusScore = getBestGenusMatches(genusMatches, matchList, end, offset, -// queryList[currentQuery].queryLength); -// } - -// // if (true) { -// // cout << "# " << currentQuery << " " << queryList[currentQuery].name << " filtered\n"; -// // for (size_t i = 0; i < genusMatches.size(); i++) { -// // cout << genusMatches[i].targetId << " " << genusMatches[i].qInfo.frame << " " << genusMatches[i].qInfo.pos << " " << int(genusMatches[i].hamming) << " " << int(genusMatches[i].redundancy) << endl; -// // } -// // cout << "Genus score: " << genusScore.score << "\n"; -// // } - -// // If there is no proper genus for current query, it is un-classified. -// if (genusScore.score == 0 || genusScore.coverage < par.minCoverage || genusScore.score < par.minScore) { -// queryList[currentQuery].isClassified = false; -// queryList[currentQuery].classification = 0; -// queryList[currentQuery].score = genusScore.score; -// queryList[currentQuery].coverage = genusScore.coverage; -// queryList[currentQuery].hammingDist = genusScore.hammingDist; -// queryList[currentQuery].newSpecies = false; -// return; -// } - -// // If there are two or more good genus level candidates, find the LCA. -// if (genusScore.taxId == 0) { -// vector genusList; -// genusList.reserve(genusMatches.size()); -// for (auto & genusMatch : genusMatches) { -// genusList.push_back(genusMatch.genusId); -// } -// selectedTaxon = taxonomy->LCA(genusList)->taxId; -// queryList[currentQuery].isClassified = true; -// queryList[currentQuery].classification = selectedTaxon; -// queryList[currentQuery].score = genusScore.score; -// queryList[currentQuery].coverage = genusScore.coverage; -// queryList[currentQuery].hammingDist = genusScore.hammingDist; -// for (auto & genusMatch : genusMatches) { -// queryList[currentQuery].taxCnt[genusMatch.targetId]++; -// } -// return; -// } - -// // Choose the species with the highest coverage. 
-// TaxID selectedSpecies; -// TaxonScore speciesScore; -// vector species; -// unordered_map> speciesMatchRange; -// if (par.seqMode == 2) { -// speciesScore = chooseSpecies(genusMatches, -// queryList[currentQuery].queryLength, -// queryList[currentQuery].queryLength2, -// species, -// speciesMatchRange); -// } else { -// speciesScore = chooseSpecies(genusMatches, -// queryList[currentQuery].queryLength, -// species, -// speciesMatchRange); -// } - - -// // Classify to LCA if more than one species are selected -// if (species.size() > 1) { -// queryList[currentQuery].isClassified = true; -// queryList[currentQuery].classification = taxonomy->LCA(species)->taxId; -// queryList[currentQuery].score = genusScore.score; -// queryList[currentQuery].coverage = genusScore.coverage; -// queryList[currentQuery].hammingDist = genusScore.hammingDist; -// for (auto & genusMatch : genusMatches) { -// queryList[currentQuery].taxCnt[genusMatch.targetId]++; -// } -// return; -// } - -// // If score is not enough, classify to the parent of the selected species -// if (speciesScore.score < par.minSpScore) { -// queryList[currentQuery].isClassified = true; -// queryList[currentQuery].classification = taxonomy->taxonNode( -// taxonomy->getTaxIdAtRank(species[0], "species"))->parentTaxId; -// queryList[currentQuery].score = genusScore.score; -// queryList[currentQuery].coverage = genusScore.coverage; -// queryList[currentQuery].hammingDist = genusScore.hammingDist; -// for (auto & genusMatch : genusMatches) { -// if(genusMatch.speciesId == species[0]){ -// queryList[currentQuery].taxCnt[genusMatch.targetId]++; -// } -// } -// return; -// } - -// // Sort matches by the position of the query sequence -// selectedSpecies = species[0]; -// // sort(genusMatches.begin() + speciesMatchRange[selectedSpecies].first, -// // genusMatches.begin() + speciesMatchRange[selectedSpecies].second, -// // [](const Match & a, const Match & b) { -// // if (a.qInfo.position / 3 == b.qInfo.position / 3) -// // return a.hamming < b.hamming; -// // else -// // return a.qInfo.position / 3 < b.qInfo.position / 3; -// // }); - -// vector::const_iterator first = genusMatches.begin() + speciesMatchRange[selectedSpecies].first; -// vector::const_iterator last = genusMatches.begin() + speciesMatchRange[selectedSpecies].second; -// vector speciesMatches(first, last); - - -// sort(speciesMatches.begin(), speciesMatches.end(), -// [](const Match & a, const Match & b) { return a.qInfo.pos < b.qInfo.pos; }); - -// TaxID result = lowerRankClassification(speciesMatches, selectedSpecies); - -// // Record matches of selected species -// for (size_t i = speciesMatchRange[selectedSpecies].first; i < speciesMatchRange[selectedSpecies].second; i++) { -// queryList[currentQuery].taxCnt[genusMatches[i].targetId]++; -// } - -// // Store classification results -// queryList[currentQuery].isClassified = true; -// queryList[currentQuery].classification = result; -// queryList[currentQuery].score = speciesScore.score; -// queryList[currentQuery].coverage = speciesScore.coverage; -// queryList[currentQuery].hammingDist = speciesScore.hammingDist; -// queryList[currentQuery].newSpecies = false; -// // if (par.printLog) { -// // cout << "# " << currentQuery << endl; -// // for (size_t i = 0; i < genusMatches.size(); i++) { -// // cout << i << " " << genusMatches[i].qInfo.pos << " " << -// // genusMatches[i].targetId << " " << int(genusMatches[i].hamming) << endl; -// // } -// // cout << "Score: " << speciesScore.score << " " << selectedSpecies << " " -// // << 
taxonomy->getString(taxonomy->taxonNode(selectedSpecies)->rankIdx) -// // -// // << endl; -// // } -// } - TaxID Taxonomer::lowerRankClassification(vector &matches, TaxID spTaxId) { unordered_map taxCnt; size_t matchNum = matches.size(); - // cout << spTaxId << endl; - for (size_t i = 0; i < matchNum; i++) { // cout << matches[i].targetId << endl; - taxCnt[matches[i].targetId] ++; - // size_t currQuotient = matches[i].qInfo.pos / 3; - // uint8_t minHamming = 0; //matches[i].hamming; - // Match * minHammingMatch = & matches[i]; - // TaxID minHammingTaxId = minHammingMatch->targetId; - // while ((i < matchNum) && (currQuotient == matches[i].qInfo.pos / 3)) { - // if (matches[i].hamming < minHamming) { - // minHamming = matches[i].hamming; - // minHammingMatch = & matches[i]; - // minHammingTaxId = minHammingMatch->targetId; - // } else if (matches[i].hamming == minHamming) { - // minHammingTaxId = taxonomy->LCA(minHammingTaxId, matches[i].targetId); - // minHammingMatch->redundancy = true; - // matches[i].redundancy = true; - // } - // i++; - // } - // taxCnt[minHammingTaxId]++; + // taxCnt[matches[i].targetId] ++; + size_t currQuotient = matches[i].qInfo.pos / 3; + uint8_t minHamming = matches[i].hamming; + Match * minHammingMatch = & matches[i]; + TaxID minHammingTaxId = minHammingMatch->targetId; + while ((i < matchNum) && (currQuotient == matches[i].qInfo.pos / 3)) { + if (matches[i].hamming < minHamming) { + minHamming = matches[i].hamming; + minHammingMatch = & matches[i]; + minHammingTaxId = minHammingMatch->targetId; + } else if (matches[i].hamming == minHamming) { + minHammingTaxId = taxonomy->LCA(minHammingTaxId, matches[i].targetId); + minHammingMatch->redundancy = true; + matches[i].redundancy = true; + } + i++; + } + taxCnt[minHammingTaxId]++; } - // int i = matchRange.second - 1; - // while ( i >= matchRange.first ) { - // size_t currQuotient = matches[i].qInfo.pos / 3; - // uint8_t minHamming = matches[i].hamming; - // Match * minHammingMatch = & matches[i]; - // TaxID minHammingTaxId = minHammingMatch->targetId; - // i --; - // while ( (i >= matchRange.first) && (currQuotient == matches[i].qInfo.pos / 3) ) { - // if (matches[i].hamming < minHamming) { - // minHamming = matches[i].hamming; - // minHammingMatch = & matches[i]; - // minHammingTaxId = minHammingMatch->targetId; - // } else if (matches[i].hamming == minHamming) { - // minHammingTaxId = taxonomy->LCA(minHammingTaxId, matches[i].targetId); - // minHammingMatch->redundancy = true; - // matches[i].redundancy = true; - // } - // i--; - // } - // taxCnt[minHammingTaxId]++; - // } - unordered_map cladeCnt; - // cout << "8" << endl; getSpeciesCladeCounts(taxCnt, cladeCnt, spTaxId); - // // print cladeCnt - // for (auto it = cladeCnt.begin(); it != cladeCnt.end(); it++) { - // cout << it->first << " " << it->second.taxCount << " " << it->second.cladeCount << endl; - // } - // cout << "9" << endl; if (accessionLevel == 2) { // Don't do accession-level classification // Remove leaf nodes - // cout << "10" << endl; for (auto it = cladeCnt.begin(); it != cladeCnt.end(); it++) { TaxonNode const * taxon = taxonomy->taxonNode(it->first); if (strcmp(taxonomy->getString(taxon->rankIdx), "") == 0) { @@ -429,10 +216,8 @@ TaxID Taxonomer::lowerRankClassification(vector &matches, TaxID spTaxId) it->first)); } } - return BFS(cladeCnt, spTaxId); } else { - // cout << "10-2" << endl; return BFS(cladeCnt, spTaxId); } } @@ -441,8 +226,6 @@ void Taxonomer::getSpeciesCladeCounts(const unordered_map & unordered_map & cladeCount, TaxID 
speciesTaxID) { for (auto it = taxCnt.begin(); it != taxCnt.end(); ++it) { -// cladeCount[it->first].taxCount = it->second; -// cladeCount[it->first].cladeCount += it->second; TaxonNode const * taxon = taxonomy->taxonNode(it->first); cladeCount[taxon->taxId].taxCount = it->second; cladeCount[taxon->taxId].cladeCount += it->second; @@ -487,21 +270,22 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, size_t end, size_t offset, int queryLength) { - TaxID currentSpecies; - vector filteredMatches; vector> matchesForEachSpecies; - vector speciesScores; TaxonScore bestScore; - size_t i = offset; - uint8_t curFrame; vector curFrameMatches; vector matchPaths; + unordered_map species2score; + unordered_map> species2matchPaths; + float bestSpScore = 0; + unordered_map> speciesMatchRange; - while (i < end + 1) { - currentSpecies = matchList[i].speciesId; + size_t i = offset; + while (i < end + 1) { + TaxID currentSpecies = matchList[i].speciesId; + size_t start = i; // For current species while ((i < end + 1) && currentSpecies == matchList[i].speciesId) { - curFrame = matchList[i].qInfo.frame; + uint8_t curFrame = matchList[i].qInfo.frame; curFrameMatches.clear(); // For current frame while ((i < end + 1) && currentSpecies == matchList[i].speciesId && curFrame == matchList[i].qInfo.frame) { @@ -512,44 +296,61 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, remainConsecutiveMatches(curFrameMatches, matchPaths, currentSpecies); } } - // Construct a match combination using filtered matches of current species - // so that it can best cover the query, and score the combination - if (!filteredMatches.empty()) { - matchesForEachSpecies.push_back(filteredMatches); - speciesScores.push_back(scoreTaxon(filteredMatches, currentSpecies, queryLength)); + speciesMatchRange[currentSpecies] = make_pair(start, i); + // Combine MatchPaths + if (!matchPaths.empty()) { + // cout << currentSpecies << endl; + float score = combineMatchPaths(matchPaths, species2matchPaths[currentSpecies], queryLength); + // cout << endl; + if (score > 1.0f) {score = 1.0f;} + species2score[currentSpecies] = score; + if (score > bestSpScore) { + bestSpScore = score; + } } - filteredMatches.clear(); + matchPaths.clear(); } // If there are no meaningful species - if (speciesScores.empty()) { + if (species2score.empty()) { bestScore.score = 0; return bestScore; } - TaxonScore maxScore = *max_element(speciesScores.begin(), speciesScores.end(), - [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); - - vector maxIdx; - for (size_t g = 0; g < speciesScores.size(); g++) { - if (speciesScores[g].score == maxScore.score) { - maxIdx.push_back(g); + vector maxSpecies; + for (auto & spScore : species2score) { + if (spScore.second > bestSpScore * 0.99) { + maxSpecies.push_back(spScore.first); } } - bestScore = maxScore; - for (unsigned long g : maxIdx) { - for (const Match * m : matchesForEachSpecies[g]) { - speciesMatches.push_back(*m); + // More than one species --> LCA + if (maxSpecies.size() > 1) { + bestScore.LCA = true; + bestScore.taxId = taxonomy->LCA(maxSpecies)->taxId; + for (auto & sp : maxSpecies) { + bestScore.score += species2score[sp]; } + bestScore.score /= maxSpecies.size(); + return bestScore; } - // More than one species - if (maxIdx.size() > 1) { - bestScore.taxId = 0; + // One species + bestScore.taxId = maxSpecies[0]; + bestScore.score = species2score[maxSpecies[0]]; + float coveredLength = 0.f; + int hammingDist = 0; + for (auto & matchPath : 
species2matchPaths[maxSpecies[0]]) { + coveredLength += matchPath.end - matchPath.start + 1; + hammingDist += matchPath.hammingDist; + for (size_t i = speciesMatchRange[bestScore.taxId].first; i < speciesMatchRange[bestScore.taxId].second; i++) { + speciesMatches.push_back(matchList[i]); + } } - - return bestScore; + bestScore.coverage = coveredLength / queryLength; + bestScore.hammingDist = hammingDist; + + return bestScore; } TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, @@ -558,24 +359,22 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, size_t offset, int readLength1, int readLength2) { - TaxID currentSpecies; - vector filteredMatches; vector> matchesForEachSpecies; - vector speciesScores; - TaxonScore bestScore; - size_t i = offset; - uint8_t curFrame; + TaxonScore bestScore; vector curFrameMatches; vector matchPaths; unordered_map species2score; unordered_map> species2matchPaths; float bestSpScore = 0; - - while (i < end + 1) { - currentSpecies = matchList[i].speciesId; + unordered_map> speciesMatchRange; + + size_t i = offset; + while (i < end + 1) { + TaxID currentSpecies = matchList[i].speciesId; + size_t start = i; // For current species while ((i < end + 1) && currentSpecies == matchList[i].speciesId) { - curFrame = matchList[i].qInfo.frame; + uint8_t curFrame = matchList[i].qInfo.frame; curFrameMatches.clear(); // For current frame while ((i < end + 1) && currentSpecies == matchList[i].speciesId && curFrame == matchList[i].qInfo.frame) { @@ -583,19 +382,16 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, i ++; } if (curFrameMatches.size() > 1) { - // cout << "1" << endl; remainConsecutiveMatches(curFrameMatches, matchPaths, currentSpecies); } } + speciesMatchRange[currentSpecies] = make_pair(start, i); // Combine MatchPaths - // so that it can best cover the query, and score the combination if (!matchPaths.empty()) { - // Initialize species2matchPaths - species2matchPaths[currentSpecies].emplace_back(0, 0, 0, 0); - // cout << "2" << endl; // cout << currentSpecies << endl; - float score = combineMatchPaths(matchPaths, species2matchPaths[currentSpecies], readLength1 + readLength2, matchList); + float score = combineMatchPaths(matchPaths, species2matchPaths[currentSpecies], readLength1 + readLength2); // cout << endl; + if (score > 1.0f) {score = 1.0f;} species2score[currentSpecies] = score; if (score > bestSpScore) { bestSpScore = score; @@ -609,15 +405,14 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, bestScore.score = 0; return bestScore; } - // cout << "4" << endl; + vector maxSpecies; for (auto & spScore : species2score) { - // cout << spScore.first << " " << spScore.second << endl; - if (spScore.second == bestSpScore) { + if (spScore.second > bestSpScore * 0.99) { maxSpecies.push_back(spScore.first); } } - // cout << "5" << endl; + // More than one species --> LCA if (maxSpecies.size() > 1) { bestScore.LCA = true; @@ -635,26 +430,21 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, float coveredLength = 0.f; int hammingDist = 0; for (auto & matchPath : species2matchPaths[maxSpecies[0]]) { - // cout << "here" << endl; coveredLength += matchPath.end - matchPath.start + 1; hammingDist += matchPath.hammingDist; - for (auto match : matchPath.matches) { - // cout << match->targetId << endl; - // match->printMatch(); - speciesMatches.push_back(*match); - // speciesMatches.back().printMatch(); + for (size_t i = speciesMatchRange[bestScore.taxId].first; i < 
speciesMatchRange[bestScore.taxId].second; i++) { + speciesMatches.push_back(matchList[i]); } } bestScore.coverage = coveredLength / (readLength1 + readLength2); bestScore.hammingDist = hammingDist; -// cout << "6" << endl; return bestScore; } float Taxonomer::combineMatchPaths(vector & matchPaths, vector & combinedMatchPaths, - int readLength, const Match * matchList) { + int readLength) { combinedMatchPaths.clear(); // Sort matchPaths by the their score sort(matchPaths.begin(), matchPaths.end(), @@ -665,35 +455,24 @@ float Taxonomer::combineMatchPaths(vector & matchPaths, // 2. Add the matchPath with the highest score that is not overlapped with the matchPath in combinedMatchPaths // 3. Repeat 2 until no matchPath can be added for (size_t i = 0; i < matchPaths.size(); i++) { - // cout << matchPaths[i].start << " " << matchPaths[i].end << " " << matchPaths[i].score << " " << matchPaths[i].matches.back()->targetId << " " << matchPaths[i].matches.back()->qInfo.frame <printMatch(); - // } } else { bool isOverlapped = false; for (size_t j = 0; j < combinedMatchPaths.size(); j++) { if (isMatchPathOverlapped(matchPaths[i], combinedMatchPaths[j])) { // overlap! if (isMatchPathLinked(matchPaths[i], combinedMatchPaths[j])) { // merge two linked matchPaths by editing the combinedMatchPaths[j] - mergeMatchPaths(matchPaths[i], combinedMatchPaths[j]); - break; + trimMatchPath(matchPaths[i], combinedMatchPaths[j]); + continue; } else { + isOverlapped = true; break; } - isOverlapped = true; } } if (!isOverlapped) { combinedMatchPaths.push_back(matchPaths[i]); - // combinedMatchPaths.back().matches = matchPaths[i].matches; - // cout << matchPaths[i].start << " " << matchPaths[i].end << " " << matchPaths[i].score << endl; - // for (auto & match : matchPaths[i].matches) { - // match->printMatch(); - // } } } } @@ -707,7 +486,7 @@ float Taxonomer::combineMatchPaths(vector & matchPaths, bool Taxonomer::isMatchPathLinked(const MatchPath & matchPath1, const MatchPath & matchPath2) { int overlappedLength = min(matchPath1.end, matchPath2.end) - max(matchPath1.start, matchPath2.start) + 1; - if (!(20 < overlappedLength && overlappedLength < 24)) { + if (20 >= overlappedLength || overlappedLength >= 24) { return false; } const Match * last; @@ -722,6 +501,7 @@ bool Taxonomer::isMatchPathLinked(const MatchPath & matchPath1, const MatchPath if (overlappedLength == 21) { return isConsecutive(last, first); } else { + return isConsecutive_diffFrame(last, first); } return false; @@ -758,114 +538,32 @@ void Taxonomer::mergeMatchPaths(const MatchPath & source, MatchPath & target) { } } - -// if (matchPath1.start > matchPath2.start) { -// return isMatchPathOverlapped(matchPath2, matchPath1, readLength); -// } -// if (matchPath1.end < matchPath2.start) { -// return false; -// } -// if (matchPath1.endPos >= matchPath2.startPos) { -// if (matchPath1.endPos <= matchPath2.endPos) { -// return true; -// } else { -// if (matchPath1.startPos + readLength - 1 >= matchPath2.startPos) { -// return true; -// } else { -// return false; -// } -// } -// } -// return false; -// } - - -// TaxonScore Taxonomer::getBestGenusMatches(vector &genusMatches, const Match *matchList, size_t end, -// size_t offset, int readLength1, int readLength2) { -// TaxID currentGenus; -// TaxID currentSpecies; - -// vector filteredMatches; -// vector> matchesForEachGenus; -// vector genusScores; -// TaxonScore bestScore; -// size_t i = offset; -// uint8_t curFrame; -// vector curFrameMatches; -// while (i < end + 1) { -// // currentGenus = 
taxId2genusId[matchList[i].targetId]; -// currentGenus = matchList[i].genusId; -// // For current genus -// while ((i < end + 1) && currentGenus == matchList[i].genusId) { -// // currentSpecies = taxId2speciesId[matchList[i].targetId]; -// currentSpecies = matchList[i].speciesId; -// // if (par.printLog) { -// // cout << currentGenus << " " << currentSpecies << endl; -// // } -// // For current species -// while ((i < end + 1) && currentSpecies == matchList[i].speciesId) { -// curFrame = matchList[i].qInfo.frame; -// curFrameMatches.clear(); - -// // For current frame -// while ((i < end + 1) && currentSpecies == matchList[i].speciesId -// && curFrame == matchList[i].qInfo.frame) { -// curFrameMatches.push_back(&matchList[i]); -// i ++; -// } -// if (curFrameMatches.size() > 1) { -// remainConsecutiveMatches(curFrameMatches, filteredMatches, currentGenus); -// } -// } -// } - -// // Construct a match combination using filtered matches of current genus -// // so that it can best cover the query, and score the combination -// if (!filteredMatches.empty()) { -// matchesForEachGenus.push_back(filteredMatches); -// genusScores.push_back(scoreTaxon(filteredMatches, currentGenus, readLength1, readLength2)); -// } -// filteredMatches.clear(); -// } - -// // If there are no meaningful genus -// if (genusScores.empty()) { -// bestScore.score = 0; -// return bestScore; -// } - -// TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), -// [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); - -// vector maxIdx; -// for (size_t g = 0; g < genusScores.size(); g++) { -// if (genusScores[g].score > maxScore.score * 0.95f) { -// maxIdx.push_back(g); -// } -// } -// bestScore = maxScore; - -// for (unsigned long g : maxIdx) { -// for (const Match * m : matchesForEachGenus[g]) { -// genusMatches.push_back(*m); -// } -// } - - - -// // More than one genus -// if (maxIdx.size() > 1) { -// bestScore.taxId = 0; -// return bestScore; -// } - -// return bestScore; - -// //Three cases -// //1. one genus -// //2. more than one genus -// //4. 
no genus -// } +void Taxonomer::trimMatchPath(MatchPath & path1, const MatchPath & path2) { + int margin = min(path1.end, path2.end) - max(path1.start, path2.start) + 1 - 21; + if (path1.start < path2.start) { + path1.end = path2.start - 1; + uint8_t lastEndHamming = GET_2_BITS(path1.matches.back()->rightEndHamming); + path1.hammingDist = path1.hammingDist - (path1.matches.back()->hamming - lastEndHamming); + path1.score = path1.score - path1.matches.back()->getScore() - margin; + if (lastEndHamming == 0) { + path1.score += 3.0f; + } else { + path1.score += 2.0f - 0.5f * lastEndHamming; + } + path1.matches.pop_back(); + } else { + path1.start = path2.end + 1; + uint8_t lastEndHamming = GET_2_BITS(path1.matches.front()->rightEndHamming >> 14); + path1.hammingDist = path1.hammingDist - (path1.matches.front()->hamming - lastEndHamming); + path1.score = path1.score - path1.matches.front()->getScore() - margin; + if (lastEndHamming == 0) { + path1.score += 3.0f; + } else { + path1.score += 2.0f - 0.5f * lastEndHamming; + } + path1.matches.erase(path1.matches.begin()); + } +} void Taxonomer::remainConsecutiveMatches(const vector & curFrameMatches, vector & matchPaths, @@ -983,39 +681,6 @@ void Taxonomer::remainConsecutiveMatches(const vector & curFrameM // } } -// size_t Taxonomer::DFS(size_t curMatchIdx, const map> & linkedMatches, -// vector& filteredMatches, size_t depth, size_t MIN_DEPTH, unordered_set& used, -// unordered_map & idx2depth) { -// depth++; -// size_t maxDepth = 0; -// size_t returnDepth = 0; -// if (linkedMatches.find(curMatchIdx) == linkedMatches.end()) { -// // reached a leaf node -// idx2depth[curMatchIdx] = depth; -// if (depth > MIN_DEPTH) { -// filteredMatches.push_back(curMatchIdx); -// } -// return depth; -// } else { // not a leaf node -// for (auto &nextMatchIdx: linkedMatches.at(curMatchIdx)) { -// used.insert(nextMatchIdx); -// if (idx2depth.find(nextMatchIdx) != idx2depth.end()) { -// returnDepth = idx2depth[nextMatchIdx]; -// maxDepth = max(maxDepth, returnDepth); -// continue; -// } -// returnDepth = DFS(nextMatchIdx, linkedMatches, filteredMatches, depth, MIN_DEPTH, used, idx2depth); -// maxDepth = max(maxDepth, returnDepth); -// } -// if (maxDepth > MIN_DEPTH) { -// filteredMatches.push_back(curMatchIdx); -// idx2depth[curMatchIdx] = maxDepth; -// } -// } -// return maxDepth; -// } - -// return: end depthScore Taxonomer::DFS(const vector &matches, size_t curMatchIdx, const map> &linkedMatches, @@ -1069,304 +734,6 @@ depthScore Taxonomer::DFS(const vector &matches, return bestDepthScore; } -// TaxonScore Taxonomer::getBestGenusMatches_spaced(vector &genusMatches, const Match *matchList, size_t end, -// size_t offset, int readLength1, int readLength2) { -// TaxID currentGenus; -// TaxID currentSpecies; - -// vector tempMatchContainer; -// vector filteredMatches; -// vector> matchesForEachGenus; -// vector conservedWithinGenus; -// vector genusScores; -// TaxonScore bestScore; -// size_t i = offset; -// bool lastIn; -// while (i + 1 < end + 1) { -// currentGenus = matchList[i].genusId; -// // For current genus -// while ((i + 1 < end + 1) && currentGenus == matchList[i].genusId) { -// // currentSpecies = taxId2speciesId[matchList[i].targetId]; -// currentSpecies = matchList[i].speciesId; -// // For current species -// // Filter un-consecutive matches (probably random matches) -// lastIn = false; -// int distance = 0; -// int diffPosCntOfCurrRange = 1; -// int dnaDist = 0; - -// // For the same species -// while ((i + 1 < end + 1) && currentSpecies == matchList[i 
+ 1].speciesId) { -// distance = matchList[i+1].qInfo.pos / 3 - matchList[i].qInfo.pos / 3; -// dnaDist = matchList[i+1].qInfo.pos - matchList[i].qInfo.pos; -// if (distance == 0) { // At the same position -// tempMatchContainer.push_back(matchList + i); -// } else if (dnaDist < (8 + spaceNum + maxGap) * 3) { // Overlapping -// lastIn = true; -// tempMatchContainer.push_back(matchList + i); -// diffPosCntOfCurrRange ++; -// } else { // Not consecutive --> End range -// if (lastIn){ -// tempMatchContainer.push_back(matchList + i); -// if (diffPosCntOfCurrRange >= minCoveredPos) { -// filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), -// tempMatchContainer.end()); -// } -// } -// lastIn = false; -// // Initialize range info -// tempMatchContainer.clear(); -// diffPosCntOfCurrRange = 1; -// } -// i++; -// } - -// // Met next species -// if (lastIn) { -// tempMatchContainer.push_back(matchList + i); -// if (diffPosCntOfCurrRange >= minCoveredPos) { -// filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), -// tempMatchContainer.end()); -// } -// } -// tempMatchContainer.clear(); -// i++; -// } - -// // Construct a match combination using filtered matches of current genus -// // so that it can best cover the query, and score the combination -// if (!filteredMatches.empty()) { -// genusScores.push_back(scoreTaxon(filteredMatches, readLength1, readLength2)); -// } -// filteredMatches.clear(); -// } - -// // If there are no meaningful genus -// if (genusScores.empty()) { -// bestScore.score = 0; -// return bestScore; -// } - -// TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), -// [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); - -// vector maxIdx; -// for (size_t g = 0; g < genusScores.size(); g++) { -// if (genusScores[g].score > maxScore.score * 0.95f) { -// maxIdx.push_back(g); -// } -// } -// bestScore = maxScore; - -// for (unsigned long g : maxIdx) { -// for (const Match * m : matchesForEachGenus[g]) { -// genusMatches.push_back(*m); -// } -// } - -// // More than one genus -// if (maxIdx.size() > 1) { -// bestScore.taxId = 0; -// return bestScore; -// } -// return bestScore; - -// //Three cases -// //1. one genus -// //2. more than one genus -// //4. 
no genus -// } - -// TaxonScore Taxonomer::getBestGenusMatches(vector &genusMatches, const Match *matchList, size_t end, -// size_t offset, int queryLength) { -// TaxID currentGenus; -// TaxID currentSpecies; - -// vector filteredMatches; -// vector> matchesForEachGenus; -// vector genusScores; -// TaxonScore bestScore; -// size_t i = offset; -// uint8_t curFrame; -// vector curFrameMatches; -// while (i < end + 1) { -// currentGenus = matchList[i].genusId; -// // For current genus -// while ((i < end + 1) && currentGenus == matchList[i].genusId) { -// currentSpecies = matchList[i].speciesId; - -// // For current species -// while ((i < end + 1) && currentSpecies == matchList[i].speciesId) { -// curFrame = matchList[i].qInfo.frame; -// curFrameMatches.clear(); - -// // For current frame -// while ((i < end + 1) && currentSpecies == matchList[i].speciesId -// && curFrame == matchList[i].qInfo.frame) { -// curFrameMatches.push_back(&matchList[i]); -// i ++; -// } -// if (curFrameMatches.size() > 1) { -// remainConsecutiveMatches(curFrameMatches, filteredMatches, currentGenus); -// } -// } -// } - -// // Construct a match combination using filtered matches of current genus -// // so that it can best cover the query, and score the combination - -// if (!filteredMatches.empty()) { -// matchesForEachGenus.push_back(filteredMatches); -// genusScores.push_back(scoreTaxon(filteredMatches, currentGenus, queryLength)); -// } -// filteredMatches.clear(); -// } - -// // If there are no meaningful genus -// if (genusScores.empty()) { -// bestScore.score = 0; -// return bestScore; -// } - -// TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), -// [](const TaxonScore & a, const TaxonScore & b) { return a.score < b.score; }); - -// vector maxIdx; -// for (size_t g = 0; g < genusScores.size(); g++) { -// if (genusScores[g].score > maxScore.score * 0.95f) { -// maxIdx.push_back(g); -// } -// } - -// bestScore = maxScore; - -// for (unsigned long g : maxIdx) { -// for (const Match * m : matchesForEachGenus[g]) { -// genusMatches.push_back(*m); -// } -// } - -// // More than one genus -// if (maxIdx.size() > 1) { -// bestScore.taxId = 0; -// return bestScore; -// } -// return bestScore; - -// //Three cases -// //1. one genus -// //2. more than one genus -// //4. 
no genus -// } - -// TaxonScore Taxonomer::getBestGenusMatches_spaced(vector &genusMatches, const Match *matchList, size_t end, -// size_t offset, int readLength) { -// TaxID currentGenus; -// TaxID currentSpecies; - -// vector tempMatchContainer; -// vector filteredMatches; -// vector> matchesForEachGenus; -// vector conservedWithinGenus; -// vector genusScores; -// TaxonScore bestScore; -// size_t i = offset; -// bool lastIn; -// size_t speciesMatchCnt; -// while (i + 1 < end + 1) { -// currentGenus = matchList[i].genusId; -// // For current genus -// while ((i + 1 < end + 1) && currentGenus == matchList[i].genusId) { -// currentSpecies = matchList[i].speciesId; -// // For current species -// // Filter un-consecutive matches (probably random matches) -// lastIn = false; -// int distance = 0; -// int diffPosCntOfCurrRange = 1; -// int dnaDist = 0; - -// // For the same species -// while ((i + 1 < end + 1) && currentSpecies == matchList[i + 1].speciesId) { -// distance = matchList[i + 1].qInfo.pos / 3 - matchList[i].qInfo.pos / 3; -// dnaDist = matchList[i + 1].qInfo.pos - matchList[i].qInfo.pos; -// if (distance == 0) { // At the same position -// tempMatchContainer.push_back(matchList + i); -// } else if (dnaDist < (8 + spaceNum + maxGap) * 3) { // Overlapping -// lastIn = true; -// tempMatchContainer.push_back(matchList + i); -// diffPosCntOfCurrRange++; -// } else { // Not consecutive --> End range -// if (lastIn) { -// tempMatchContainer.push_back(matchList + i); -// if (diffPosCntOfCurrRange >= minCoveredPos) { -// filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), -// tempMatchContainer.end()); -// } -// } -// lastIn = false; -// // Initialize range info -// tempMatchContainer.clear(); -// diffPosCntOfCurrRange = 1; -// } -// i++; -// } - -// // Met next species -// if (lastIn) { -// tempMatchContainer.push_back(matchList + i); -// if (diffPosCntOfCurrRange >= minCoveredPos) { -// filteredMatches.insert(filteredMatches.end(), tempMatchContainer.begin(), -// tempMatchContainer.end()); -// } -// } -// tempMatchContainer.clear(); -// i++; -// } - -// // Construct a match combination using filtered matches of current genus -// // so that it can best cover the query, and score the combination -// if (!filteredMatches.empty()) { -// genusScores.push_back(scoreTaxon(filteredMatches, readLength)); -// } -// filteredMatches.clear(); -// } - -// // If there are no meaningful genus -// if (genusScores.empty()) { -// bestScore.score = 0; -// return bestScore; -// } - -// TaxonScore maxScore = *max_element(genusScores.begin(), genusScores.end(), -// [](const TaxonScore &a, const TaxonScore &b) { return a.score < b.score; }); - -// vector maxIdx; -// for (size_t g = 0; g < genusScores.size(); g++) { -// if (genusScores[g].score > maxScore.score * 0.95f) { -// maxIdx.push_back(g); -// } -// } -// bestScore = maxScore; - -// for (unsigned long g: maxIdx) { -// genusMatches.insert(genusMatches.end(), -// matchesForEachGenus[g].begin(), -// matchesForEachGenus[g].end()); -// } - -// // More than one genus -// if (maxIdx.size() > 1) { -// bestScore.taxId = 0; -// return bestScore; -// } -// return bestScore; - -// //Three cases -// //1. one genus -// //2. more than one genus -// //4. 
no genus -// } - TaxonScore Taxonomer::scoreTaxon(vector &filteredMatches, TaxID taxId, int queryLength) { @@ -1730,6 +1097,8 @@ bool Taxonomer::isConsecutive(const Match * match1, const Match * match2) { bool Taxonomer::isConsecutive_diffFrame(const Match * match1, const Match * match2) { // int hamming1 = match1->hamming - GET_2_BITS(match1->rightEndHamming); // int hamming2 = match2->hamming - GET_2_BITS(match2->rightEndHamming >> 14); + // cout << match1->rightEndHamming << " " << match2->rightEndHamming << endl; + // cout << hamming1 << " " << hamming2 << endl; // match1 87654321 -> 08765432 // match2 98765432 -> 08765432 return (match1->hamming - GET_2_BITS(match1->rightEndHamming)) == (match2->hamming - GET_2_BITS(match2->rightEndHamming >> 14)); diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h index 722ced8b..ee131df1 100644 --- a/src/commons/Taxonomer.h +++ b/src/commons/Taxonomer.h @@ -85,13 +85,6 @@ class Taxonomer { const Match *matchList, vector & queryList, const LocalParameters &par); - - void chooseBestTaxon2(uint32_t currentQuery, - size_t offset, - size_t end, - const Match *matchList, - vector & queryList, - const LocalParameters &par); void remainConsecutiveMatches(const vector & curFrameMatches, vector & matchPaths, @@ -99,7 +92,7 @@ class Taxonomer { float combineMatchPaths(vector & matchPaths, vector & combinedMatchPaths, - int readLength, const Match * matchList); + int readLength); bool isMatchPathOverlapped(const MatchPath & matchPath1, const MatchPath & matchPath2); @@ -107,6 +100,8 @@ class Taxonomer { void mergeMatchPaths(const MatchPath & source, MatchPath & target); + void trimMatchPath(MatchPath & path1, const MatchPath & path2); + depthScore DFS(const vector &matches, size_t curMatchIdx, const map> &linkedMatches, size_t depth, size_t MIN_DEPTH, unordered_set &used, From ba7375ea3e856398fb8e62616353ef781f8389ed Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Thu, 9 Nov 2023 17:10:46 +0900 Subject: [PATCH 59/65] undo changes in getExtendedORFs function --- src/commons/SeqIterator.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/commons/SeqIterator.cpp b/src/commons/SeqIterator.cpp index 46cf14a8..6d4cdd24 100644 --- a/src/commons/SeqIterator.cpp +++ b/src/commons/SeqIterator.cpp @@ -539,11 +539,11 @@ void SeqIterator::getExtendedORFs(struct _gene *genes, struct _node *nodes, vect frame = (genes[0].begin - 1) % 3; leftEnd = 0; while (leftEnd % 3 != frame) leftEnd++; - blocks.emplace_back(leftEnd, genes[1].begin - 2, 1); + blocks.emplace_back(leftEnd, genes[1].begin - 1 + 22, 1); blockIdx++; } else { frame = (genes[0].end - 1) % 3; - rightEnd = genes[1].begin - 2; + rightEnd = genes[1].begin - 1 + 22; while (rightEnd % 3 != frame) rightEnd--; blocks.emplace_back(0, rightEnd, -1); blockIdx++; @@ -583,12 +583,12 @@ void SeqIterator::getExtendedORFs(struct _gene *genes, struct _node *nodes, vect } else { if (!isReverse) { //forward frame = (genes[geneIdx].begin - 1) % 3; - leftEnd = genes[geneIdx - 1].end; + leftEnd = genes[geneIdx - 1].end -1 -22; while (leftEnd % 3 != frame) leftEnd++; blocks.emplace_back(leftEnd, genes[geneIdx].end - 1, 1); blockIdx++; } else { // reverse - blocks.emplace_back(genes[geneIdx - 1].end, genes[geneIdx].end - 1, -1); + blocks.emplace_back(genes[geneIdx - 1].end - 22 - 1, genes[geneIdx].end - 1, -1); blockIdx++; } } @@ -597,24 +597,24 @@ void SeqIterator::getExtendedORFs(struct _gene *genes, struct _node *nodes, vect if (hasBeenExtendedToLeft) { if (!isReverse) { 
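            // (Recurring idiom in getExtendedORFs, visible below and throughout this
            //  hunk: before a block is emitted, its boundary is nudged one base at a
            //  time, leftEnd++ or rightEnd--, until pos % 3 == frame, i.e. the block
            //  is snapped onto the gene's reading frame.)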
//forward frame = (genes[geneIdx].begin - 1) % 3; - leftEnd = genes[geneIdx - 1].end; + leftEnd = genes[geneIdx - 1].end - 1 - 22; while (leftEnd % 3 != frame) leftEnd++; - blocks.emplace_back(leftEnd, genes[geneIdx + 1].begin - 2, 1); + blocks.emplace_back(leftEnd, genes[geneIdx + 1].begin - 1 + 22, 1); blockIdx++; } else { frame = (genes[geneIdx].end - 1) % 3; - rightEnd = genes[geneIdx + 1].begin - 2; + rightEnd = genes[geneIdx + 1].begin - 1 + 22; while (rightEnd % 3 != frame) rightEnd--; - blocks.emplace_back(genes[geneIdx - 1].end, rightEnd, -1); + blocks.emplace_back(genes[geneIdx - 1].end - 1 - 22, rightEnd, -1); blockIdx++; } } else { if (!isReverse) { //forward - blocks.emplace_back(genes[geneIdx].begin - 1, genes[geneIdx + 1].begin - 2, 1); + blocks.emplace_back(genes[geneIdx].begin - 1, genes[geneIdx + 1].begin - 1 + 22, 1); blockIdx++; } else { frame = (genes[geneIdx].end - 1) % 3; - rightEnd = genes[geneIdx + 1].begin - 2; + rightEnd = genes[geneIdx + 1].begin - 1 + 22; while (rightEnd % 3 != frame) rightEnd--; blocks.emplace_back(genes[geneIdx].begin - 1, rightEnd, -1); blockIdx++; @@ -639,7 +639,7 @@ void SeqIterator::getExtendedORFs(struct _gene *genes, struct _node *nodes, vect // If left region is not covered, cover it. leftEnd = genes[numOfGene - 1].begin - 1; if (hasBeenExtendedToLeft) { - leftEnd = genes[numOfGene - 2].end; + leftEnd = genes[numOfGene - 2].end - 1 - 22; if (!isReverse) { frame = (genes[numOfGene - 1].begin - 1) % 3; while (leftEnd % 3 != frame) leftEnd++; From cfb163522a7c9cef1a3138536ac0fd83741ccc00 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Sun, 12 Nov 2023 15:21:06 +0900 Subject: [PATCH 60/65] error in subspecies level classification --- src/commons/QueryIndexer.cpp | 10 +-- src/commons/Taxonomer.cpp | 147 ++++++++++++++++++++++------------- src/commons/Taxonomer.h | 11 ++- 3 files changed, 105 insertions(+), 63 deletions(-) diff --git a/src/commons/QueryIndexer.cpp b/src/commons/QueryIndexer.cpp index 66ec70a4..4fba5072 100644 --- a/src/commons/QueryIndexer.cpp +++ b/src/commons/QueryIndexer.cpp @@ -51,11 +51,11 @@ void QueryIndexer::indexQueryFile() { } querySplits.emplace_back(start, readNum_1, kmerCnt); // Print elements - for (auto & querySplit : querySplits) { - std::cout << "start: " << querySplit.start << "\t"; - std::cout << "end: " << querySplit.end << "\t"; - std::cout << "kmerCnt: " << querySplit.kmerCnt << "\n"; - } + // for (auto & querySplit : querySplits) { + // std::cout << "start: " << querySplit.start << "\t"; + // std::cout << "end: " << querySplit.end << "\t"; + // std::cout << "kmerCnt: " << querySplit.kmerCnt << "\n"; + // } delete kseq; } else { KSeqWrapper* kseq_1 = KSeqFactory(queryPath_1.c_str()); diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index c0760d8c..4b0f230f 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -91,7 +91,7 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, // } // } // Get the best species for current query - vector speciesMatches; + vector speciesMatches; speciesMatches.reserve(end - offset + 1); TaxonScore speciesScore(0, 0, 0, 0, 0); if (par.seqMode == 2) { @@ -132,6 +132,11 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, return; } + // Filter redundant matches + vector filteredMatches; + unordered_map taxCnt; + filterRedundantMatches(speciesMatches, filteredMatches, taxCnt); + // If score is not enough, classify to the parent of the selected species if (speciesScore.score < par.minSpScore) { queryList[currentQuery].isClassified = 
true; @@ -140,21 +145,18 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, queryList[currentQuery].score = speciesScore.score; queryList[currentQuery].coverage = speciesScore.coverage; queryList[currentQuery].hammingDist = speciesScore.hammingDist; - for (auto & spMatch : speciesMatches) { - queryList[currentQuery].taxCnt[spMatch.targetId]++; + for (auto spMatch : filteredMatches) { + queryList[currentQuery].taxCnt[spMatch->targetId]++; } return; } - // Sort matches by the coordinate of the query - sort(speciesMatches.begin(), speciesMatches.end(), - [](const Match & a, const Match & b) { return a.qInfo.pos < b.qInfo.pos; }); - - TaxID result = lowerRankClassification(speciesMatches, speciesScore.taxId); + // Lower rank classification + TaxID result = lowerRankClassification(taxCnt, speciesScore.taxId); // Record matches of selected species - for (auto & spMatch : speciesMatches) { - queryList[currentQuery].taxCnt[spMatch.targetId]++; + for (auto & spMatch : filteredMatches) { + queryList[currentQuery].taxCnt[spMatch->targetId]++; } // Store classification results @@ -177,31 +179,59 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, // } } -TaxID Taxonomer::lowerRankClassification(vector &matches, TaxID spTaxId) { - unordered_map taxCnt; - size_t matchNum = matches.size(); - +void Taxonomer::filterRedundantMatches(vector & speciesMatches, + vector & filteredMatches, + unordered_map & taxCnt) { + filteredMatches.reserve(speciesMatches.size()); + // Sort matches by the coordinate on the query + sort(speciesMatches.begin(), speciesMatches.end(), + [](const Match * a, const Match * b) { return a->qInfo.pos < b->qInfo.pos; }); + + // Remove redundant matches + size_t matchNum = speciesMatches.size(); for (size_t i = 0; i < matchNum; i++) { - // cout << matches[i].targetId << endl; - // taxCnt[matches[i].targetId] ++; - size_t currQuotient = matches[i].qInfo.pos / 3; - uint8_t minHamming = matches[i].hamming; - Match * minHammingMatch = & matches[i]; + size_t currQuotient = speciesMatches[i]->qInfo.pos / 3; + uint8_t minHamming = speciesMatches[i]->hamming; + const Match * minHammingMatch = speciesMatches[i]; TaxID minHammingTaxId = minHammingMatch->targetId; - while ((i < matchNum) && (currQuotient == matches[i].qInfo.pos / 3)) { - if (matches[i].hamming < minHamming) { - minHamming = matches[i].hamming; - minHammingMatch = & matches[i]; + while ((i < matchNum) && (currQuotient == speciesMatches[i]->qInfo.pos / 3)) { + if (speciesMatches[i]->hamming < minHamming) { + minHamming = speciesMatches[i]->hamming; + minHammingMatch = speciesMatches[i]; minHammingTaxId = minHammingMatch->targetId; - } else if (matches[i].hamming == minHamming) { - minHammingTaxId = taxonomy->LCA(minHammingTaxId, matches[i].targetId); - minHammingMatch->redundancy = true; - matches[i].redundancy = true; + } else if (speciesMatches[i]->hamming == minHamming) { + minHammingTaxId = taxonomy->LCA(minHammingTaxId, speciesMatches[i]->targetId); } i++; } - taxCnt[minHammingTaxId]++; + filteredMatches.push_back(&*minHammingMatch); + taxCnt[minHammingTaxId]++; } +} + +TaxID Taxonomer::lowerRankClassification(const unordered_map & taxCnt, TaxID spTaxId) { + // size_t matchNum = matches.size(); + // for (size_t i = 0; i < matchNum; i++) { + // // cout << matches[i].targetId << endl; + // // taxCnt[matches[i].targetId] ++; + // size_t currQuotient = matches[i].qInfo.pos / 3; + // uint8_t minHamming = matches[i].hamming; + // Match * minHammingMatch = & matches[i]; + // TaxID minHammingTaxId = 
minHammingMatch->targetId; + // while ((i < matchNum) && (currQuotient == matches[i].qInfo.pos / 3)) { + // if (matches[i].hamming < minHamming) { + // minHamming = matches[i].hamming; + // minHammingMatch = & matches[i]; + // minHammingTaxId = minHammingMatch->targetId; + // } else if (matches[i].hamming == minHamming) { + // minHammingTaxId = taxonomy->LCA(minHammingTaxId, matches[i].targetId); + // minHammingMatch->redundancy = true; + // matches[i].redundancy = true; + // } + // i++; + // } + // taxCnt[minHammingTaxId]++; + // } unordered_map cladeCnt; getSpeciesCladeCounts(taxCnt, cladeCnt, spTaxId); @@ -265,7 +295,7 @@ TaxID Taxonomer::BFS(const unordered_map & cladeCnt, TaxID r } } -TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, +TaxonScore Taxonomer::getBestSpeciesMatches(vector & speciesMatches, const Match *matchList, size_t end, size_t offset, @@ -319,7 +349,7 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, vector maxSpecies; for (auto & spScore : species2score) { - if (spScore.second > bestSpScore * 0.99) { + if (spScore.second > bestSpScore * 0.95) { maxSpecies.push_back(spScore.first); } } @@ -343,9 +373,12 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, for (auto & matchPath : species2matchPaths[maxSpecies[0]]) { coveredLength += matchPath.end - matchPath.start + 1; hammingDist += matchPath.hammingDist; - for (size_t i = speciesMatchRange[bestScore.taxId].first; i < speciesMatchRange[bestScore.taxId].second; i++) { - speciesMatches.push_back(matchList[i]); - } + } + speciesMatches.reserve(speciesMatchRange[bestScore.taxId].second + - speciesMatchRange[bestScore.taxId].first + 1); + + for (size_t j = speciesMatchRange[bestScore.taxId].first; j < speciesMatchRange[bestScore.taxId].second; j++) { + speciesMatches.push_back(& matchList[j]); } bestScore.coverage = coveredLength / queryLength; bestScore.hammingDist = hammingDist; @@ -353,7 +386,7 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, return bestScore; } -TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, +TaxonScore Taxonomer::getBestSpeciesMatches(vector & speciesMatches, const Match *matchList, size_t end, size_t offset, @@ -408,7 +441,7 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, vector maxSpecies; for (auto & spScore : species2score) { - if (spScore.second > bestSpScore * 0.99) { + if (spScore.second > bestSpScore * 0.95) { maxSpecies.push_back(spScore.first); } } @@ -432,9 +465,12 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector &speciesMatches, for (auto & matchPath : species2matchPaths[maxSpecies[0]]) { coveredLength += matchPath.end - matchPath.start + 1; hammingDist += matchPath.hammingDist; - for (size_t i = speciesMatchRange[bestScore.taxId].first; i < speciesMatchRange[bestScore.taxId].second; i++) { - speciesMatches.push_back(matchList[i]); - } + } + speciesMatches.reserve(speciesMatchRange[bestScore.taxId].second + - speciesMatchRange[bestScore.taxId].first + 1); + + for (size_t i = speciesMatchRange[bestScore.taxId].first; i < speciesMatchRange[bestScore.taxId].second; i++) { + speciesMatches.push_back(&matchList[i]); } bestScore.coverage = coveredLength / (readLength1 + readLength2); bestScore.hammingDist = hammingDist; @@ -550,7 +586,7 @@ void Taxonomer::trimMatchPath(MatchPath & path1, const MatchPath & path2) { } else { path1.score += 2.0f - 0.5f * lastEndHamming; } - path1.matches.pop_back(); + // path1.matches.pop_back(); } else { path1.start = path2.end + 1; 
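                // (In this branch of trimMatchPath, path1 loses the overlap and is
                //  re-anchored just after path2. As defined where trimMatchPath was
                //  introduced earlier in this series: the front match's full
                //  getScore() and the overlap margin are subtracted, and only its
                //  last codon is re-credited, +3.0f for hamming 0 and otherwise
                //  2.0f - 0.5f * h, with h read from the top two bits of
                //  rightEndHamming (>> 14); hammingDist likewise keeps only that
                //  codon's mismatches.)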
uint8_t lastEndHamming = GET_2_BITS(path1.matches.front()->rightEndHamming >> 14);
@@ -561,7 +597,7 @@ void Taxonomer::trimMatchPath(MatchPath & path1, const MatchPath & path2) {
         } else {
             path1.score += 2.0f - 0.5f * lastEndHamming;
         }
-        path1.matches.erase(path1.matches.begin());
+        // path1.matches.erase(path1.matches.begin());
     }
 }
 
@@ -691,6 +727,8 @@ depthScore Taxonomer::DFS(const vector &matches,
     depth++;
     depthScore bestDepthScore = depthScore(0, 0, 0);
     depthScore returnDepthScore;
+    depthScore curDepthScore;
+    float receivedScore = score;
     if (linkedMatches.find(curMatchIdx) == linkedMatches.end()) { // reached a leaf node
         uint8_t lastEndHamming = (matches[curMatchIdx]->rightEndHamming >> 14);
         if (lastEndHamming == 0) {
@@ -698,7 +736,7 @@ depthScore Taxonomer::DFS(const vector &matches,
         } else {
             score += 2.0f - 0.5f * lastEndHamming;
         }
-        idx2depthScore[curMatchIdx] = depthScore(depth, score, hammingDist + lastEndHamming);
+        idx2depthScore[curMatchIdx] = depthScore(1, score - receivedScore, lastEndHamming);
         return depthScore(depth, score, hammingDist + lastEndHamming);
     } else { // not a leaf node
         uint8_t lastEndHamming = (matches[curMatchIdx]->rightEndHamming >> 14);
@@ -709,26 +747,25 @@ depthScore Taxonomer::DFS(const vector &matches,
         }
         for (auto &nextMatchIdx: linkedMatches.at(curMatchIdx)) {
             used.insert(nextMatchIdx);
-            // Reuse the depth score of nextMatchIdx if it has been calculated
-            if (idx2depthScore.find(nextMatchIdx) != idx2depthScore.end()) {
+            if (idx2depthScore.find(nextMatchIdx) != idx2depthScore.end()){
                 returnDepthScore = idx2depthScore[nextMatchIdx];
-                if (returnDepthScore.score > bestDepthScore.score
-                    && returnDepthScore.depth > MIN_DEPTH) {
-                    bestDepthScore = returnDepthScore;
-                    edges[matches[curMatchIdx]] = matches[nextMatchIdx];
-                }
-                continue;
+                curDepthScore = depthScore(returnDepthScore.depth + depth,
+                                           returnDepthScore.score + score,
+                                           returnDepthScore.hammingDist + hammingDist + lastEndHamming);
+            } else {
+                curDepthScore = DFS(matches, nextMatchIdx, linkedMatches, depth, MIN_DEPTH, used, idx2depthScore, edges, score, hammingDist + lastEndHamming);
             }
-            returnDepthScore = DFS(matches, nextMatchIdx, linkedMatches, depth, MIN_DEPTH, used, idx2depthScore, edges, score, hammingDist + lastEndHamming);
-            if (returnDepthScore.score > bestDepthScore.score
-                && returnDepthScore.depth > MIN_DEPTH) {
-                bestDepthScore = returnDepthScore;
+            if (curDepthScore.score > bestDepthScore.score
+                && curDepthScore.depth > MIN_DEPTH) {
+                bestDepthScore = curDepthScore;
                 edges[matches[curMatchIdx]] = matches[nextMatchIdx];
-            }
+            }
         }
         if (bestDepthScore.depth > MIN_DEPTH) {
-            idx2depthScore[curMatchIdx] = bestDepthScore;
+            idx2depthScore[curMatchIdx] = depthScore(bestDepthScore.depth - depth + 1,
+                                                     bestDepthScore.score - receivedScore,
+                                                     bestDepthScore.hammingDist - hammingDist);
         }
     }
     return bestDepthScore;
diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h
index ee131df1..f5810e43 100644
--- a/src/commons/Taxonomer.h
+++ b/src/commons/Taxonomer.h
@@ -5,6 +5,7 @@
 #include "Match.h"
 #include "common.h"
 #include "BitManipulateMacros.h"
+#include
 #include
 
 using namespace std;
@@ -102,6 +103,10 @@ class Taxonomer {
 
     void trimMatchPath(MatchPath & path1, const MatchPath & path2);
 
+    void filterRedundantMatches(vector & matchPaths,
+                                vector & filteredMatches,
+                                unordered_map & taxCnt);
+
     depthScore DFS(const vector &matches, size_t curMatchIdx, const map> &linkedMatches,
                    size_t depth, size_t MIN_DEPTH, unordered_set &used,
@@ -124,10 +129,10 @@ class Taxonomer {
     TaxonScore
getBestGenusMatches(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, int readLength1, int readLength2); - TaxonScore getBestSpeciesMatches(vector &matchesForMajorityLCA, const Match *matchList, size_t end, + TaxonScore getBestSpeciesMatches(vector &speciesMatches, const Match *matchList, size_t end, size_t offset, int queryLength); - TaxonScore getBestSpeciesMatches(vector &matchesForMajorityLCA, const Match *matchList, size_t end, + TaxonScore getBestSpeciesMatches(vector &speciesMatches, const Match *matchList, size_t end, size_t offset, int readLength1, int readLength2); // TaxonScore getBestGenusMatches_spaced(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, @@ -171,7 +176,7 @@ class Taxonomer { int queryLength, int queryLength2); - TaxID lowerRankClassification(vector &matches, TaxID speciesID); + TaxID lowerRankClassification(const unordered_map & matches, TaxID speciesID); void getSpeciesCladeCounts(const unordered_map & taxCnt, unordered_map & cladeCnt, From a0a2f31ca2613cc90e349ad5c2ac5d9e480f309d Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Sun, 12 Nov 2023 16:51:31 +0900 Subject: [PATCH 61/65] removed a bug in filterRedundantMatches --- src/commons/Taxonomer.cpp | 31 +++++++++++-------------------- src/commons/Taxonomer.h | 9 ++++----- 2 files changed, 15 insertions(+), 25 deletions(-) diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index 4b0f230f..bac23385 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -133,9 +133,9 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, } // Filter redundant matches - vector filteredMatches; - unordered_map taxCnt; - filterRedundantMatches(speciesMatches, filteredMatches, taxCnt); + // vector filteredMatches; + // cout << "# " << currentQuery << " " << queryList[currentQuery].name << " filtered" << endl; + filterRedundantMatches(speciesMatches, queryList[currentQuery].taxCnt); // If score is not enough, classify to the parent of the selected species if (speciesScore.score < par.minSpScore) { @@ -145,19 +145,11 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, queryList[currentQuery].score = speciesScore.score; queryList[currentQuery].coverage = speciesScore.coverage; queryList[currentQuery].hammingDist = speciesScore.hammingDist; - for (auto spMatch : filteredMatches) { - queryList[currentQuery].taxCnt[spMatch->targetId]++; - } return; } // Lower rank classification - TaxID result = lowerRankClassification(taxCnt, speciesScore.taxId); - - // Record matches of selected species - for (auto & spMatch : filteredMatches) { - queryList[currentQuery].taxCnt[spMatch->targetId]++; - } + TaxID result = lowerRankClassification(queryList[currentQuery].taxCnt, speciesScore.taxId); // Store classification results queryList[currentQuery].isClassified = true; @@ -180,16 +172,14 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, } void Taxonomer::filterRedundantMatches(vector & speciesMatches, - vector & filteredMatches, - unordered_map & taxCnt) { - filteredMatches.reserve(speciesMatches.size()); + map & taxCnt) { // Sort matches by the coordinate on the query sort(speciesMatches.begin(), speciesMatches.end(), [](const Match * a, const Match * b) { return a->qInfo.pos < b->qInfo.pos; }); // Remove redundant matches size_t matchNum = speciesMatches.size(); - for (size_t i = 0; i < matchNum; i++) { + for (size_t i = 0; i < matchNum;) { size_t currQuotient = speciesMatches[i]->qInfo.pos / 3; uint8_t minHamming = 
speciesMatches[i]->hamming; const Match * minHammingMatch = speciesMatches[i]; @@ -204,12 +194,13 @@ void Taxonomer::filterRedundantMatches(vector & speciesMatches, } i++; } - filteredMatches.push_back(&*minHammingMatch); + // cout << minHammingMatch->targetId << " " << minHammingMatch->qInfo.frame << " " << minHammingMatch->qInfo.pos << " " << int(minHammingMatch->hamming) << " " << int(minHammingMatch->redundancy) << endl; + taxCnt[minHammingTaxId]++; } } -TaxID Taxonomer::lowerRankClassification(const unordered_map & taxCnt, TaxID spTaxId) { +TaxID Taxonomer::lowerRankClassification(const map & taxCnt, TaxID spTaxId) { // size_t matchNum = matches.size(); // for (size_t i = 0; i < matchNum; i++) { // // cout << matches[i].targetId << endl; @@ -252,7 +243,7 @@ TaxID Taxonomer::lowerRankClassification(const unordered_map &taxCnt, +void Taxonomer::getSpeciesCladeCounts(const map &taxCnt, unordered_map & cladeCount, TaxID speciesTaxID) { for (auto it = taxCnt.begin(); it != taxCnt.end(); ++it) { @@ -586,7 +577,7 @@ void Taxonomer::trimMatchPath(MatchPath & path1, const MatchPath & path2) { } else { path1.score += 2.0f - 0.5f * lastEndHamming; } - // path1.matches.pop_back(); + path1.matches.pop_back(); } else { path1.start = path2.end + 1; uint8_t lastEndHamming = GET_2_BITS(path1.matches.front()->rightEndHamming >> 14); diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h index f5810e43..23f23b3b 100644 --- a/src/commons/Taxonomer.h +++ b/src/commons/Taxonomer.h @@ -103,9 +103,8 @@ class Taxonomer { void trimMatchPath(MatchPath & path1, const MatchPath & path2); - void filterRedundantMatches(vector & matchPaths, - vector & filteredMatches, - unordered_map & taxCnt); + void filterRedundantMatches(vector & speciesMatches, + map & taxCnt); depthScore DFS(const vector &matches, size_t curMatchIdx, const map> &linkedMatches, @@ -176,9 +175,9 @@ class Taxonomer { int queryLength, int queryLength2); - TaxID lowerRankClassification(const unordered_map & matches, TaxID speciesID); + TaxID lowerRankClassification(const map & matches, TaxID speciesID); - void getSpeciesCladeCounts(const unordered_map & taxCnt, + void getSpeciesCladeCounts(const map & taxCnt, unordered_map & cladeCnt, TaxID spciesID); From eae409a9ec8e1314cd791d934dce25d7fc4445ac Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Mon, 13 Nov 2023 14:46:57 +0900 Subject: [PATCH 62/65] an error found in combineMatchPaths() --- src/commons/KmerMatcher.cpp | 2 +- src/commons/LocalParameters.cpp | 2 +- src/commons/Taxonomer.cpp | 56 +++++++++++++++++++-------------- src/commons/Taxonomer.h | 10 ++---- src/workflow/classify.cpp | 1 - 5 files changed, 36 insertions(+), 35 deletions(-) diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp index 084b919b..248539fd 100644 --- a/src/commons/KmerMatcher.cpp +++ b/src/commons/KmerMatcher.cpp @@ -513,7 +513,7 @@ void KmerMatcher::compareDna(uint64_t query, // Select target k-mers that passed hamming criteria for (size_t h = 0; h < size; h++) { - if (hammingSums[h] <= min(minHammingSum * 2, 6)) { + if (hammingSums[h] <= min(minHammingSum * 2, 8)) { selectedMatches.push_back(h); selectedHammingSum.push_back(hammingSums[h]); if (frame < 3) { diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index dc9ffe04..d4f8c2e9 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -307,7 +307,7 @@ LocalParameters::LocalParameters() : classify.push_back(&RAM_USAGE); classify.push_back(&MATCH_PER_KMER); 
classify.push_back(&ACCESSION_LEVEL); - classify.push_back(&MIN_SS_MATCH); + // classify.push_back(&MIN_SS_MATCH); // filter filter.push_back(&PARAM_THREADS); diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index bac23385..ac2ad621 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -84,12 +84,12 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, vector & queryList, const LocalParameters &par) { -// if (true) { -// cout << "# " << currentQuery << " " << queryList[currentQuery].name << endl; -// for (size_t i = offset; i < end + 1; i++) { -// cout << matchList[i].targetId << " " << matchList[i].qInfo.frame << " " << matchList[i].qInfo.pos << " " << int(matchList[i].hamming) << " " << int(matchList[i].redundancy) << endl; -// } -// } + if (true) { + cout << "# " << currentQuery << " " << queryList[currentQuery].name << endl; + for (size_t i = offset; i < end + 1; i++) { + cout << matchList[i].targetId << " " << matchList[i].qInfo.frame << " " << matchList[i].qInfo.pos << " " << int(matchList[i].hamming) << " " << int(matchList[i].redundancy) << endl; + } + } // Get the best species for current query vector speciesMatches; speciesMatches.reserve(end - offset + 1); @@ -149,7 +149,9 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, } // Lower rank classification - TaxID result = lowerRankClassification(queryList[currentQuery].taxCnt, speciesScore.taxId); + TaxID result = lowerRankClassification(queryList[currentQuery].taxCnt, + speciesScore.taxId, + queryList[currentQuery].queryLength + queryList[currentQuery].queryLength2); // Store classification results queryList[currentQuery].isClassified = true; @@ -200,7 +202,7 @@ void Taxonomer::filterRedundantMatches(vector & speciesMatches, } } -TaxID Taxonomer::lowerRankClassification(const map & taxCnt, TaxID spTaxId) { +TaxID Taxonomer::lowerRankClassification(const map & taxCnt, TaxID spTaxId, int queryLength) { // size_t matchNum = matches.size(); // for (size_t i = 0; i < matchNum; i++) { // // cout << matches[i].targetId << endl; @@ -223,7 +225,7 @@ TaxID Taxonomer::lowerRankClassification(const map & taxCnt, TaxID s // } // taxCnt[minHammingTaxId]++; // } - + unsigned int maxCnt = (queryLength - 1)/100 + 1; unordered_map cladeCnt; getSpeciesCladeCounts(taxCnt, cladeCnt, spTaxId); if (accessionLevel == 2) { // Don't do accession-level classification @@ -237,9 +239,9 @@ TaxID Taxonomer::lowerRankClassification(const map & taxCnt, TaxID s it->first)); } } - return BFS(cladeCnt, spTaxId); + return BFS(cladeCnt, spTaxId, maxCnt); } else { - return BFS(cladeCnt, spTaxId); + return BFS(cladeCnt, spTaxId, maxCnt); } } @@ -262,11 +264,11 @@ void Taxonomer::getSpeciesCladeCounts(const map &taxCnt, } } -TaxID Taxonomer::BFS(const unordered_map & cladeCnt, TaxID root) { +TaxID Taxonomer::BFS(const unordered_map & cladeCnt, TaxID root, unsigned int maxCnt) { + unsigned int maxCnt2 = maxCnt; if (cladeCnt.at(root).children.empty()) { // root is a leaf return root; } - unsigned int maxCnt = minSSMatch; unsigned int currentCnt; vector bestChildren; for (auto it = cladeCnt.at(root).children.begin(); it != cladeCnt.at(root).children.end(); it++) { @@ -280,7 +282,7 @@ TaxID Taxonomer::BFS(const unordered_map & cladeCnt, TaxID r } } if (bestChildren.size() == 1) { - return BFS(cladeCnt, bestChildren[0]); + return BFS(cladeCnt, bestChildren[0], maxCnt2); } else { return root; } @@ -346,20 +348,27 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector & speciesMatch } // More than one species --> LCA + float 
coveredLength = 0.f; if (maxSpecies.size() > 1) { bestScore.LCA = true; bestScore.taxId = taxonomy->LCA(maxSpecies)->taxId; for (auto & sp : maxSpecies) { bestScore.score += species2score[sp]; + coveredLength = 0; + for (auto & matchPath : species2matchPaths[maxSpecies[0]]) { + coveredLength += matchPath.end - matchPath.start + 1; + } + bestScore.coverage += coveredLength / queryLength; } bestScore.score /= maxSpecies.size(); + bestScore.coverage /= maxSpecies.size(); return bestScore; } + // One species bestScore.taxId = maxSpecies[0]; bestScore.score = species2score[maxSpecies[0]]; - float coveredLength = 0.f; int hammingDist = 0; for (auto & matchPath : species2matchPaths[maxSpecies[0]]) { coveredLength += matchPath.end - matchPath.start + 1; @@ -438,20 +447,27 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector & speciesMatch } // More than one species --> LCA + float coveredLength = 0.f; if (maxSpecies.size() > 1) { bestScore.LCA = true; bestScore.taxId = taxonomy->LCA(maxSpecies)->taxId; for (auto & sp : maxSpecies) { bestScore.score += species2score[sp]; + coveredLength = 0; + for (auto & matchPath : species2matchPaths[maxSpecies[0]]) { + coveredLength += matchPath.end - matchPath.start + 1; + } + bestScore.coverage += coveredLength / (readLength1 + readLength2); } bestScore.score /= maxSpecies.size(); + bestScore.coverage /= maxSpecies.size(); return bestScore; } // One species bestScore.taxId = maxSpecies[0]; bestScore.score = species2score[maxSpecies[0]]; - float coveredLength = 0.f; + int hammingDist = 0; for (auto & matchPath : species2matchPaths[maxSpecies[0]]) { coveredLength += matchPath.end - matchPath.start + 1; @@ -698,14 +714,6 @@ void Taxonomer::remainConsecutiveMatches(const vector & curFrameM } } } - -// if (par.printLog) { -// cout << "filteredMatchIdx: "; -// for (auto &idx: filteredMatchIdx) { -// cout << idx << " "; -// } -// cout << endl; -// } } depthScore Taxonomer::DFS(const vector &matches, diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h index 23f23b3b..0d00e77d 100644 --- a/src/commons/Taxonomer.h +++ b/src/commons/Taxonomer.h @@ -111,12 +111,6 @@ class Taxonomer { size_t depth, size_t MIN_DEPTH, unordered_set &used, unordered_map &idx2depthScore, unordered_map & edges, float score, int hammingDist); - // depthScore DFS(const vector & curFrameMatches, - // size_t curMatchIdx, - // const map>& linkedMatches, - // size_t depth, size_t MIN_DEPTH, unordered_set& used, - // unordered_map & idx2depth, - // size_t startPos, vector & matchPaths); static bool isConsecutive(const Match * match1, const Match * match2); @@ -175,13 +169,13 @@ class Taxonomer { int queryLength, int queryLength2); - TaxID lowerRankClassification(const map & matches, TaxID speciesID); + TaxID lowerRankClassification(const map & matches, TaxID speciesID, int queryLength); void getSpeciesCladeCounts(const map & taxCnt, unordered_map & cladeCnt, TaxID spciesID); - TaxID BFS(const unordered_map & cladeCnt, TaxID root); + TaxID BFS(const unordered_map & cladeCnt, TaxID root, unsigned int maxCnt); // Getters unordered_map & getTaxCounts() { return taxCounts; } diff --git a/src/workflow/classify.cpp b/src/workflow/classify.cpp index 0ce445ff..460bab34 100644 --- a/src/workflow/classify.cpp +++ b/src/workflow/classify.cpp @@ -25,7 +25,6 @@ void setClassifyDefaults(LocalParameters & par){ par.maskProb = 0.9; par.matchPerKmer = 4; par.accessionLevel = 0; - par.minSSMatch = 3; } int classify(int argc, const char **argv, const Command& command) From 
1e50b1e603f2518854a3fe10af3503beed0d1741 Mon Sep 17 00:00:00 2001
From: Jaebeom Kim
Date: Mon, 13 Nov 2023 17:22:59 +0900
Subject: [PATCH 63/65] 1. Improved combineMatchPaths() 2. Undo changes in getExtendedORFs

---
 src/commons/Match.h         |  36 ++++++++--
 src/commons/SeqIterator.cpp | 130 ++++++++++++++++++------------------
 src/commons/Taxonomer.cpp   |  57 +++++++++-------
 src/commons/Taxonomer.h     |   2 +-
 4 files changed, 132 insertions(+), 93 deletions(-)

diff --git a/src/commons/Match.h b/src/commons/Match.h
index 9bc1fb44..d2a02846 100644
--- a/src/commons/Match.h
+++ b/src/commons/Match.h
@@ -29,8 +29,8 @@ struct Match { // 20 byte
               << targetId << " " << speciesId << " " << rightEndHamming << " " << (int)hamming << " " << getScore() << std::endl;
     }
 
-    float getScore(float score = 0.0f, int cnt = 0) const {
-        int currentHamming = GET_2_BITS(rightEndHamming >> cnt * 2);
+    float getScore(float score = 0.0f, int cnt = 0) const {
+        int currentHamming = GET_2_BITS(rightEndHamming >> (cnt * 2));
         if (currentHamming == 0) {
             score += 3.0f;
         } else {
@@ -39,9 +39,37 @@ struct Match { // 20 byte
         if (cnt == 7) {
             return score;
         } else {
-            return getScore(score, cnt + 1);
+            return getScore(score, cnt + 1);
         }
     }
+
+    // From the fields 87654321, we want to know 678
+    float getRightPartScore(const int range, float score = 0.0f, int cnt = 0) const {
+        if (cnt == range) {
+            return score;
+        }
+        int currentHamming = GET_2_BITS(rightEndHamming >> (14 - cnt * 2));
+        if (currentHamming == 0) {
+            score += 3.0f;
+        } else {
+            score += 2.0f - 0.5f * currentHamming;
+        }
+        return getRightPartScore(range, score, cnt + 1);
+    }
+
+    // 87654321
+    float getLeftPartScore(const int range, float score = 0.0f, int cnt = 0) const {
+        if (cnt == range) {
+            return score;
+        }
+        int currentHamming = GET_2_BITS(rightEndHamming >> (cnt * 2));
+        if (currentHamming == 0) {
+            score += 3.0f;
+        } else {
+            score += 2.0f - 0.5f * currentHamming;
+        }
+        return getLeftPartScore(range, score, cnt + 1);
+    }
 };
 
-#endif //ADCLASSIFIER2_MATCH_H
+#endif //ADCLASSIFIER2_MATCH_H
\ No newline at end of file
diff --git a/src/commons/SeqIterator.cpp b/src/commons/SeqIterator.cpp
index 6d4cdd24..19fa8f7d 100644
--- a/src/commons/SeqIterator.cpp
+++ b/src/commons/SeqIterator.cpp
@@ -628,76 +628,78 @@ void SeqIterator::getExtendedORFs(struct _gene *genes, struct _node *nodes, vect
         }
     }
 
-    // For the last gene
-    // Extend to the end of the genome
-    isReverse = !(nodes[genes[numOfGene - 1].start_ndx].strand == 1);
-    rightEnd = length - 1;
-    if (isReverse) {
-        frame = (genes[numOfGene - 1].end - 1) % 3;
-        while (rightEnd % 3 != frame) rightEnd--;
-    }
-    // If left region is not covered, cover it.
-    leftEnd = genes[numOfGene - 1].begin - 1;
-    if (hasBeenExtendedToLeft) {
-        leftEnd = genes[numOfGene - 2].end - 1 - 22;
-        if (!isReverse) {
+    // // For the last gene
+    // // Extend to the end of the genome
+    // isReverse = !(nodes[genes[numOfGene - 1].start_ndx].strand == 1);
+    // rightEnd = length - 1;
+    // if (isReverse) {
+    //     frame = (genes[numOfGene - 1].end - 1) % 3;
+    //     while (rightEnd % 3 != frame) rightEnd--;
+    // }
+    // // If left region is not covered, cover it.
+    // leftEnd = genes[numOfGene - 1].begin - 1;
+    // if (hasBeenExtendedToLeft) {
+    //     leftEnd = genes[numOfGene - 2].end - 1 - 22;
+    //     if (!isReverse) {
+    //         frame = (genes[numOfGene - 1].begin - 1) % 3;
+    //         while (leftEnd % 3 != frame) leftEnd++;
+    //     }
+    // }
+    // blocks.emplace_back(leftEnd, rightEnd, isReverse ?
+    // if (find(intergenicKmerList.begin(), intergenicKmerList.end(), rightKmerHash) == intergenicKmerList.end()) {
+    //     intergenicKmerList.push_back(rightKmerHash);
+    // }
+
+    //For the last gene
+    if (find(intergenicKmerList.begin(), intergenicKmerList.end(), leftKmerHash) !=
+        intergenicKmerList.end()) { //extension to left
+        if (!isReverse) { //forward
+            frame = (genes[numOfGene - 1].begin - 1) % 3;
+            leftEnd = genes[numOfGene - 2].end - 1 - 22;
+            while (leftEnd % 3 != frame) leftEnd++;
+            blocks.emplace_back(leftEnd, length - 1, 1);
+            blockIdx++;
+        } else { // reverse
+            frame = (genes[numOfGene - 1].end - 1) % 3;
+            rightEnd = length - 1;
+            while (rightEnd % 3 != frame) rightEnd--;
+            blocks.emplace_back(genes[numOfGene - 2].end - 22 - 1, rightEnd, -1);
+            blockIdx++;
+        }
+    } else { //extension to right
+        if (hasBeenExtendedToLeft) {
+            if (!isReverse) { //forward
+                frame = (genes[numOfGene - 1].begin - 1) % 3;
+                leftEnd = genes[numOfGene - 2].end - 1 - 22;
+                while (leftEnd % 3 != frame) leftEnd++;
+                blocks.emplace_back(leftEnd, length - 1, 1);
+                blockIdx++;
+            } else {
+                frame = (genes[numOfGene - 1].end - 1) % 3;
+                rightEnd = length - 1;
+                while (rightEnd % 3 != frame) rightEnd--;
+                blocks.emplace_back(genes[numOfGene - 2].end - 22 - 1, rightEnd, -1);
+                blockIdx++;
+            }
+        } else {
+            if (!isReverse) {
+                blocks.emplace_back(genes[numOfGene - 1].begin, length - 1, 1);
+                blockIdx++;
+            } else {
+                frame = (genes[numOfGene - 1].end - 1) % 3;
+                rightEnd = length - 1;
+                while (rightEnd % 3 != frame) rightEnd--;
+                blocks.emplace_back(genes[numOfGene - 1].begin - 1, rightEnd, -1);
+                blockIdx++;
+            }
+        }
+
+        //If the current intergenic sequence is new, update intergenicKmerList.
+        if (find(intergenicKmerList.begin(), intergenicKmerList.end(), rightKmerHash) == intergenicKmerList.end()) {
+            intergenicKmerList.push_back(rightKmerHash);
+        }
+    }
 
-    // if (find(intergenicKmerList.begin(), intergenicKmerList.end(), leftKmerHash) !=
-    //     intergenicKmerList.end()) { //extension to left
-    //     if (!isReverse) { //forward
-    //         frame = (genes[numOfGene - 1].begin - 1) % 3;
-    //         leftEnd = genes[numOfGene - 2].end - 1 - 22;
-    //         while (leftEnd % 3 != frame) leftEnd++;
-    //         blocks.emplace_back(leftEnd, length - 1, 1);
-    //         blockIdx++;
-    //     } else { // reverse
-    //         frame = (genes[numOfGene - 1].end - 1) % 3;
-    //         rightEnd = length - 1;
-    //         while (rightEnd % 3 != frame) rightEnd--;
-    //         blocks.emplace_back(genes[numOfGene - 2].end - 22 - 1, rightEnd, -1);
-    //         blockIdx++;
-    //     }
-    // } else { //extension to right
-    //     if (hasBeenExtendedToLeft) {
-    //         if (!isReverse) { //forward
-    //             frame = (genes[numOfGene - 1].begin - 1) % 3;
-    //             leftEnd = genes[numOfGene - 2].end - 1 - 22;
-    //             while (leftEnd % 3 != frame) leftEnd++;
-    //             blocks.emplace_back(leftEnd, length - 1, 1);
-    //             blockIdx++;
-    //         } else {
-    //             frame = (genes[numOfGene - 1].end - 1) % 3;
-    //             rightEnd = length - 1;
-    //             while (rightEnd % 3 != frame) rightEnd--;
-    //             blocks.emplace_back(genes[numOfGene - 2].end - 22 - 1, rightEnd, -1);
-    //             blockIdx++;
-    //         }
-    //     } else {
-    //         if (!isReverse) {
-    //             blocks.emplace_back(genes[numOfGene - 1].begin, length - 1, 1);
-    //             blockIdx++;
-    //         } else {
-    //             frame = (genes[numOfGene - 1].end - 1) % 3;
-    //             rightEnd = length - 1;
-    //             while (rightEnd % 3 != frame) rightEnd--;
-    //             blocks.emplace_back(genes[numOfGene - 1].begin - 1, rightEnd, -1);
-    //             blockIdx++;
-    //         }
-    //     }
-
-    //     //If current intergenic sequences is new, update intergenicKmerList.
-    //     if (find(intergenicKmerList.begin(), intergenicKmerList.end(), rightKmerHash) == intergenicKmerList.end()) {
-    //         intergenicKmerList.push_back(rightKmerHash);
-    //     }
-    // }
 
     free(newIntergenicKmer);
     free(leftKmer);
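The restored getExtendedORFs() logic above repeatedly snaps an extension boundary onto the reading frame of a predicted gene via `while (pos % 3 != frame) pos++` (or `pos--`). A tiny illustration of that idiom (the helper names are ours, not the source's):

    // Move a boundary to the nearest position in the gene's reading frame.
    int snapDownToFrame(int pos, int frame) {   // used for right ends (reverse strand)
        while (pos % 3 != frame) pos--;
        return pos;
    }
    int snapUpToFrame(int pos, int frame) {     // used for left ends (forward strand)
        while (pos % 3 != frame) pos++;
        return pos;
    }
    // e.g. a 1235 nt contig whose last gene ends at 1000:
    // frame = (1000 - 1) % 3 = 0; rightEnd = 1234 is snapped down to 1233.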
diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp
index ac2ad621..5ced0f7e 100644
--- a/src/commons/Taxonomer.cpp
+++ b/src/commons/Taxonomer.cpp
@@ -84,12 +84,12 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery,
                                 vector & queryList,
                                 const LocalParameters &par) {
-    if (true) {
-        cout << "# " << currentQuery << " " << queryList[currentQuery].name << endl;
-        for (size_t i = offset; i < end + 1; i++) {
-            cout << matchList[i].targetId << " " << matchList[i].qInfo.frame << " " << matchList[i].qInfo.pos << " " << int(matchList[i].hamming) << " " << int(matchList[i].redundancy) << endl;
-        }
-    }
+//    if (true) {
+//        cout << "# " << currentQuery << " " << queryList[currentQuery].name << endl;
+//        for (size_t i = offset; i < end + 1; i++) {
+//            cout << matchList[i].targetId << " " << matchList[i].qInfo.frame << " " << matchList[i].qInfo.pos << " " << int(matchList[i].hamming) << " " << int(matchList[i].redundancy) << endl;
+//        }
+//    }
 
     // Get the best species for current query
     vector speciesMatches;
     speciesMatches.reserve(end - offset + 1);
@@ -504,14 +504,23 @@ float Taxonomer::combineMatchPaths(vector & matchPaths,
         bool isOverlapped = false;
         for (size_t j = 0; j < combinedMatchPaths.size(); j++) {
             if (isMatchPathOverlapped(matchPaths[i], combinedMatchPaths[j])) { // overlap!
-                if (isMatchPathLinked(matchPaths[i], combinedMatchPaths[j])) {
-                    // merge two linked matchPaths by editing the combinedMatchPaths[j]
-                    trimMatchPath(matchPaths[i], combinedMatchPaths[j]);
+                int overlappedLength = min(matchPaths[i].end, combinedMatchPaths[j].end)
+                                       - max(matchPaths[i].start, combinedMatchPaths[j].start) + 1;
+                if (overlappedLength < 24) {
+                    trimMatchPath(matchPaths[i], combinedMatchPaths[j], overlappedLength);
                     continue;
                 } else {
                     isOverlapped = true;
                     break;
                 }
+                // if (isMatchPathLinked(matchPaths[i], combinedMatchPaths[j])) {
+                //     // merge two linked matchPaths by editing the combinedMatchPaths[j]
+                //     trimMatchPath(matchPaths[i], combinedMatchPaths[j]);
+                //     continue;
+                // } else {
+                //     isOverlapped = true;
+                //     break;
+                // }
             }
         }
         if (!isOverlapped) {
@@ -581,29 +590,29 @@ void Taxonomer::mergeMatchPaths(const MatchPath & source, MatchPath & target) {
     }
 }
 
-void Taxonomer::trimMatchPath(MatchPath & path1, const MatchPath & path2) {
-    int margin = min(path1.end, path2.end) - max(path1.start, path2.start) + 1 - 21;
+void Taxonomer::trimMatchPath(MatchPath & path1, const MatchPath & path2, int overlapLength) {
+    // int margin = min(path1.end, path2.end) - max(path1.start, path2.start) + 1 - 21;
     if (path1.start < path2.start) {
         path1.end = path2.start - 1;
         uint8_t lastEndHamming = GET_2_BITS(path1.matches.back()->rightEndHamming);
         path1.hammingDist = path1.hammingDist - (path1.matches.back()->hamming - lastEndHamming);
-        path1.score = path1.score - path1.matches.back()->getScore() - margin;
+        path1.score = path1.score - path1.matches.back()->getRightPartScore(overlapLength/3) - (overlapLength % 3);
-        if (lastEndHamming == 0) {
-            path1.score += 3.0f;
-        } else {
-            path1.score += 2.0f - 0.5f * lastEndHamming;
-        }
-        path1.matches.pop_back();
+        // if (lastEndHamming == 0) {
+        //     path1.score += 3.0f;
+        // } else {
+        //     path1.score += 2.0f - 0.5f * lastEndHamming;
+        // }
+        // path1.matches.pop_back(); // unnecessary without checking isLinked
    } else {
         path1.start = path2.end + 1;
         uint8_t lastEndHamming = GET_2_BITS(path1.matches.front()->rightEndHamming >> 14);
         path1.hammingDist = path1.hammingDist - (path1.matches.front()->hamming - lastEndHamming);
-        path1.score = path1.score - path1.matches.front()->getScore() - margin;
+        path1.score = path1.score - path1.matches.front()->getLeftPartScore(overlapLength/3) - (overlapLength % 3);
-        if (lastEndHamming == 0) {
-            path1.score += 3.0f;
-        } else {
-            path1.score += 2.0f - 0.5f * lastEndHamming;
-        }
+        // if (lastEndHamming == 0) {
+        //     path1.score += 3.0f;
+        // } else {
+        //     path1.score += 2.0f - 0.5f * lastEndHamming;
+        // }
         // path1.matches.erase(path1.matches.begin());
     }
 }
diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h
index 0d00e77d..70211255 100644
--- a/src/commons/Taxonomer.h
+++ b/src/commons/Taxonomer.h
@@ -101,7 +101,7 @@ class Taxonomer {
 
     void mergeMatchPaths(const MatchPath & source, MatchPath & target);
 
-    void trimMatchPath(MatchPath & path1, const MatchPath & path2);
+    void trimMatchPath(MatchPath & path1, const MatchPath & path2, int overlapLength);
 
     void filterRedundantMatches(vector & speciesMatches,
                                 map & taxCnt);
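With this change, trimming an overlapped path subtracts only the score of the overlapped part: `overlapLength/3` of the packed per-position codes are re-scored through `getRightPartScore`/`getLeftPartScore`, and the `overlapLength % 3` leftover nucleotides are charged one point each. An iterative sketch of the 2-bit decoding those helpers perform, assuming the eight per-position Hamming codes are packed into `rightEndHamming` the way `getScore()` implies:

    #include <cstdint>

    #define GET_2_BITS(x) ((x) & 0x3)   // assumed equivalent of the source macro

    // Mirrors getLeftPartScore: score `range` codes from the low end.
    float leftPartScore(uint16_t rightEndHamming, int range) {
        float score = 0.0f;
        for (int cnt = 0; cnt < range; ++cnt) {
            int d = GET_2_BITS(rightEndHamming >> (cnt * 2));
            score += (d == 0) ? 3.0f : 2.0f - 0.5f * d;   // 3 for exact, else 2 - 0.5 * dist
        }
        return score;
    }

    // Mirrors getRightPartScore: score `range` codes from the high end.
    float rightPartScore(uint16_t rightEndHamming, int range) {
        float score = 0.0f;
        for (int cnt = 0; cnt < range; ++cnt) {
            int d = GET_2_BITS(rightEndHamming >> (14 - cnt * 2));
            score += (d == 0) ? 3.0f : 2.0f - 0.5f * d;
        }
        return score;
    }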
From c81d4265474ad45b635eae819c8ad6f360a4c0c9 Mon Sep 17 00:00:00 2001
From: Jaebeom Kim
Date: Mon, 13 Nov 2023 19:24:15 +0900
Subject: [PATCH 64/65] Ignore k-mer matches with DNA sequence identity < 70%

---
 src/commons/KmerMatcher.cpp |  2 +-
 src/commons/Match.h         |  2 --
 src/commons/Taxonomer.cpp   | 32 +++++++-------------------------
 src/commons/Taxonomer.h     |  5 ++++-
 4 files changed, 12 insertions(+), 29 deletions(-)

diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp
index 248539fd..c7331405 100644
--- a/src/commons/KmerMatcher.cpp
+++ b/src/commons/KmerMatcher.cpp
@@ -513,7 +513,7 @@ void KmerMatcher::compareDna(uint64_t query,
 
     // Select target k-mers that passed hamming criteria
     for (size_t h = 0; h < size; h++) {
-        if (hammingSums[h] <= min(minHammingSum * 2, 8)) {
+        if (hammingSums[h] <= min(minHammingSum * 2, 7)) {
             selectedMatches.push_back(h);
             selectedHammingSum.push_back(hammingSums[h]);
             if (frame < 3) {
diff --git a/src/commons/Match.h b/src/commons/Match.h
index d2a02846..48950664 100644
--- a/src/commons/Match.h
+++ b/src/commons/Match.h
@@ -43,7 +43,6 @@ struct Match { // 20 byte
         }
     }
 
-    // From 87654321, we want to know 678
     float getRightPartScore(const int range, float score = 0.0f, int cnt = 0) const {
         if (cnt == range) {
             return score;
@@ -57,7 +56,6 @@ struct Match { // 20 byte
         return getRightPartScore(range, score, cnt + 1);
     }
 
-    // 87654321
     float getLeftPartScore(const int range, float score = 0.0f, int cnt = 0) const {
         if (cnt == range) {
             return score;
diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp
index 5ced0f7e..575663a9 100644
--- a/src/commons/Taxonomer.cpp
+++ b/src/commons/Taxonomer.cpp
@@ -27,6 +27,12 @@ Taxonomer::Taxonomer(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxon
     minConsCnt = par.minConsCnt;
     minConsCntEuk = par.minConsCntEuk;
     eukaryotaTaxId = par.eukaryotaTaxId;
+
+    if (par.seqMode == 1 || par.seqMode == 2) {
+        denominator = 100;
+    } else {
+        denominator = 1000;
+    }
 }
 
 Taxonomer::~Taxonomer() {
@@ -196,36 +202,12 @@ void Taxonomer::filterRedundantMatches(vector & speciesMatches,
             }
             i++;
         }
-        // cout << minHammingMatch->targetId << " " << minHammingMatch->qInfo.frame << " " << minHammingMatch->qInfo.pos << " " << int(minHammingMatch->hamming) << " " << int(minHammingMatch->redundancy) << endl;
         taxCnt[minHammingTaxId]++;
     }
 }
 
 TaxID Taxonomer::lowerRankClassification(const map & taxCnt, TaxID spTaxId, int queryLength) {
-    // size_t matchNum = matches.size();
-    // for (size_t i = 0; i < matchNum; i++) {
-    //     // cout << matches[i].targetId << endl;
-    //     // taxCnt[matches[i].targetId] ++;
-    //     size_t currQuotient = matches[i].qInfo.pos / 3;
-    //     uint8_t minHamming = matches[i].hamming;
-    //     Match * minHammingMatch = & matches[i];
-    //     TaxID minHammingTaxId = minHammingMatch->targetId;
-    //     while ((i < matchNum) && (currQuotient == matches[i].qInfo.pos / 3)) {
-    //         if (matches[i].hamming < minHamming) {
-    //             minHamming = matches[i].hamming;
-    //             minHammingMatch = & matches[i];
-    //             minHammingTaxId = minHammingMatch->targetId;
-    //         } else if (matches[i].hamming == minHamming) {
-    //             minHammingTaxId = taxonomy->LCA(minHammingTaxId, matches[i].targetId);
-    //             minHammingMatch->redundancy = true;
-    //             matches[i].redundancy = true;
-    //         }
-    //         i++;
-    //     }
-    //     taxCnt[minHammingTaxId]++;
-    // }
-    unsigned int maxCnt = (queryLength - 1)/100 + 1;
+    unsigned int maxCnt = (queryLength - 1)/denominator + 1;
     unordered_map cladeCnt;
     getSpeciesCladeCounts(taxCnt, cladeCnt, spTaxId);
     if (accessionLevel == 2) { // Don't do accession-level classification
diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h
index 70211255..6da92429 100644
--- a/src/commons/Taxonomer.h
+++ b/src/commons/Taxonomer.h
@@ -48,7 +48,7 @@ class Taxonomer {
     int unmaskedPos[9];
     int spaceNum;
 
-    // Parameters
+    // Parameters from user
     int maxGap;
     int minCoveredPos;
     int accessionLevel;
@@ -57,6 +57,9 @@ class Taxonomer {
     int minConsCntEuk;
     int eukaryotaTaxId;
 
+    // Internal
+    int denominator;
+
     struct MatchBlock {
         MatchBlock(size_t start, size_t end, int id) : start(start), end(end), id(id) {}
         MatchBlock() : start(0), end(0), id(0) {}
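The tightened cutoff in KmerMatcher.cpp is where the commit message's 70% figure comes from: a metamer spans 8 amino acids = 24 nt, so capping the summed Hamming distance at 7 keeps only matches with at least 17/24 ≈ 70.8% DNA identity, whereas the old cap of 8 still admitted 16/24 ≈ 66.7%. As a checked one-liner (treating the Hamming sum as the count of mismatched bases, which is how we read the source):

    // Identity floor implied by a Hamming-sum cap over a 24 nt metamer.
    constexpr double identityFloor(int cap) { return (24.0 - cap) / 24.0; }
    static_assert(identityFloor(7) > 0.70, "cap 7 keeps identity above 70%");
    static_assert(identityFloor(8) < 0.70, "cap 8 allowed matches below 70%");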
From 4f617158acf81cfc9944a2d93c580d7fb2a3a52f Mon Sep 17 00:00:00 2001
From: Jaebeom Kim
Date: Mon, 13 Nov 2023 23:37:15 +0900
Subject: [PATCH 65/65] solve merge conflict in mapping2taxon.cpp

---
 src/util/mapping2taxon.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/util/mapping2taxon.cpp b/src/util/mapping2taxon.cpp
index 543ad9fe..d7aedfdd 100644
--- a/src/util/mapping2taxon.cpp
+++ b/src/util/mapping2taxon.cpp
@@ -100,8 +100,4 @@ int mapping2taxon(int argc, const char **argv, const Command &command) {
     }
 
     return 0;
-<<<<<<< HEAD
 }
-=======
-}
->>>>>>> newScore
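For reference, the `denominator` introduced in patch 64 makes the `maxCnt` handed to `BFS()` scale with read length: roughly one count per 100 nt in the short-read modes (`--seq-mode` 1/2) and per 1000 nt otherwise. The integer form is a ceiling division; a sketch with illustrative values:

    // maxCnt = ceil(queryLength / denominator), in integer arithmetic.
    unsigned int maxCnt(int queryLength, int denominator) {
        return (queryLength - 1) / denominator + 1;
    }
    // e.g. a 150 nt short read:  (150 - 1) / 100 + 1  = 2
    //      a 5000 nt long read:  (5000 - 1) / 1000 + 1 = 5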