From cd80a7f76baa2786583c45ef5a56d7a86dba85b9 Mon Sep 17 00:00:00 2001
From: Jaebeom Kim
Date: Wed, 6 Dec 2023 16:30:32 +0900
Subject: [PATCH 1/5] build macOS first

---
 azure-pipelines.yml | 52 ++++++++++++++++++++++-----------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index d19f87b6..87a962d2 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -7,6 +7,32 @@ variables:
     regression: 1

 jobs:
+  - job: build_macos_11
+    displayName: macOS 11
+    pool:
+      vmImage: 'macos-11'
+    steps:
+      - checkout: self
+        submodules: true
+      - script: |
+          rustup update
+          rustup target install x86_64-apple-darwin
+          rustup target install aarch64-apple-darwin
+        displayName: Install Rust Toolchain
+      - script: |
+          cd ${BUILD_SOURCESDIRECTORY}
+          ./util/build_osx.sh . build metabuli
+        displayName: Build Metabuli
+      - script: |
+          cd ${BUILD_SOURCESDIRECTORY}
+          ./util/Metabuli-regression/run_regression.sh ./build/metabuli examples
+        displayName: Run Regression Suite
+        condition: eq(variables['regression'], 1)
+      - task: PublishPipelineArtifact@0
+        inputs:
+          targetPath: $(Build.SourcesDirectory)/build/metabuli
+          artifactName: metabuli-darwin-universal
+
   - job: build_ubuntu_2004
     displayName: Ubuntu 2004
     pool:
@@ -121,32 +147,6 @@ jobs:
           targetPath: $(Build.SourcesDirectory)/build/src/metabuli
           artifactName: metabuli-linux-$(SIMD)

-  - job: build_macos_11
-    displayName: macOS 11
-    pool:
-      vmImage: 'macos-11'
-    steps:
-      - checkout: self
-        submodules: true
-      - script: |
-          rustup update
-          rustup target install x86_64-apple-darwin
-          rustup target install aarch64-apple-darwin
-        displayName: Install Rust Toolchain
-      - script: |
-          cd ${BUILD_SOURCESDIRECTORY}
-          ./util/build_osx.sh . build metabuli
-        displayName: Build Metabuli
-      - script: |
-          cd ${BUILD_SOURCESDIRECTORY}
-          ./util/Metabuli-regression/run_regression.sh ./build/metabuli examples
-        displayName: Run Regression Suite
-        condition: eq(variables['regression'], 1)
-      - task: PublishPipelineArtifact@0
-        inputs:
-          targetPath: $(Build.SourcesDirectory)/build/metabuli
-          artifactName: metabuli-darwin-universal
-
   - job: upload_artifacts
     displayName: Upload Artifacts
     condition: and(succeeded(), ne(variables['Build.Reason'], 'PullRequest'))

From afd72db6b045d024282e030a44eb58cda90c52ed Mon Sep 17 00:00:00 2001
From: Jaebeom Kim
Date: Wed, 6 Dec 2023 17:54:10 +0900
Subject: [PATCH 2/5] update README, undo the change in azure-pipelines.yml

---
 README.md           | 20 +++++++++++------
 azure-pipelines.yml | 52 ++++++++++++++++++++++-----------------------
 2 files changed, 40 insertions(+), 32 deletions(-)

diff --git a/README.md b/README.md
index 38f8bf09..49d3525f 100644
--- a/README.md
+++ b/README.md
@@ -49,22 +49,26 @@ The built binary can be found in `./build/src`.

 ## Pre-built databases
 You can download [pre-built databases](https://metabuli.steineggerlab.workers.dev/) using `databases` workflow.
 ```
+Usage:
+metabuli databases DB_NAME OUTDIR tmp
+
 # RefSeq Complete/Chromosome (115.6 GiB)
 # - Complete Genome or Chromosome level assemblies of virus and prokaryotes in RefSeq (2023-04-04) and human genome (GRCh38.p14)
-metabuli databases RefSeq DBDIR tmp
+metabuli databases RefSeq OUTDIR tmp

 # RefSeq Releases 217 (480.5 GiB)
 # - Viral and prokaryotic genomes of RefSeq release 217 and human genome (GRCh38.p14)
-metabuli databases RefSeq217 DBDIR tmp
+metabuli databases RefSeq217 OUTDIR tmp

 # GTDB 207 (81.2 GiB)
 # - Complete Genome or Chromosome level assemblies in GTDB207 (CheckM Completeness > 90, CheckM Contamination < 5) with GTDB taxonomy.
-metabuli databases GTDB207 DBDIR tmp
+metabuli databases GTDB207 OUTDIR tmp

 # RefSeq Virus (1.5 GiB)
 # - Viral RefSeq genomes and five SARS-CoV-2 variants (alpha, beta, delta, gamma, and omicron)
-metabuli databases RefSeq_virus DBDIR tmp
+metabuli databases RefSeq_virus OUTDIR tmp
 ```
+Downloaded files are stored in the `OUTDIR/DB_NAME` directory, which can be provided to the `classify` module as `DBDIR`.

 ## Classification

@@ -254,7 +258,11 @@ Classifying RNA-seq reads from a COVID-19 patient to identify the culprit varian
 The whole process must take less than 10 mins using a personal machine.

 #### 1. Download RefSeq Virus DB (1.5 GiB)
-`metabuli databases RefSeq_virus refseq_virus tmp`
+```
+metabuli databases RefSeq_virus OUTDIR tmp
+```
+`OUTDIR/refseq_virus` is the `DBDIR` in step 3.
+
 #### 2. Download an RNA-seq result (SRR14484345)
 Option 1. Download using SRA Toolkit
@@ -270,7 +278,7 @@ cat SRR14484345.fastq | paste - - - - - - - - | tee >(cut -f 1-4 | tr "\t" "\n"
 #### 3. Classify the reads using metabuli
 ```
- metabuli classify SRR14484345_1.fq SRR14484345_2.fq refseq_virus RESULT_DIR JOB_ID --max-ram RAM_SIZE
+ metabuli classify SRR14484345_1.fq SRR14484345_2.fq DBDIR RESULT_DIR JOB_ID --max-ram RAM_SIZE
 ```
 #### 4. Check RESULT_DIR/JOB_ID_report.tsv
 Find a section like the example below

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 87a962d2..d19f87b6 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -7,32 +7,6 @@ variables:
     regression: 1

 jobs:
-  - job: build_macos_11
-    displayName: macOS 11
-    pool:
-      vmImage: 'macos-11'
-    steps:
-      - checkout: self
-        submodules: true
-      - script: |
-          rustup update
-          rustup target install x86_64-apple-darwin
-          rustup target install aarch64-apple-darwin
-        displayName: Install Rust Toolchain
-      - script: |
-          cd ${BUILD_SOURCESDIRECTORY}
-          ./util/build_osx.sh . build metabuli
-        displayName: Build Metabuli
-      - script: |
-          cd ${BUILD_SOURCESDIRECTORY}
-          ./util/Metabuli-regression/run_regression.sh ./build/metabuli examples
-        displayName: Run Regression Suite
-        condition: eq(variables['regression'], 1)
-      - task: PublishPipelineArtifact@0
-        inputs:
-          targetPath: $(Build.SourcesDirectory)/build/metabuli
-          artifactName: metabuli-darwin-universal
-
   - job: build_ubuntu_2004
     displayName: Ubuntu 2004
     pool:
@@ -147,6 +121,32 @@ jobs:
           targetPath: $(Build.SourcesDirectory)/build/src/metabuli
           artifactName: metabuli-linux-$(SIMD)

+  - job: build_macos_11
+    displayName: macOS 11
+    pool:
+      vmImage: 'macos-11'
+    steps:
+      - checkout: self
+        submodules: true
+      - script: |
+          rustup update
+          rustup target install x86_64-apple-darwin
+          rustup target install aarch64-apple-darwin
+        displayName: Install Rust Toolchain
+      - script: |
+          cd ${BUILD_SOURCESDIRECTORY}
+          ./util/build_osx.sh . build metabuli
+        displayName: Build Metabuli
+      - script: |
+          cd ${BUILD_SOURCESDIRECTORY}
+          ./util/Metabuli-regression/run_regression.sh ./build/metabuli examples
+        displayName: Run Regression Suite
+        condition: eq(variables['regression'], 1)
+      - task: PublishPipelineArtifact@0
+        inputs:
+          targetPath: $(Build.SourcesDirectory)/build/metabuli
+          artifactName: metabuli-darwin-universal
+
 - job: upload_artifacts
   displayName: Upload Artifacts
   condition: and(succeeded(), ne(variables['Build.Reason'], 'PullRequest'))
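The next patch slims `MatchPath` so that each path stores only its first and last match instead of every match along it; after path construction, the only per-match consumer, `trimMatchPath`, touches nothing but the two endpoints. A minimal before/after sketch of the struct, with field names taken from the diff, `Match` left opaque, and everything not shown in the diff simplified:

```cpp
#include <vector>

struct Match;  // Metabuli's match record; opaque here

// Before PATCH 3/5: each path owns every match on it.
struct MatchPathOld {
    int start, end;        // coordinates on the query
    float score;
    int hammingDist;
    std::vector<const Match *> matches;  // O(path length) storage per path
};

// After PATCH 3/5: trimming only inspects the outermost matches,
// so two pointers replace the vector.
struct MatchPathNew {
    int start, end;
    float score;
    int hammingDist;
    const Match * startMatch;
    const Match * endMatch;
};
```

Dropping the vector should save one heap allocation per candidate path, and candidate paths are built per read frame, so this is on the hot path of classification.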
From b5dde073a96770faaa3e700c76dadaed8cf2dfa5 Mon Sep 17 00:00:00 2001
From: Jaebeom Kim
Date: Fri, 8 Dec 2023 02:30:25 +0900
Subject: [PATCH 3/5] don't store all matches of paths, just store the first and last

---
 README.md                  |   4 +-
 src/commons/Classifier.cpp |   2 +-
 src/commons/Taxonomer.cpp  | 106 +++++++------------------------------
 src/commons/Taxonomer.h    |  16 +++---
 src/metabuli.cpp           |  35 ------------
 5 files changed, 30 insertions(+), 133 deletions(-)

diff --git a/README.md b/README.md
index 49d3525f..2d8ebd2d 100644
--- a/README.md
+++ b/README.md
@@ -261,8 +261,6 @@ The whole process must take less than 10 mins using a personal machine.
 ```
 metabuli databases RefSeq_virus OUTDIR tmp
 ```
-`OUTDIR/refseq_virus` is the `DBDIR` in step 3.
-
 #### 2. Download an RNA-seq result (SRR14484345)
 Option 1. Download using SRA Toolkit
@@ -278,7 +276,7 @@ cat SRR14484345.fastq | paste - - - - - - - - | tee >(cut -f 1-4 | tr "\t" "\n"
 #### 3. Classify the reads using metabuli
 ```
- metabuli classify SRR14484345_1.fq SRR14484345_2.fq DBDIR RESULT_DIR JOB_ID --max-ram RAM_SIZE
+ metabuli classify SRR14484345_1.fq SRR14484345_2.fq OUTDIR/refseq_virus RESULT_DIR JOB_ID --max-ram RAM_SIZE
 ```
 #### 4. Check RESULT_DIR/JOB_ID_report.tsv
 Find a section like the example below

diff --git a/src/commons/Classifier.cpp b/src/commons/Classifier.cpp
index 70a6d59a..fe4a4b74 100644
--- a/src/commons/Classifier.cpp
+++ b/src/commons/Classifier.cpp
@@ -103,7 +103,7 @@ void Classifier::startClassify(const LocalParameters &par) {
         kmerMatcher->sortMatches(&matchBuffer);

         // Classify queries based on the matches.
-        // omp_set_num_threads(1);
+        // omp_set_num_threads(1);
         taxonomer->assignTaxonomy(matchBuffer.buffer, matchBuffer.startIndexOfReserve, queryList, par);
         processedSeqCnt += queryReadSplit[splitIdx].end - queryReadSplit[splitIdx].start;
         cout << "The number of processed sequences: " << processedSeqCnt << " (" << (double) processedSeqCnt / (double) numOfSeq << ")" << endl;

diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp
index 76e2830c..f139be2d 100644
--- a/src/commons/Taxonomer.cpp
+++ b/src/commons/Taxonomer.cpp
@@ -412,13 +412,11 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector<const Match *> & speciesMatch
         }
         matchPaths.clear();
     }
-
     // If there are no meaningful species
     if (species2score.empty()) {
         bestScore.score = 0;
         return bestScore;
     }
-
     vector<TaxID> maxSpecies;
     for (auto & spScore : species2score) {
         if (spScore.second > bestSpScore * 0.95) {
@@ -513,71 +511,23 @@ float Taxonomer::combineMatchPaths(vector<MatchPath> & matchPaths,
     return score / readLength;
 }

-bool Taxonomer::isMatchPathLinked(const MatchPath & matchPath1, const MatchPath & matchPath2) {
-    int overlappedLength = min(matchPath1.end, matchPath2.end) - max(matchPath1.start, matchPath2.start) + 1;
-    if (20 >= overlappedLength || overlappedLength >= 24) {
-        return false;
-    }
-    const Match * last;
-    const Match * first;
-    if (matchPath1.start < matchPath2.start) {
-        last = matchPath1.matches.back();
-        first = matchPath2.matches.front();
-    } else {
-        last = matchPath2.matches.back();
-        first = matchPath1.matches.front();
-    }
-    if (overlappedLength == 21) {
-        return isConsecutive(last, first);
-    } else {
-        return isConsecutive_diffFrame(last, first);
-    }
-    return false;
-}
-
 bool Taxonomer::isMatchPathOverlapped(const MatchPath & matchPath1,
                                       const MatchPath & matchPath2) {
     return !((matchPath1.end < matchPath2.start) || (matchPath2.end < matchPath1.start));
 }

-void Taxonomer::mergeMatchPaths(const MatchPath & source, MatchPath & target) {
-    if (source.start < target.start) {
-        target.start = source.start;
-        uint8_t lastEndHamming = GET_2_BITS(target.matches.front()->rightEndHamming);
-        target.hammingDist += source.hammingDist - (source.matches.back()->hamming - lastEndHamming);
-        target.score += source.score - source.matches.back()->getScore();
-        if (lastEndHamming == 0) {
-            target.score += 3.0f;
-        } else {
-            target.score += 2.0f - 0.5f * lastEndHamming;
-        }
-        target.matches.insert(target.matches.begin(), source.matches.begin(), source.matches.end() - 1);
-    } else {
-        target.end = source.end;
-        uint8_t lastEndHamming = GET_2_BITS(source.matches.front()->rightEndHamming >> 14);
-        target.hammingDist += source.hammingDist - (source.matches.front()->hamming - lastEndHamming);
-        target.score += source.score - source.matches.front()->getScore();
-        if (lastEndHamming == 0) {
-            target.score += 3.0f;
-        } else {
-            target.score += 2.0f - 0.5f * lastEndHamming;
-        }
-        target.matches.insert(target.matches.end(), source.matches.begin() + 1, source.matches.end());
-    }
-}
-
 void Taxonomer::trimMatchPath(MatchPath & path1, const MatchPath & path2, int overlapLength) {
     if (path1.start < path2.start) {
         path1.end = path2.start - 1;
         // uint8_t lastEndHamming = GET_2_BITS(path1.matches.back()->rightEndHamming);
-        path1.hammingDist = max(0, path1.hammingDist - path1.matches.back()->getRightPartHammingDist(overlapLength/3));
-        path1.score = path1.score - path1.matches.back()->getRightPartScore(overlapLength/3) - (overlapLength % 3);
+        path1.hammingDist = max(0, path1.hammingDist - path1.endMatch->getRightPartHammingDist(overlapLength/3));
+        path1.score = path1.score - path1.endMatch->getRightPartScore(overlapLength/3) - (overlapLength % 3);
     } else {
         path1.start = path2.end + 1;
         // uint8_t lastEndHamming = GET_2_BITS(path1.matches.front()->rightEndHamming >> 14);
-        path1.hammingDist = max(0, path1.hammingDist - path1.matches.front()->getLeftPartHammingDist(overlapLength/3));
-        path1.score = path1.score - path1.matches.front()->getLeftPartScore(overlapLength/3) - (overlapLength % 3);
+        path1.hammingDist = max(0, path1.hammingDist - path1.startMatch->getLeftPartHammingDist(overlapLength/3));
+        path1.score = path1.score - path1.startMatch->getLeftPartScore(overlapLength/3) - (overlapLength % 3);
     }
 }

@@ -649,34 +599,20 @@ void Taxonomer::remainConsecutiveMatches(const vector<const Match *> & curFrameM
             currPos = nextPos;
         }
     }
-
-    // Print linkedMatches
-//    if (par.printLog) {
-//        cout << "linkedMatches: " << endl;
-//        for (const auto &entry: linkedMatches) {
-//            cout << entry.first << ": ";
-//            for (auto &idx: entry.second) {
-//                cout << idx << " ";
-//            }
-//            cout << endl;
-//        }
-//    }

     // Iterate linkedMatches to get filteredMatches
-    //(ignore matches not enoughly consecutive)
+    // (ignore matches that are not consecutive enough)
     size_t MIN_DEPTH = minConsCnt - 1;
     if (taxonomy->IsAncestor(eukaryotaTaxId, genusId)) {
         MIN_DEPTH = minConsCntEuk - 1;
     }
     unordered_set<const Match *> used;
     unordered_map<const Match *, depthScore> idx2depthScore;
-    unordered_map<const Match *, const Match *> edges;
-
+
     for (const auto& entry : linkedMatches) {
         if (!used.count(entry.first)) {
             used.insert(entry.first);
             depthScore bestPath{};
-            const Match * bestNextMatch = nullptr;
             for (size_t j = 0; j < entry.second.size(); j++) {
                 used.insert(entry.second[j]);
                 depthScore curPath = DFS(curFrameMatches,
@@ -686,11 +622,9 @@ void Taxonomer::remainConsecutiveMatches(const vector<const Match *> & curFrameM
                                          MIN_DEPTH,
                                          used,
                                          idx2depthScore,
-                                         edges,
                                          entry.first->getScore(),
                                          entry.first->hamming);
                 if (curPath.score > bestPath.score && curPath.depth > MIN_DEPTH) {
-                    bestNextMatch = entry.second[j];
                     bestPath = curPath;
                 }
             }
@@ -698,14 +632,9 @@ void Taxonomer::remainConsecutiveMatches(const vector<const Match *> & curFrameM
             if (bestPath.depth > MIN_DEPTH) {
                 matchPaths.emplace_back(entry.first->qInfo.pos, // start coordinate on query
                                         entry.first->qInfo.pos + bestPath.depth * 3 + 20, // end coordinate on query
-                                        bestPath.score, bestPath.hammingDist);
-                const Match * curMatch = entry.first;
-                edges[curMatch] = bestNextMatch;
-                matchPaths.back().matches.push_back(curMatch);
-                while (edges.find(curMatch) != edges.end()) {
-                    matchPaths.back().matches.push_back(edges[curMatch]);
-                    curMatch = edges[curMatch];
-                }
+                                        bestPath.score, bestPath.hammingDist,
+                                        entry.first,
+                                        bestPath.endMatch);
             }
         }
     }

@@ -717,9 +646,9 @@ depthScore Taxonomer::DFS(const vector<const Match *> &matches,
                           size_t depth, size_t MIN_DEPTH,
                           unordered_set<const Match *> &used,
                           unordered_map<const Match *, depthScore> & match2depthScore,
-                          unordered_map<const Match *, const Match *> & edges, float score, int hammingDist) {
+                          float score, int hammingDist) {
     depth++;
-    depthScore bestDepthScore = depthScore(0, 0, 0);
+    depthScore bestDepthScore = depthScore{};
     depthScore returnDepthScore;
     depthScore curDepthScore;
     float recievedScore = score;
@@ -730,8 +659,8 @@ depthScore Taxonomer::DFS(const vector<const Match *> &matches,
         } else {
             score += 2.0f - 0.5f * lastEndHamming;
         }
-        match2depthScore[curMatch] = depthScore(1, score - recievedScore, lastEndHamming);
-        return depthScore(depth, score, hammingDist + lastEndHamming);
+        match2depthScore[curMatch] = depthScore(1, score - recievedScore, lastEndHamming, curMatch);
+        return depthScore(depth, score, hammingDist + lastEndHamming, curMatch);
     } else { // not a leaf node
         uint8_t lastEndHamming = (curMatch->rightEndHamming >> 14);
         if (lastEndHamming == 0) {
             score += 3.0f;
         } else {
             score += 2.0f - 0.5f * lastEndHamming;
         }
         for (auto & nextMatch : linkedMatches.at(curMatch)) {
             used.insert(nextMatch);
             if (match2depthScore.find(nextMatch) != match2depthScore.end()) {
                 returnDepthScore = match2depthScore[nextMatch];
                 curDepthScore = depthScore(returnDepthScore.depth + depth,
                                            returnDepthScore.score + score,
-                                           returnDepthScore.hammingDist + hammingDist + lastEndHamming);
+                                           returnDepthScore.hammingDist + hammingDist + lastEndHamming,
+                                           returnDepthScore.endMatch);
             } else {
-                curDepthScore = DFS(matches, nextMatch, linkedMatches, depth, MIN_DEPTH, used, match2depthScore, edges, score, hammingDist + lastEndHamming);
+                curDepthScore = DFS(matches, nextMatch, linkedMatches, depth, MIN_DEPTH, used, match2depthScore, score, hammingDist + lastEndHamming);
             }
             if (curDepthScore.score > bestDepthScore.score && curDepthScore.depth > MIN_DEPTH) {
                 bestDepthScore = curDepthScore;
-                edges[curMatch] = nextMatch;
             }
         }
         if (bestDepthScore.depth > MIN_DEPTH) {
             match2depthScore[curMatch] = depthScore(bestDepthScore.depth - depth + 1,
                                                     bestDepthScore.score - recievedScore,
-                                                    bestDepthScore.hammingDist - hammingDist);
+                                                    bestDepthScore.hammingDist - hammingDist,
+                                                    bestDepthScore.endMatch);
         }
     }
     return bestDepthScore;

diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h
index b94c2348..c3186d43 100644
--- a/src/commons/Taxonomer.h
+++ b/src/commons/Taxonomer.h
@@ -23,21 +23,25 @@ struct TaxonScore {
 };

 struct depthScore {
-    depthScore(size_t depth, float score, int hammingDist) : depth(depth), score(score), hammingDist(hammingDist) {}
-    depthScore() : depth(0), score(0.f), hammingDist(0) {}
+    depthScore(size_t depth, float score, int hammingDist, const Match * endMatch) :
+        depth(depth), score(score), hammingDist(hammingDist), endMatch(endMatch) {}
+    depthScore() : depth(0), score(0.f), hammingDist(0), endMatch(nullptr) {}
     size_t depth;
     float score;
     int hammingDist;
+    const Match * endMatch;
 };

 struct MatchPath {
-    MatchPath(int start, int end, float score, int hammingDist) : start(start), end(end), score(score), hammingDist(hammingDist) {}
-    MatchPath() : start(0), end(0), score(0.f), hammingDist(0) {}
+    MatchPath(int start, int end, float score, int hammingDist, const Match * startMatch, const Match * endMatch) :
+        start(start), end(end), score(score), hammingDist(hammingDist), startMatch(startMatch), endMatch(endMatch) {}
+    MatchPath() : start(0), end(0), score(0.f), hammingDist(0), startMatch(nullptr), endMatch(nullptr) {}
     int start;
     int end;
     float score;
    int hammingDist;
-    vector<const Match *> matches;
+    const Match * startMatch;
+    const Match * endMatch;
 };

@@ -114,7 +118,7 @@ class Taxonomer {
                    const map<const Match *, vector<const Match *>> &linkedMatches,
                    size_t depth, size_t MIN_DEPTH,
                    unordered_set<const Match *> &used,
                    unordered_map<const Match *, depthScore> &match2depthScore,
-                   unordered_map<const Match *, const Match *> & edges, float score, int hammingDist);
+                   float score, int hammingDist);

     static bool isConsecutive(const Match * match1, const Match * match2);

diff --git a/src/metabuli.cpp b/src/metabuli.cpp
index b7285d0d..2458d7c1 100644
--- a/src/metabuli.cpp
+++ b/src/metabuli.cpp
@@ -58,14 +58,6 @@ std::vector<Command> commands = {
          {"DB dir", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory},
          {"out dir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory},
          {"job ID", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}},
-        // {"filter", filter, &localPar.filter, COMMAND_MAIN,
-        //  "Filtering reads based on the classification result",
-        //  nullptr,
-        //  "Jaebeom Kim ",
-        //  " ",
-        //  CITATION_SPACEPHARER,
-        //  {{"READ FILE", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfile},
-        //   {"FILTER DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}}},
        {"grade", grade, &localPar.grade, COMMAND_EXPERT,
         "Grade the classification result (only for benchmarking)",
         nullptr,
         "Jaebeom Kim ",
         " ",
         CITATION_SPACEPHARER,
         {{"List of answer sheets (Query ID 2 tax ID)", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
          {"List of assembly accessions of reference sequences", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
          {"Taxonomy directory", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::directory}}},
-        // {"seqHeader2TaxId", seqHeader2TaxId, &localPar.seqHeader2TaxId, COMMAND_EXPERT,
-        //  "It extracts k-mers from query sequences, and compares them to the target database",
-        //  nullptr,
-        //  "Jaebeom Kim ",
-        //  " ",
-        //  CITATION_SPACEPHARER,
-        //  {{"read-classification", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
-        //   {"Mapping file (accession to tax ID)", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}}},
        {"add-to-library", addToLibrary, &localPar.addToLibrary, COMMAND_DATABASE_CREATION,
         "It bins sequences into files according to their species.",
         nullptr,
         "Jaebeom Kim ",
         " ",
         CITATION_SPACEPHARER,
         {{"List of absolute paths of files to be added. One path per line.", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
          {"NCBI style accession2taxid file. It should be consistent to tax dump files.", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
          {"DB directory", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}}},
-        // {"apply-threshold", applyThreshold, &localPar.applyThreshold, COMMAND_EXPERT,
-        //  "Assigning taxonomy label to query reads",
-        //  nullptr,
-        //  "Jaebeom Kim ",
-        //  " ",
-        //  CITATION_SPACEPHARER,
-        //  {{"Old Results", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
-        //   {"OUT DIR", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory},
-        //   {"JOB ID", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile},
-        //   {"TAXONOMY DIR", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}}},
        {"binning2report", binning2report, &localPar.binning2report, COMMAND_FORMAT_CONVERSION,
         "It generates Kraken style report file from binning results",
         nullptr,
         "Jaebeom Kim ",
         " ",
         CITATION_SPACEPHARER,
         {{"Binning Result", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
          {"OUT DIR", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory},
          {"JOB ID", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile},
          {"TAXONOMY DIR", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}}},
-        // {"filter-by-genus", filterByGenus, &localPar.filterByGenus, COMMAND_EXPERT,
-        //  "It filters out reads classified as a specific genus",
-        //  nullptr,
-        //  "Jaebeom Kim ",
-        //  " ",
-        //  CITATION_SPACEPHARER,
-        //  {{"Binning Result", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
-        //   {"Genus list", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile},
-        //   {"TAXONOMY DIR", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}}},
        {"mapping2taxon", mapping2taxon, &localPar.mapping2taxon, COMMAND_EXPERT,
         "It takes a mapping file (multiple targets for each read) and generates a read2taxon file (one target for each read)",
         nullptr,
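The patch above also drops the `edges` back-pointer map from the consecutive-match DFS: because `depthScore` now carries `endMatch`, the best terminal match propagates up the recursion, and the caller can build a `MatchPath` directly from `entry.first` and `bestPath.endMatch`. A reduced, self-contained sketch of that idea; the scoring term, the graph, and the omission of memoization are simplifications, not the real code in Taxonomer.cpp:

```cpp
#include <cstddef>
#include <map>
#include <vector>

struct Match { int pos; };

struct depthScore {
    std::size_t depth = 0;
    float score = 0.f;
    const Match *endMatch = nullptr;  // deepest match on the best path
};

depthScore dfs(const Match *cur,
               const std::map<const Match *, std::vector<const Match *>> &linked,
               std::size_t depth, float score) {
    ++depth;
    score += 1.0f;  // stand-in for the per-match score term
    auto it = linked.find(cur);
    if (it == linked.end()) {             // leaf: the path ends at cur
        return {depth, score, cur};
    }
    depthScore best{};
    for (const Match *next : it->second) {
        depthScore cand = dfs(next, linked, depth, score);
        if (cand.score > best.score) best = cand;  // endMatch rides along
    }
    return best;
}

int main() {
    Match a{0}, b{3}, c{6};
    std::map<const Match *, std::vector<const Match *>> linked{{&a, {&b}}, {&b, {&c}}};
    depthScore best = dfs(&a, linked, 0, 0.f);
    // best.depth == 3 and best.endMatch == &c: together with the root match,
    // this is enough to emplace a MatchPath(start, end, score, dist, &a, best.endMatch)
    // without ever materializing the intermediate matches.
    return best.endMatch == &c ? 0 : 1;
}
```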
From 79acbc8c6196d46aa1b5bb1765de351bba38cd5 Mon Sep 17 00:00:00 2001
From: Jaebeom Kim
Date: Fri, 8 Dec 2023 02:37:15 +0900
Subject: [PATCH 4/5] copy Matches of species

---
 src/commons/Taxonomer.cpp | 34 +++++++++++++++++-----------------
 src/commons/Taxonomer.h   |  6 +++---
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp
index f139be2d..cff5d5c1 100644
--- a/src/commons/Taxonomer.cpp
+++ b/src/commons/Taxonomer.cpp
@@ -97,7 +97,7 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery,
 //        }
 //    }
     // Get the best species for current query
-    vector<const Match *> speciesMatches;
+    vector<Match> speciesMatches;
     speciesMatches.reserve(end - offset + 1);
     TaxonScore speciesScore(0, 0, 0, 0, 0);
     if (par.seqMode == 2) {
@@ -177,26 +177,26 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery,
 //    }
 }

-void Taxonomer::filterRedundantMatches(vector<const Match *> & speciesMatches,
+void Taxonomer::filterRedundantMatches(vector<Match> & speciesMatches,
                                        map<TaxID, int> & taxCnt) {
     // Sort matches by the coordinate on the query
     sort(speciesMatches.begin(), speciesMatches.end(),
-         [](const Match * a, const Match * b) { return a->qInfo.pos < b->qInfo.pos; });
+         [](const Match & a, const Match & b) { return a.qInfo.pos < b.qInfo.pos; });

     // Remove redundant matches
     size_t matchNum = speciesMatches.size();
     for (size_t i = 0; i < matchNum;) {
-        size_t currQuotient = speciesMatches[i]->qInfo.pos / 3;
-        uint8_t minHamming = speciesMatches[i]->hamming;
-        const Match * minHammingMatch = speciesMatches[i];
-        TaxID minHammingTaxId = minHammingMatch->targetId;
-        while ((i < matchNum) && (currQuotient == speciesMatches[i]->qInfo.pos / 3)) {
-            if (speciesMatches[i]->hamming < minHamming) {
-                minHamming = speciesMatches[i]->hamming;
+        size_t currQuotient = speciesMatches[i].qInfo.pos / 3;
+        uint8_t minHamming = speciesMatches[i].hamming;
+        Match minHammingMatch = speciesMatches[i];
+        TaxID minHammingTaxId = minHammingMatch.targetId;
+        while ((i < matchNum) && (currQuotient == speciesMatches[i].qInfo.pos / 3)) {
+            if (speciesMatches[i].hamming < minHamming) {
+                minHamming = speciesMatches[i].hamming;
                 minHammingMatch = speciesMatches[i];
-                minHammingTaxId = minHammingMatch->targetId;
-            } else if (speciesMatches[i]->hamming == minHamming) {
-                minHammingTaxId = taxonomy->LCA(minHammingTaxId, speciesMatches[i]->targetId);
+                minHammingTaxId = minHammingMatch.targetId;
+            } else if (speciesMatches[i].hamming == minHamming) {
+                minHammingTaxId = taxonomy->LCA(minHammingTaxId, speciesMatches[i].targetId);
             }
             i++;
         }
@@ -268,7 +268,7 @@ TaxID Taxonomer::BFS(const unordered_map<TaxID, TaxonCounts> & cladeCnt, TaxID r
     }
 }

-TaxonScore Taxonomer::getBestSpeciesMatches(vector<const Match *> & speciesMatches,
+TaxonScore Taxonomer::getBestSpeciesMatches(vector<Match> & speciesMatches,
                                             const Match *matchList,
                                             size_t end,
                                             size_t offset,
@@ -358,7 +358,7 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector<Match> & speciesMatch
                                               - speciesMatchRange[bestScore.taxId].first + 1);
     for (size_t j = speciesMatchRange[bestScore.taxId].first; j < speciesMatchRange[bestScore.taxId].second; j++) {
-        speciesMatches.push_back(& matchList[j]);
+        speciesMatches.push_back(matchList[j]);
     }
     bestScore.coverage = coveredLength / queryLength;
     bestScore.hammingDist = hammingDist;
@@ -366,7 +366,7 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector<Match> & speciesMatch
     return bestScore;
 }

-TaxonScore Taxonomer::getBestSpeciesMatches(vector<const Match *> & speciesMatches,
+TaxonScore Taxonomer::getBestSpeciesMatches(vector<Match> & speciesMatches,
                                             const Match *matchList,
                                             size_t end,
                                             size_t offset,
@@ -455,7 +455,7 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector<Match> & speciesMatch
                                               - speciesMatchRange[bestScore.taxId].first + 1);
     for (size_t i = speciesMatchRange[bestScore.taxId].first; i < speciesMatchRange[bestScore.taxId].second; i++) {
-        speciesMatches.push_back(&matchList[i]);
+        speciesMatches.push_back(matchList[i]);
     }
     bestScore.coverage = coveredLength / (readLength1 + readLength2);
     bestScore.hammingDist = hammingDist;

diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h
index c3186d43..a9bc0486 100644
--- a/src/commons/Taxonomer.h
+++ b/src/commons/Taxonomer.h
@@ -111,7 +111,7 @@ class Taxonomer {

     void trimMatchPath(MatchPath & path1, const MatchPath & path2, int overlapLength);

-    void filterRedundantMatches(vector<const Match *> & speciesMatches,
+    void filterRedundantMatches(vector<Match> & speciesMatches,
                                 map<TaxID, int> & taxCnt);

     depthScore DFS(const vector<const Match *> &matches,
                    const Match * curMatchIdx,
@@ -130,10 +130,10 @@ class Taxonomer {
     TaxonScore getBestGenusMatches(vector<Match> &matchesForMajorityLCA, const Match *matchList, size_t end,
                                    size_t offset, int readLength1, int readLength2);

-    TaxonScore getBestSpeciesMatches(vector<const Match *> &speciesMatches, const Match *matchList, size_t end,
+    TaxonScore getBestSpeciesMatches(vector<Match> &speciesMatches, const Match *matchList, size_t end,
                                      size_t offset, int queryLength);

-    TaxonScore getBestSpeciesMatches(vector<const Match *> &speciesMatches, const Match *matchList, size_t end,
+    TaxonScore getBestSpeciesMatches(vector<Match> &speciesMatches, const Match *matchList, size_t end,
                                      size_t offset, int readLength1, int readLength2);

 //    TaxonScore getBestGenusMatches_spaced(vector<Match> &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset,
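The patch above switches `speciesMatches` from pointers into the shared match buffer to `Match` copies, so the sort in `filterRedundantMatches` rearranges values directly instead of chasing pointers on every comparison. A self-contained sketch of the filter being refactored, under simplifying assumptions: `Match` is reduced to the three fields used here, `lca()` stands in for `taxonomy->LCA()`, and the final per-taxon count increment is inferred, since the hunk ends before it:

```cpp
#include <algorithm>
#include <cstdint>
#include <map>
#include <vector>

using TaxID = int;

// Reduced Match: only the fields the filter reads.
struct Match {
    struct { uint32_t pos; } qInfo;  // position on the query
    uint8_t hamming;                 // Hamming distance of the k-mer match
    TaxID targetId;
};

// Stand-in for taxonomy->LCA(); the real one walks the taxonomy tree.
static TaxID lca(TaxID a, TaxID b) { return a == b ? a : 1; }

// Per codon position (pos / 3), keep only the minimum-Hamming match;
// ties collapse to the LCA of the tied taxa.
void filterRedundantMatches(std::vector<Match> & speciesMatches,
                            std::map<TaxID, int> & taxCnt) {
    std::sort(speciesMatches.begin(), speciesMatches.end(),
              [](const Match & a, const Match & b) { return a.qInfo.pos < b.qInfo.pos; });
    const size_t matchNum = speciesMatches.size();
    for (size_t i = 0; i < matchNum;) {
        const size_t currQuotient = speciesMatches[i].qInfo.pos / 3;
        uint8_t minHamming = speciesMatches[i].hamming;
        TaxID minHammingTaxId = speciesMatches[i].targetId;
        while (i < matchNum && currQuotient == speciesMatches[i].qInfo.pos / 3) {
            if (speciesMatches[i].hamming < minHamming) {
                minHamming = speciesMatches[i].hamming;
                minHammingTaxId = speciesMatches[i].targetId;
            } else if (speciesMatches[i].hamming == minHamming) {
                minHammingTaxId = lca(minHammingTaxId, speciesMatches[i].targetId);
            }
            ++i;
        }
        ++taxCnt[minHammingTaxId];  // inferred: count the surviving taxon
    }
}
```

The next patch swaps the `map` for an `unordered_map`, which fits since the counts are only aggregated and never needed in key order.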
From 79e743d5994a4ea2db46f3706ba9a032a7224528 Mon Sep 17 00:00:00 2001
From: Jaebeom Kim
Date: Fri, 8 Dec 2023 13:22:00 +0900
Subject: [PATCH 5/5] remove unnecessary copies

---
 src/commons/KmerMatcher.cpp  | 12 ++-----
 src/commons/LocalUtil.cpp    |  6 ++--
 src/commons/QueryIndexer.cpp |  7 ++--
 src/commons/Taxonomer.cpp    | 70 +++++++++++++-----------------------
 src/commons/Taxonomer.h      |  6 ++--
 5 files changed, 37 insertions(+), 64 deletions(-)

diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp
index 9c87d527..7627f0aa 100644
--- a/src/commons/KmerMatcher.cpp
+++ b/src/commons/KmerMatcher.cpp
@@ -331,11 +331,6 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI
                 }
                 for (int k = 0; k < currMatchNum; k++) {
                     idx = selectedMatches[k];
-                    // Check if candidateKmerInfos[idx].sequenceID is valid
-                    // if (taxId2genusId.find(candidateKmerInfos[idx].sequenceID) == taxId2genusId.end() ||
-                    //     taxId2speciesId.find(candidateKmerInfos[idx].sequenceID) == taxId2speciesId.end()) {
-                    //     cout << "Error: " << candidateKmerInfos[idx].sequenceID << " is not found in the taxonomy database." << endl;
-                    // }
                     matches[matchCnt] = {queryKmerList[j].info,
                                          candidateKmerInfos[idx].sequenceID,
                                          taxId2speciesId[candidateKmerInfos[idx].sequenceID],
@@ -474,10 +469,6 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI
     free(splitCheckList);
     queryKmerNum = 0;

-#ifdef OPENMP
-    omp_set_num_threads(threads);
-#endif
-
     totalMatchCnt += matchBuffer->startIndexOfReserve;
     return 1;
 }
@@ -520,8 +511,9 @@ void KmerMatcher::compareDna(uint64_t query,
     }

     // Select target k-mers that passed hamming criteria
+    uint8_t maxHamming = min(minHammingSum * 2, 7);
     for (size_t h = 0; h < size; h++) {
-        if (hammingSums[h] <= min(minHammingSum * 2, 7)) {
+        if (hammingSums[h] <= maxHamming) {
             selectedMatches.push_back(h);
             selectedHammingSum.push_back(hammingSums[h]);
             if (frame < 3) { // Frame of query k-mer

diff --git a/src/commons/LocalUtil.cpp b/src/commons/LocalUtil.cpp
index d981f238..6b87319b 100644
--- a/src/commons/LocalUtil.cpp
+++ b/src/commons/LocalUtil.cpp
@@ -35,11 +35,11 @@ void LocalUtil::splitQueryFile(std::vector<SequenceBlock> & sequences, const std

 int LocalUtil::getMaxCoveredLength(int queryLength) {
     if (queryLength % 3 == 2) {
-        return queryLength - 2; // 2
+        return queryLength - 2;
     } else if (queryLength % 3 == 1) {
-        return queryLength - 4; // 4
+        return queryLength - 4;
     } else {
-        return queryLength - 3; // 3
+        return queryLength - 3;
     }
 }

diff --git a/src/commons/QueryIndexer.cpp b/src/commons/QueryIndexer.cpp
index 29191f8e..7846a969 100644
--- a/src/commons/QueryIndexer.cpp
+++ b/src/commons/QueryIndexer.cpp
@@ -14,6 +14,7 @@ QueryIndexer::QueryIndexer(const LocalParameters & par) {
     maxRam = par.ramUsage;
     threads = par.threads;
     bytesPerKmer = sizeof(QueryKmer) + matchPerKmer * sizeof(Match);
+    // std::cout << "bytesPerKmer: " << bytesPerKmer << "\n";

     readNum_1 = 0;
     readNum_2 = 0;
     spaceNum = par.spaceMask.length() - kmerLength;
@@ -25,6 +26,7 @@ QueryIndexer::QueryIndexer(const LocalParameters & par) {

 void QueryIndexer::setAvailableRam() {
     availableRam = ((size_t) maxRam * (size_t) 1024 * 1024 * 1024)
                    - ((size_t) 128 * 1024 * 1024 * (size_t) threads);
+    // std::cout << "availableRam: " << availableRam << "\n";
 }

 void QueryIndexer::indexQueryFile() {
@@ -40,7 +42,6 @@ void QueryIndexer::indexQueryFile() {
             totalReadLength += kseq->entry.sequence.l;
             size_t currentKmerCnt = LocalUtil::getQueryKmerNumber(kseq->entry.sequence.l, spaceNum);
             kmerCnt += currentKmerCnt;
-            // std::cout << "currentKmerCnt: " << kmerCnt << "\n";
             if (bytesPerKmer * kmerCnt + ((size_t) 200 * seqCnt) > availableRam) {
                 querySplits.emplace_back(start, readNum_1 - 1, kmerCnt - currentKmerCnt);
@@ -72,7 +73,6 @@ void QueryIndexer::indexQueryFile() {
                     seqCnt_1++;
                     totalReadLength += kseq_1->entry.sequence.l;
                     currentKmerCnt = LocalUtil::getQueryKmerNumber(kseq_1->entry.sequence.l, spaceNum);
-                    kmerCnt += currentKmerCnt;
                 } else {
                     end = true;
                 }
@@ -82,7 +82,7 @@ void QueryIndexer::indexQueryFile() {
                     seqCnt_2++;
                     totalReadLength += kseq_2->entry.sequence.l;
                     currentKmerCnt += LocalUtil::getQueryKmerNumber(kseq_2->entry.sequence.l, spaceNum);
-                    kmerCnt += currentKmerCnt;
+
                 } else {
                     end = true;
                 }
@@ -92,6 +92,7 @@ void QueryIndexer::indexQueryFile() {
                     EXIT(EXIT_FAILURE);
                 }

+                kmerCnt += currentKmerCnt;
                 if (bytesPerKmer * kmerCnt + ((size_t) 200 * seqCnt_1) > availableRam) {
                     querySplits.emplace_back(start, readNum_1 - 1, kmerCnt - currentKmerCnt);
                     kmerCnt = currentKmerCnt;

diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp
index cff5d5c1..63fb5731 100644
--- a/src/commons/Taxonomer.cpp
+++ b/src/commons/Taxonomer.cpp
@@ -89,13 +89,6 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery,
                                 const Match *matchList,
                                 vector<Query> & queryList,
                                 const LocalParameters &par) {
-
-//    if (true) {
-//        cout << "# " << currentQuery << " " << queryList[currentQuery].name << endl;
-//        for (size_t i = offset; i < end + 1; i++) {
-//            cout << matchList[i].targetId << " " << matchList[i].qInfo.frame << " " << matchList[i].qInfo.pos << " " << int(matchList[i].hamming) << " " << int(matchList[i].redundancy) << endl;
-//        }
-//    }
     // Get the best species for current query
     vector<Match> speciesMatches;
     speciesMatches.reserve(end - offset + 1);
@@ -109,14 +102,6 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery,
                                                queryList[currentQuery].queryLength);
     }

-//    if (true) {
-//        cout << "# " << currentQuery << " " << queryList[currentQuery].name << " filtered\n";
-//        for (size_t i = 0; i < genusMatches.size(); i++) {
-//            cout << genusMatches[i].targetId << " " << genusMatches[i].qInfo.frame << " " << genusMatches[i].qInfo.pos << " " << int(genusMatches[i].hamming) << " " << int(genusMatches[i].redundancy) << endl;
-//        }
-//        cout << "Genus score: " << genusScore.score << "\n";
-//    }
-
     // If there is no proper species for current query, it is un-classified.
     if (speciesScore.score == 0 || speciesScore.score < par.minScore) {
         queryList[currentQuery].isClassified = false;
@@ -138,8 +123,12 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery,
         return;
     }

-    // Filter redundant matches
-    filterRedundantMatches(speciesMatches, queryList[currentQuery].taxCnt);
+    // Filter redundant matches
+    unordered_map<TaxID, int> taxCnt;
+    filterRedundantMatches(speciesMatches, taxCnt);
+    for (auto & tax : taxCnt) {
+        queryList[currentQuery].taxCnt[tax.first] = tax.second;
+    }

     // If score is not enough, classify to the parent of the selected species
     if (speciesScore.score < par.minSpScore) {
@@ -153,7 +142,7 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery,
     }

     // Lower rank classification
-    TaxID result = lowerRankClassification(queryList[currentQuery].taxCnt,
+    TaxID result = lowerRankClassification(taxCnt,
                                            speciesScore.taxId,
                                            queryList[currentQuery].queryLength + queryList[currentQuery].queryLength2);

@@ -164,21 +153,10 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery,
     queryList[currentQuery].coverage = speciesScore.coverage;
     queryList[currentQuery].hammingDist = speciesScore.hammingDist;
     queryList[currentQuery].newSpecies = false;
-//    if (par.printLog) {
-//        cout << "# " << currentQuery << endl;
-//        for (size_t i = 0; i < genusMatches.size(); i++) {
-//            cout << i << " " << genusMatches[i].qInfo.pos << " " <<
-//            genusMatches[i].targetId << " " << int(genusMatches[i].hamming) << endl;
-//        }
-//        cout << "Score: " << speciesScore.score << " " << selectedSpecies << " "
-//             << taxonomy->getString(taxonomy->taxonNode(selectedSpecies)->rankIdx)
-//
-//             << endl;
-//    }
 }

 void Taxonomer::filterRedundantMatches(vector<Match> & speciesMatches,
-                                       map<TaxID, int> & taxCnt) {
+                                       unordered_map<TaxID, int> & taxCnt) {
     // Sort matches by the coordinate on the query
     sort(speciesMatches.begin(), speciesMatches.end(),
          [](const Match & a, const Match & b) { return a.qInfo.pos < b.qInfo.pos; });
@@ -204,7 +182,7 @@ void Taxonomer::filterRedundantMatches(vector<Match> & speciesMatches,
     }
 }

-TaxID Taxonomer::lowerRankClassification(const map<TaxID, int> & taxCnt, TaxID spTaxId, int queryLength) {
+TaxID Taxonomer::lowerRankClassification(const unordered_map<TaxID, int> & taxCnt, TaxID spTaxId, int queryLength) {
     unsigned int maxCnt = (queryLength - 1)/denominator + 1;
     unordered_map<TaxID, TaxonCounts> cladeCnt;
     getSpeciesCladeCounts(taxCnt, cladeCnt, spTaxId);
@@ -225,7 +203,7 @@ TaxID Taxonomer::lowerRankClassification(const unordered_map<TaxID, int> & taxCnt, TaxID s
     }
 }

-void Taxonomer::getSpeciesCladeCounts(const map<TaxID, int> &taxCnt,
+void Taxonomer::getSpeciesCladeCounts(const unordered_map<TaxID, int> &taxCnt,
                                       unordered_map<TaxID, TaxonCounts> & cladeCount,
                                       TaxID speciesTaxID) {
     for (auto it = taxCnt.begin(); it != taxCnt.end(); ++it) {
@@ -475,9 +453,11 @@ float Taxonomer::combineMatchPaths(vector<MatchPath> & matchPaths,
     // 1. Add the matchPath with the highest score to combinedMatchPaths
     // 2. Add the matchPath with the highest score that is not overlapped with the matchPath in combinedMatchPaths
     // 3. Repeat 2 until no matchPath can be added
+    float score = 0;
     for (size_t i = 0; i < matchPaths.size(); i++) {
         if (combinedMatchPaths.empty()) {
             combinedMatchPaths.push_back(matchPaths[i]);
+            score += matchPaths[i].score;
         } else {
             bool isOverlapped = false;
             for (size_t j = 0; j < combinedMatchPaths.size(); j++) {
@@ -486,27 +466,26 @@ float Taxonomer::combineMatchPaths(vector<MatchPath> & matchPaths,
                                             - max(matchPaths[i].start, combinedMatchPaths[j].start) + 1;
                     if (overlappedLength < 24) {
                         trimMatchPath(matchPaths[i], combinedMatchPaths[j], overlappedLength);
+                        continue;
                     } else {
                         isOverlapped = true;
                         break;
                     }
-                    // Current path trimmed too much
-                    if (matchPaths[i].end - matchPaths[i].start + 1 < 24) {
+                    if (matchPaths[i].end - matchPaths[i].start + 1 < 24) { // Current path trimmed too much
                         isOverlapped = true;
                         break;
-                    }
+                    }
                 }
             }
             if (!isOverlapped) {
                 combinedMatchPaths.push_back(matchPaths[i]);
+                score += matchPaths[i].score;
             }
         }
-    }
-    // Calculate the score of combinedMatchPaths
-    float score = 0;
-    for (auto & matchPath : combinedMatchPaths) {
-        score += matchPath.score;
+        // if (score / readLength >= 1) {
+        //     break;
+        // }
     }
     return score / readLength;
 }
@@ -520,12 +499,10 @@ bool Taxonomer::isMatchPathOverlapped(const MatchPath & matchPath1,
 void Taxonomer::trimMatchPath(MatchPath & path1, const MatchPath & path2, int overlapLength) {
     if (path1.start < path2.start) {
         path1.end = path2.start - 1;
-        // uint8_t lastEndHamming = GET_2_BITS(path1.matches.back()->rightEndHamming);
         path1.hammingDist = max(0, path1.hammingDist - path1.endMatch->getRightPartHammingDist(overlapLength/3));
         path1.score = path1.score - path1.endMatch->getRightPartScore(overlapLength/3) - (overlapLength % 3);
     } else {
         path1.start = path2.end + 1;
-        // uint8_t lastEndHamming = GET_2_BITS(path1.matches.front()->rightEndHamming >> 14);
         path1.hammingDist = max(0, path1.hammingDist - path1.startMatch->getLeftPartHammingDist(overlapLength/3));
         path1.score = path1.score - path1.startMatch->getLeftPartScore(overlapLength/3) - (overlapLength % 3);
     }
 }
@@ -558,7 +535,7 @@ void Taxonomer::remainConsecutiveMatches(const vector<const Match *> & curFrameM
                 // Compare curPosMatches and nextPosMatches
                 for (auto &curPosMatch: curPosMatches) {
                     for (auto &nextPosMatch: nextPosMatches) {
-                        if (curPosMatch->isConsecutive_DNA(nextPosMatch)) {
+                        if (isConsecutive(curPosMatch, nextPosMatch)){
                             linkedMatches[curPosMatch].push_back(nextPosMatch);
                         }
                     }
@@ -586,9 +563,12 @@ void Taxonomer::remainConsecutiveMatches(const vector<const Match *> & curFrameM
                 // Compare curPosMatches and nextPosMatches
                 for (auto &curPosMatch: curPosMatches) {
                     for (auto &nextPosMatch: nextPosMatches) {
-                        if (nextPosMatch->isConsecutive_DNA(curPosMatch)) {
+                        if (isConsecutive(nextPosMatch, curPosMatch)){
                             linkedMatches[curPosMatch].push_back(nextPosMatch);
                         }
+                        // if (nextPosMatch->isConsecutive_DNA(curPosMatch)) {
+                        //     linkedMatches[curPosMatch].push_back(nextPosMatch);
+                        // }
                     }
                 }
@@ -1052,7 +1032,7 @@ TaxonScore Taxonomer::scoreSpecies(const vector<Match> &matches,
 bool Taxonomer::isConsecutive(const Match * match1, const Match * match2) {
     // match1 87654321 -> 08765432
     // match2 98765432 -> 08765432
-    return (match1->rightEndHamming >> 2) == (match2->rightEndHamming & 0x3FFF);
+    return (match1->dnaEncoding >> 3) == (match2->dnaEncoding & 0x1FFFFF);
 }

 bool Taxonomer::isConsecutive_diffFrame(const Match * match1, const Match * match2) {

diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h
index a9bc0486..0279fe38 100644
--- a/src/commons/Taxonomer.h
+++ b/src/commons/Taxonomer.h
@@ -112,7 +112,7 @@ class Taxonomer {
     void trimMatchPath(MatchPath & path1, const MatchPath & path2, int overlapLength);

     void filterRedundantMatches(vector<Match> & speciesMatches,
-                                map<TaxID, int> & taxCnt);
+                                unordered_map<TaxID, int> & taxCnt);

     depthScore DFS(const vector<const Match *> &matches,
                    const Match * curMatchIdx,
                    const map<const Match *, vector<const Match *>> &linkedMatches,
@@ -177,9 +177,9 @@ class Taxonomer {
                             int queryLength,
                             int queryLength2);

-    TaxID lowerRankClassification(const map<TaxID, int> & matches, TaxID speciesID, int queryLength);
+    TaxID lowerRankClassification(const unordered_map<TaxID, int> & taxCnt, TaxID speciesID, int queryLength);

-    void getSpeciesCladeCounts(const map<TaxID, int> & taxCnt,
+    void getSpeciesCladeCounts(const unordered_map<TaxID, int> & taxCnt,
                                unordered_map<TaxID, TaxonCounts> & cladeCnt,
                                TaxID spciesID);
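The new `isConsecutive` in the last Taxonomer.cpp hunk compares bit-packed k-mer encodings directly. The packing of `dnaEncoding` is not spelled out in the patch; the shift and mask suggest eight 3-bit codes, one per k-mer position, with the first position in the low bits, so that dropping the oldest position of one match and the newest position of the other leaves the same 7-position overlap. The previous version applied the same idea to the 2-bit-per-position `rightEndHamming` field (shift by 2, mask `0x3FFF` = 14 bits); the new one shifts by 3 and masks `0x1FFFFF` = 21 bits. A small self-checking sketch under that packing assumption:

```cpp
#include <cassert>
#include <cstdint>

// Two matches are consecutive when enc1 without its first (oldest) position
// equals enc2 without its last (newest) position: a 7-position overlap.
bool isConsecutive(uint32_t enc1, uint32_t enc2) {
    // enc1: positions 8..1 -> drop position 1 (low 3 bits)
    // enc2: positions 9..2 -> drop position 9 (keep low 21 bits)
    return (enc1 >> 3) == (enc2 & 0x1FFFFF);
}

int main() {
    // Build two overlapping encodings from 3-bit codes c1..c9.
    uint32_t codes[9] = {1, 2, 3, 4, 5, 6, 7, 0, 5};
    uint32_t enc1 = 0, enc2 = 0;
    for (int i = 7; i >= 0; --i) enc1 = (enc1 << 3) | codes[i];  // c8..c1, c1 in low bits
    for (int i = 8; i >= 1; --i) enc2 = (enc2 << 3) | codes[i];  // c9..c2, c2 in low bits
    assert(isConsecutive(enc1, enc2));
    return 0;
}
```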