From 797e7d67af7e47263e6ebcc05b6958fe9819fe85 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Thu, 22 Aug 2024 09:38:13 +0900 Subject: [PATCH 01/21] Query k-mer extraction process is optimized. previous 1. Load thread num of sequence 2. Each thread process one sequence Now 1. One thread keeps loading sequences for other threads to process - for each thread, 1000 sequences are loaded 2. When 1000 seuquences are loaded, one of idle threads take them to process --- src/commons/IndexCreator.cpp | 19 +- src/commons/KmerExtractor.cpp | 435 +++++++++++++++++++++++--------- src/commons/KmerExtractor.h | 8 + src/commons/ProdigalWrapper.cpp | 225 ++++++++++++++++- src/commons/ProdigalWrapper.h | 6 + src/commons/SeqIterator.cpp | 256 ++----------------- src/commons/SeqIterator.h | 31 +-- src/commons/common.h | 12 + 8 files changed, 599 insertions(+), 393 deletions(-) diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp index c89ae373..8a584c00 100644 --- a/src/commons/IndexCreator.cpp +++ b/src/commons/IndexCreator.cpp @@ -866,6 +866,7 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer, kseq_buffer_t buffer; kseq_t *seq; vector intergenicKmers; + vector aaSeq; #pragma omp for schedule(dynamic, 1) for (size_t i = 0; i < fnaSplits.size(); i++) { if (!checker[i] && !hasOverflow) { @@ -917,7 +918,7 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer, prodigal->getPredictedGenes(seq->seq.s); seqIterator.generateIntergenicKmerList(prodigal->genes, prodigal->nodes, prodigal->getNumberOfPredictedGenes(), - intergenicKmers,seq->seq.s); + intergenicKmers, seq->seq.s); // Get min k-mer hash list for determining strandness seqIterator.getMinHashList(standardList, seq->seq.s); @@ -941,7 +942,7 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer, // Get extended ORFs prodigal->getPredictedGenes(seq->seq.s); prodigal->removeCompletelyOverlappingGenes(); - seqIterator.getExtendedORFs(prodigal->finalGenes, prodigal->nodes, extendedORFs, + prodigal->getExtendedORFs(prodigal->finalGenes, prodigal->nodes, extendedORFs, prodigal->fng, strlen(seq->seq.s), orfNum, intergenicKmers, seq->seq.s); // Get masked sequence @@ -955,14 +956,16 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer, // Get k-mers from extended ORFs for (size_t orfCnt = 0; orfCnt < orfNum; orfCnt++) { - seqIterator.translateBlock(maskedSeq, extendedORFs[orfCnt]); + aaSeq.clear(); + seqIterator.translateBlock(maskedSeq, extendedORFs[orfCnt], aaSeq); tempCheck = seqIterator.fillBufferWithKmerFromBlock( extendedORFs[orfCnt], maskedSeq, kmerBuffer, posToWrite, int(processedSeqCnt[fnaSplits[i].file_idx] + fnaSplits[i].offset + s_cnt), - fnaSplits[i].speciesID); + fnaSplits[i].speciesID, + aaSeq); if (tempCheck == -1) { cout << "ERROR: Buffer overflow " << seq->name.s << seq->seq.l << endl; } @@ -976,7 +979,7 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer, // Get extended ORFs prodigal->getPredictedGenes(reverseCompliment); prodigal->removeCompletelyOverlappingGenes(); - seqIterator.getExtendedORFs(prodigal->finalGenes, prodigal->nodes, extendedORFs, + prodigal->getExtendedORFs(prodigal->finalGenes, prodigal->nodes, extendedORFs, prodigal->fng, strlen(reverseCompliment), orfNum, intergenicKmers, reverseCompliment); @@ -990,14 +993,16 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer, } for (size_t orfCnt = 0; orfCnt < orfNum; orfCnt++) { - seqIterator.translateBlock(maskedSeq, extendedORFs[orfCnt]); + aaSeq.clear(); + seqIterator.translateBlock(maskedSeq, extendedORFs[orfCnt], aaSeq); tempCheck = seqIterator.fillBufferWithKmerFromBlock( extendedORFs[orfCnt], maskedSeq, kmerBuffer, posToWrite, int(processedSeqCnt[fnaSplits[i].file_idx] + fnaSplits[i].offset + s_cnt), - fnaSplits[i].speciesID); + fnaSplits[i].speciesID, + aaSeq); if (tempCheck == -1) { cout << "ERROR: Buffer overflow " << seq->name.s << seq->seq.l << endl; } diff --git a/src/commons/KmerExtractor.cpp b/src/commons/KmerExtractor.cpp index c9edff9b..81f90495 100644 --- a/src/commons/KmerExtractor.cpp +++ b/src/commons/KmerExtractor.cpp @@ -1,8 +1,8 @@ #include "KmerExtractor.h" -#include "common.h" #include KmerExtractor::KmerExtractor(const LocalParameters &par) { + seqIterator = new SeqIterator(par); spaceNum = 0; maskMode = par.maskMode; maskProb = par.maskProb; @@ -11,6 +11,7 @@ KmerExtractor::KmerExtractor(const LocalParameters &par) { } KmerExtractor::~KmerExtractor() { + delete seqIterator; delete probMatrix; delete subMat; } @@ -46,72 +47,112 @@ void KmerExtractor::extractQueryKmers(QueryKmerBuffer &kmerBuffer, cout << "Time spent for sorting query metamer list: " << double(time(nullptr) - beforeQueryKmerSort) << endl; } -void KmerExtractor::fillQueryKmerBufferParallel(KSeqWrapper *kseq1, +void KmerExtractor::fillQueryKmerBufferParallel(KSeqWrapper *kseq, QueryKmerBuffer &kmerBuffer, - vector &queryList, + std::vector &queryList, const QuerySplit ¤tSplit, - const LocalParameters &par) { + const LocalParameters &par) { + size_t readLength = 1000; size_t processedQueryNum = 0; + size_t chunkSize = 1000; - // Array to store reads of thread number - vector reads1(par.threads); - - while (processedQueryNum < currentSplit.readCnt) { - size_t currentQueryNum = min(currentSplit.readCnt - processedQueryNum, (size_t) par.threads); - size_t count = 0; - while (count < currentQueryNum) { - // Read query - kseq1->ReadEntry(); - const KSeqWrapper::KSeqEntry & e1 = kseq1->entry; - - // Get k-mer count - int kmerCnt = LocalUtil::getQueryKmerNumber((int) e1.sequence.l, spaceNum); - - // Query Info - queryList[processedQueryNum].queryLength = LocalUtil::getMaxCoveredLength((int) e1.sequence.l); - queryList[processedQueryNum].name = string(e1.name.s); - queryList[processedQueryNum].kmerCnt = (int) (kmerCnt); - - // Store reads - reads1[count] = string(kseq1->entry.sequence.s); - - processedQueryNum ++; - count ++; + // Reserve memory for each read of the chunk for each thread + std::vector> chunkReads_thread(par.threads); + for (size_t i = 0; i < par.threads; ++i) { + chunkReads_thread[i].resize(chunkSize); + for (size_t j = 0; j < chunkSize; ++j) { + chunkReads_thread[i][j].reserve(readLength); } -#pragma omp parallel default(none), shared(par, kmerBuffer, cout, processedQueryNum, queryList, currentQueryNum, count, reads1) + } + + // Initialize atomic variable for active tasks + std::vector> busyThreads(par.threads); + for (size_t i = 0; i < par.threads; ++i) { + busyThreads[i].store(false); + } + + // OpenMP parallel region with tasks +#pragma omp parallel default(none) shared(par, readLength, kmerBuffer, \ +queryList, currentSplit, processedQueryNum, kseq, chunkSize, chunkReads_thread, busyThreads) + { + char *maskedSeq = new char[readLength]; + char *seq = nullptr; + vector aaFrames_read[6]; + // reserve memory for each frame + for (int i = 0; i < 6; ++i) { + aaFrames_read[i].reserve(readLength/3); + } +#pragma omp single nowait { - SeqIterator seqIterator(par); - size_t posToWrite; -#pragma omp for schedule(dynamic, 1) - for (size_t i = 0; i < currentQueryNum; i ++) { - size_t queryIdx = processedQueryNum - currentQueryNum + i; - // Get k-mer count - int kmerCnt = LocalUtil::getQueryKmerNumber(reads1[i].length(), spaceNum); + int masterThread = omp_get_thread_num(); + size_t count = 0; + while (processedQueryNum < currentSplit.readCnt) { + // Find an idle thread + int threadId; + for (threadId = 0; threadId < par.threads; ++threadId) { + if (threadId == masterThread) { continue; } + if (!busyThreads[threadId].load()) { + busyThreads[threadId].store(true); + break; + } + } - // Ignore short read - if (kmerCnt < 1) { continue; } - - // Get masked sequence - char *maskedSeq1 = nullptr; - if (maskMode) { - maskedSeq1 = new char[reads1[i].length() + 1]; - SeqIterator::maskLowComplexityRegions(reads1[i].c_str(),maskedSeq1, *probMatrix, maskProb, subMat); - } else { - maskedSeq1 = const_cast(reads1[i].c_str()); + if (threadId == par.threads) { + continue; } - posToWrite = kmerBuffer.reserveMemory(kmerCnt); - - // Process Read 1 - seqIterator.sixFrameTranslation(maskedSeq1, (int) reads1[i].length()); - seqIterator.fillQueryKmerBuffer(maskedSeq1, (int) reads1[i].length(), kmerBuffer, posToWrite, - (uint32_t) queryIdx); + size_t chunkEnd = min(processedQueryNum + chunkSize, currentSplit.readCnt); + count = 0; - if (maskMode) { - delete[] maskedSeq1; + for (size_t i = 0; i < chunkSize && processedQueryNum < chunkEnd; ++i, ++processedQueryNum) { + kseq->ReadEntry(); + queryList[processedQueryNum].name = string(kseq->entry.name.s); + queryList[processedQueryNum].queryLength = LocalUtil::getMaxCoveredLength((int) kseq->entry.sequence.l); + + // Check if the read is too short + int kmerCnt = LocalUtil::getQueryKmerNumber((int) kseq->entry.sequence.l, spaceNum); + if (kmerCnt < 1) { + chunkReads_thread[threadId][i] = ""; + queryList[processedQueryNum].kmerCnt = 0; + } else { + chunkReads_thread[threadId][i] = string(kseq->entry.sequence.s); + queryList[processedQueryNum].kmerCnt = kmerCnt; + } + count++; } + + // Process each chunk by idle threads + #pragma omp task firstprivate(count, processedQueryNum, threadId) + { + for (size_t i = 0; i < count; ++i) { + size_t queryIdx = processedQueryNum - count + i; + if (chunkReads_thread[threadId][i].empty()) { continue; } + + // Get masked sequence + if (maskMode) { + if (readLength < chunkReads_thread[threadId][i].length() + 1) { + readLength = chunkReads_thread[threadId][i].length() + 1; + delete[] maskedSeq; + maskedSeq = new char[readLength]; + } + SeqIterator::maskLowComplexityRegions(chunkReads_thread[threadId][i].c_str(), maskedSeq, *probMatrix, maskProb, subMat); + seq = maskedSeq; + } else { + seq = const_cast(chunkReads_thread[threadId][i].c_str()); + } + + size_t posToWrite = kmerBuffer.reserveMemory(queryList[queryIdx].kmerCnt); + + // Process Read 1 + seqIterator->sixFrameTranslation(seq, (int) chunkReads_thread[threadId][i].length(), aaFrames_read); + seqIterator->fillQueryKmerBuffer(seq, (int) chunkReads_thread[threadId][i].length(), kmerBuffer, posToWrite, + (uint32_t) queryIdx, aaFrames_read); + } + busyThreads[threadId].store(false); + } } } + delete[] maskedSeq; } } @@ -121,89 +162,231 @@ void KmerExtractor::fillQueryKmerBufferParallel_paired(KSeqWrapper *kseq1, vector &queryList, const QuerySplit ¤tSplit, const LocalParameters &par) { + size_t readLength = 1000; size_t processedQueryNum = 0; - - // Array to store reads of thread number - vector reads1(par.threads); - vector reads2(par.threads); - - while (processedQueryNum < currentSplit.readCnt) { - size_t currentQueryNum = min(currentSplit.readCnt - processedQueryNum, (size_t) par.threads); - size_t count = 0; - - // Fill reads in sequential - while (count < currentQueryNum) { - // Read query - kseq1->ReadEntry(); - kseq2->ReadEntry(); - const KSeqWrapper::KSeqEntry & e1 = kseq1->entry; - const KSeqWrapper::KSeqEntry & e2 = kseq2->entry; - - // Get k-mer count - int kmerCnt = LocalUtil::getQueryKmerNumber((int) e1.sequence.l, spaceNum); - int kmerCnt2 = LocalUtil::getQueryKmerNumber((int) e2.sequence.l, spaceNum); - - // Query Info - queryList[processedQueryNum].queryLength = LocalUtil::getMaxCoveredLength((int) e1.sequence.l); - queryList[processedQueryNum].queryLength2 = LocalUtil::getMaxCoveredLength((int) e2.sequence.l); - queryList[processedQueryNum].name = string(e1.name.s); - queryList[processedQueryNum].kmerCnt = (int) (kmerCnt + kmerCnt2); - - // Store reads - reads1[count] = string(kseq1->entry.sequence.s); - reads2[count] = string(kseq2->entry.sequence.s); - - processedQueryNum ++; - count ++; + size_t chunkSize = 1000; + // Reserve memory for each read of the chunk for each thread + std::vector> chunkReads1_thread(par.threads); + std::vector> chunkReads2_thread(par.threads); + for (size_t i = 0; i < par.threads; ++i) { + chunkReads1_thread[i].resize(chunkSize); + chunkReads2_thread[i].resize(chunkSize); + for (size_t j = 0; j < chunkSize; ++j) { + chunkReads1_thread[i][j].reserve(readLength); + chunkReads2_thread[i][j].reserve(readLength); } + } + + // Initialize atomic variable for active tasks + std::vector> busyThreads(par.threads); + for (size_t i = 0; i < par.threads; ++i) { + busyThreads[i].store(false); + } - // Process reads in parallel -#pragma omp parallel default(none), shared(par, kmerBuffer, cout, processedQueryNum, queryList, currentQueryNum, currentSplit, count, reads1, reads2) + // OpenMP parallel region with tasks +#pragma omp parallel default(none) shared(par, readLength, \ +kmerBuffer, queryList, currentSplit, processedQueryNum, kseq1, kseq2, \ +chunkSize, chunkReads1_thread, chunkReads2_thread, busyThreads, cout) + { + char *maskedSeq1 = new char[readLength]; + char *maskedSeq2 = new char[readLength]; + char *seq1 = nullptr; + char *seq2 = nullptr; + vector aaFrames_read1[6]; + vector aaFrames_read2[6]; + // reserve memory for each frame + for (int i = 0; i < 6; ++i) { + aaFrames_read1[i].reserve(readLength/3); + aaFrames_read2[i].reserve(readLength/3); + } +#pragma omp single nowait { - SeqIterator seqIterator(par); - SeqIterator seqIterator2(par); - size_t posToWrite; -#pragma omp for schedule(dynamic, 1) - for (size_t i = 0; i < currentQueryNum; i ++) { - size_t queryIdx = processedQueryNum - currentQueryNum + i; - // Get k-mer count - int kmerCnt = LocalUtil::getQueryKmerNumber(reads1[i].length(), spaceNum); - int kmerCnt2 = LocalUtil::getQueryKmerNumber(reads2[i].length(), spaceNum); - - // Ignore short read - if (kmerCnt2 < 1 || kmerCnt < 1) { continue; } - - // Get masked sequence - char *maskedSeq1 = nullptr; - char *maskedSeq2 = nullptr; - if (maskMode) { - maskedSeq1 = new char[reads1[i].length() + 1]; - maskedSeq2 = new char[reads2[i].length() + 1]; - SeqIterator::maskLowComplexityRegions(reads1[i].c_str(),maskedSeq1, *probMatrix, maskProb, subMat); - SeqIterator::maskLowComplexityRegions(reads2[i].c_str(),maskedSeq2, *probMatrix, maskProb, subMat); - } else { - maskedSeq1 = const_cast(reads1[i].c_str()); - maskedSeq2 = const_cast(reads2[i].c_str()); + int masterThread = omp_get_thread_num(); + size_t count = 0; + while (processedQueryNum < currentSplit.readCnt) { + // Find an idle thread + int threadId; + for (threadId = 0; threadId < par.threads; ++threadId) { + if (threadId == masterThread) { continue; } + if (!busyThreads[threadId].load()) { + busyThreads[threadId].store(true); + break; + } } - posToWrite = kmerBuffer.reserveMemory(kmerCnt + kmerCnt2); - - // Process Read 1 - seqIterator.sixFrameTranslation(maskedSeq1, (int) reads1[i].length()); - seqIterator.fillQueryKmerBuffer(maskedSeq1, (int) reads1[i].length(), kmerBuffer, posToWrite, - (uint32_t) queryIdx); + if (threadId == par.threads) { + continue; + } - // Process Read 2 - seqIterator2.sixFrameTranslation(maskedSeq2, (int) reads2[i].length()); - seqIterator2.fillQueryKmerBuffer(maskedSeq2, (int) reads2[i].length(), kmerBuffer, posToWrite, - (uint32_t) queryIdx, queryList[queryIdx].queryLength+3); + size_t chunkEnd = min(processedQueryNum + chunkSize, currentSplit.readCnt); + count = 0; - if (maskMode) { - delete[] maskedSeq1; - delete[] maskedSeq2; + for (size_t i = 0; i < chunkSize && processedQueryNum < chunkEnd; ++i, ++processedQueryNum) { + kseq1->ReadEntry(); + kseq2->ReadEntry(); + queryList[processedQueryNum].name = string(kseq1->entry.name.s); + queryList[processedQueryNum].queryLength = LocalUtil::getMaxCoveredLength((int) kseq1->entry.sequence.l); + queryList[processedQueryNum].queryLength2 = LocalUtil::getMaxCoveredLength((int) kseq2->entry.sequence.l); + + // Check if the read is too short + int kmerCnt1 = LocalUtil::getQueryKmerNumber((int) kseq1->entry.sequence.l, spaceNum); + int kmerCnt2 = LocalUtil::getQueryKmerNumber((int) kseq2->entry.sequence.l, spaceNum); + if (kmerCnt1 < 1 || kmerCnt2 < 1) { + chunkReads1_thread[threadId][i] = ""; + chunkReads2_thread[threadId][i] = ""; + queryList[processedQueryNum].kmerCnt = 0; + } else { + // cout << threadId << endl + chunkReads1_thread[threadId][i] = string(kseq1->entry.sequence.s); + chunkReads2_thread[threadId][i] = string(kseq2->entry.sequence.s); + queryList[processedQueryNum].kmerCnt = kmerCnt1 + kmerCnt2; + } + count++; } + + + // Process each chunk by idle threads + #pragma omp task firstprivate(count, processedQueryNum, threadId) + { + for (size_t i = 0; i < count; ++i) { + size_t queryIdx = processedQueryNum - count + i; + if (chunkReads1_thread[threadId][i].empty() || chunkReads2_thread[threadId][i].empty()) { continue; } + + // Get masked sequence + if (maskMode) { + if (readLength < chunkReads1_thread[threadId][i].length() + 1 || readLength < chunkReads2_thread[threadId][i].length() + 1) { + readLength = max(chunkReads1_thread[threadId][i].length() + 1, chunkReads2_thread[threadId][i].length() + 1); + delete[] maskedSeq1; + delete[] maskedSeq2; + maskedSeq1 = new char[readLength]; + maskedSeq2 = new char[readLength]; + } + SeqIterator::maskLowComplexityRegions(chunkReads1_thread[threadId][i].c_str(),maskedSeq1, *probMatrix, maskProb, subMat); + SeqIterator::maskLowComplexityRegions(chunkReads2_thread[threadId][i].c_str(),maskedSeq2, *probMatrix, maskProb, subMat); + seq1 = maskedSeq1; + seq2 = maskedSeq2; + } else { + seq1 = const_cast(chunkReads1_thread[threadId][i].c_str()); + seq2 = const_cast(chunkReads2_thread[threadId][i].c_str()); + } + + size_t posToWrite = kmerBuffer.reserveMemory(queryList[queryIdx].kmerCnt); + + // Process Read 1 + seqIterator->sixFrameTranslation(seq1, (int) chunkReads1_thread[threadId][i].length(), aaFrames_read1); + seqIterator->fillQueryKmerBuffer(seq1, (int) chunkReads1_thread[threadId][i].length(), kmerBuffer, posToWrite, + (uint32_t) queryIdx, aaFrames_read1); + + // Process Read 2 + seqIterator->sixFrameTranslation(seq2, (int) chunkReads2_thread[threadId][i].length(), aaFrames_read2); + seqIterator->fillQueryKmerBuffer(seq2, (int) chunkReads2_thread[threadId][i].length(), kmerBuffer, posToWrite, + (uint32_t) queryIdx, aaFrames_read2, queryList[queryIdx].queryLength+3); + } + busyThreads[threadId].store(false); + } } } + delete[] maskedSeq1; + delete[] maskedSeq2; } } + + + +// void KmerExtractor::fillQueryKmerBufferParallel_paired2(KSeqWrapper *kseq1, +// KSeqWrapper *kseq2, +// QueryKmerBuffer &kmerBuffer, +// vector &queryList, +// const QuerySplit ¤tSplit, +// const LocalParameters &par) { +// size_t processedQueryNum = 0; + +// // Array to store reads of thread number +// vector reads1(par.threads); +// vector reads2(par.threads); + +// while (processedQueryNum < currentSplit.readCnt) { +// size_t currentQueryNum = min(currentSplit.readCnt - processedQueryNum, (size_t) par.threads); +// size_t count = 0; + +// // Fill reads in sequential +// while (count < currentQueryNum) { +// // Read query +// kseq1->ReadEntry(); +// kseq2->ReadEntry(); +// const KSeqWrapper::KSeqEntry & e1 = kseq1->entry; +// const KSeqWrapper::KSeqEntry & e2 = kseq2->entry; + +// // Get k-mer count +// int kmerCnt = LocalUtil::getQueryKmerNumber((int) e1.sequence.l, spaceNum); +// int kmerCnt2 = LocalUtil::getQueryKmerNumber((int) e2.sequence.l, spaceNum); + +// // Query Info +// queryList[processedQueryNum].queryLength = LocalUtil::getMaxCoveredLength((int) e1.sequence.l); +// queryList[processedQueryNum].queryLength2 = LocalUtil::getMaxCoveredLength((int) e2.sequence.l); +// queryList[processedQueryNum].name = string(e1.name.s); +// queryList[processedQueryNum].kmerCnt = (int) (kmerCnt + kmerCnt2); + +// // Store reads +// reads1[count] = string(kseq1->entry.sequence.s); +// reads2[count] = string(kseq2->entry.sequence.s); + +// processedQueryNum ++; +// count ++; +// } + +// // Process reads in parallel +// #pragma omp parallel default(none), shared(par, kmerBuffer, cout, processedQueryNum, queryList, currentQueryNum, currentSplit, count, reads1, reads2) +// { +// size_t posToWrite; +// size_t readLength = 1000; +// char *maskedSeq1 = new char[readLength]; +// char *maskedSeq2 = new char[readLength]; +// char *seq1 = nullptr; +// char *seq2 = nullptr; +// #pragma omp for schedule(dynamic, 1) +// for (size_t i = 0; i < currentQueryNum; i ++) { +// size_t queryIdx = processedQueryNum - currentQueryNum + i; +// // Get k-mer count +// int kmerCnt = LocalUtil::getQueryKmerNumber(reads1[i].length(), spaceNum); +// int kmerCnt2 = LocalUtil::getQueryKmerNumber(reads2[i].length(), spaceNum); + +// // Ignore short read +// if (kmerCnt2 < 1 || kmerCnt < 1) { continue; } + +// // Get masked sequence +// if (maskMode) { +// if (readLength < reads1[i].length() + 1 || readLength < reads2[i].length() + 1) { +// readLength = max(reads1[i].length() + 1, reads2[i].length() + 1); +// delete[] maskedSeq1; +// delete[] maskedSeq2; +// maskedSeq1 = new char[readLength]; +// maskedSeq2 = new char[readLength]; +// } +// SeqIterator::maskLowComplexityRegions(reads1[i].c_str(),maskedSeq1, *probMatrix, maskProb, subMat); +// SeqIterator::maskLowComplexityRegions(reads2[i].c_str(),maskedSeq2, *probMatrix, maskProb, subMat); +// seq1 = maskedSeq1; +// seq2 = maskedSeq2; +// } else { +// seq1 = const_cast(reads1[i].c_str()); +// seq2 = const_cast(reads2[i].c_str()); +// } + +// posToWrite = kmerBuffer.reserveMemory(kmerCnt + kmerCnt2); + +// // Process Read 1 +// seqIterator->sixFrameTranslation(seq1, (int) reads1[i].length()); +// seqIterator->fillQueryKmerBuffer(seq1, (int) reads1[i].length(), kmerBuffer, posToWrite, +// (uint32_t) queryIdx); + +// // Process Read 2 +// seqIterator->sixFrameTranslation(seq2, (int) reads2[i].length()); +// seqIterator->fillQueryKmerBuffer(seq2, (int) reads2[i].length(), kmerBuffer, posToWrite, +// (uint32_t) queryIdx, queryList[queryIdx].queryLength+3); +// } +// delete[] maskedSeq1; +// delete[] maskedSeq2; +// } +// } +// } diff --git a/src/commons/KmerExtractor.h b/src/commons/KmerExtractor.h index bac04a27..b77aa122 100644 --- a/src/commons/KmerExtractor.h +++ b/src/commons/KmerExtractor.h @@ -8,6 +8,7 @@ class KmerExtractor { private: + SeqIterator * seqIterator; // Parameters int spaceNum; int maskMode; @@ -31,6 +32,13 @@ class KmerExtractor { const QuerySplit & currentSplit, const LocalParameters &par); + void fillQueryKmerBufferParallel_paired2(KSeqWrapper* kseq1, + KSeqWrapper* kseq2, + QueryKmerBuffer &kmerBuffer, + vector &queryList, + const QuerySplit & currentSplit, + const LocalParameters &par); + public: explicit KmerExtractor(const LocalParameters & par); ~KmerExtractor(); diff --git a/src/commons/ProdigalWrapper.cpp b/src/commons/ProdigalWrapper.cpp index 9e9c6aec..ca6c381d 100644 --- a/src/commons/ProdigalWrapper.cpp +++ b/src/commons/ProdigalWrapper.cpp @@ -1,4 +1,5 @@ #include "ProdigalWrapper.h" +#include "SeqIterator.h" #include ProdigalWrapper::~ProdigalWrapper() { for(size_t i = 0; i < NUM_META; i++){ @@ -333,4 +334,226 @@ void ProdigalWrapper::setTrainingInfo(_training &tinf) { void ProdigalWrapper::updateTrainingInfo(_training &tinf2) { this->tinf = tinf2; -} \ No newline at end of file +} + +// It makes the blocks for translation +// Each block has a predicted gene part and an intergenic region. When another gene shows up, new block starts. +void ProdigalWrapper::getExtendedORFs(struct _gene *genes, struct _node *nodes, vector &blocks, + size_t numOfGene, size_t length, + size_t &blockIdx, vector &intergenicKmerList, const char *seq) { + + //Exceptional case 1: 0 prdicted gene + if (numOfGene == 0) { + blocks.emplace_back(0, length - 1, 1); + blockIdx++; + return; + } + + //Exceptional case 2: Only 1 prdicted gene + int frame; + int rightEnd = 0; + int leftEnd = 0; + if (numOfGene == 1) { + if (nodes[genes[0].start_ndx].strand == 1) { //forward + frame = (genes[0].begin - 1) % 3; + leftEnd = 0; + while (leftEnd % 3 != frame) leftEnd++; // y - (y - x) % 3 .. which would be faster? + blocks.emplace_back(leftEnd, length - 1, 1); + blockIdx++; + } else { //reverse + frame = (genes[0].end - 1) % 3; + rightEnd = length - 1; + while (rightEnd % 3 != frame) rightEnd--; + blocks.emplace_back(0, rightEnd, -1); + blockIdx++; + } + return; + } + + + /* Main routine */ + + bool hasBeenExtendedToLeft = false; + int k = 23; + char *newIntergenicKmer = (char *) malloc(sizeof(char) * (k + 1)); + char *leftKmer = (char *) malloc(sizeof(char) * (k + 1)); + char *rightKmer = (char *) malloc(sizeof(char) * (k + 1)); + char *leftKmerReverse = (char *) malloc(sizeof(char) * (k + 1)); + char *rightKmerReverse = (char *) malloc(sizeof(char) * (k + 1)); + bool isReverse = false; + uint64_t leftKmerHash = 0, rightKmerHash = 0; + + + // Extend the first gene to cover intergenic regions + if (nodes[genes[0].start_ndx].strand == 1) { //forward + frame = (genes[0].begin - 1) % 3; + leftEnd = 0; + while (leftEnd % 3 != frame) leftEnd++; + blocks.emplace_back(leftEnd, genes[1].begin - 1 + 22, 1); + blockIdx++; + } else { + frame = (genes[0].end - 1) % 3; + rightEnd = genes[1].begin - 1 + 22; + while (rightEnd % 3 != frame) rightEnd--; + blocks.emplace_back(0, rightEnd, -1); + blockIdx++; + } + + // From the second gene to the second last gene + for (size_t geneIdx = 1; geneIdx < numOfGene - 1; geneIdx++) { + isReverse = false; + + // Make two k-mer hash; each from left and right of current gene. They are used for choosing extension direction. + strncpy(leftKmer, seq + genes[geneIdx].begin - 1 - k, k); + strncpy(rightKmer, seq + genes[geneIdx].end, k); + if (nodes[genes[geneIdx].start_ndx].strand == 1) { + leftKmerHash = XXH64(leftKmer, k, 0); + rightKmerHash = XXH64(rightKmer, k, 0); + } else { + isReverse = true; + for (int j = k - 1; j >= 0; j--) { + leftKmerReverse[k - j - 1] = SeqIterator::iRCT[leftKmer[j]]; + rightKmerReverse[k - j - 1] = SeqIterator::iRCT[rightKmer[j]]; + } + leftKmerHash = XXH64(leftKmerReverse, k, 0); + rightKmerHash = XXH64(rightKmerReverse, k, 0); + } + + // Extend genes to cover intergenic regions + if (find(intergenicKmerList.begin(), intergenicKmerList.end(), leftKmerHash) != + intergenicKmerList.end()) { //Extension to left + if (!hasBeenExtendedToLeft) { + if (!isReverse) { // Forward + blocks.emplace_back(genes[geneIdx].begin - 1, genes[geneIdx].end - 1, 1); + blockIdx++; + } else { // Reverse + blocks.emplace_back(genes[geneIdx].begin - 1, genes[geneIdx].end - 1, -1); + blockIdx++; + } + } else { + if (!isReverse) { //forward + frame = (genes[geneIdx].begin - 1) % 3; + leftEnd = genes[geneIdx - 1].end -1 -22; + while (leftEnd % 3 != frame) leftEnd++; + blocks.emplace_back(leftEnd, genes[geneIdx].end - 1, 1); + blockIdx++; + } else { // reverse + blocks.emplace_back(genes[geneIdx - 1].end - 22 - 1, genes[geneIdx].end - 1, -1); + blockIdx++; + } + } + hasBeenExtendedToLeft = true; + } else { // Extension to right + if (hasBeenExtendedToLeft) { + if (!isReverse) { //forward + frame = (genes[geneIdx].begin - 1) % 3; + leftEnd = genes[geneIdx - 1].end - 1 - 22; + while (leftEnd % 3 != frame) leftEnd++; + blocks.emplace_back(leftEnd, genes[geneIdx + 1].begin - 1 + 22, 1); + blockIdx++; + } else { + frame = (genes[geneIdx].end - 1) % 3; + rightEnd = genes[geneIdx + 1].begin - 1 + 22; + while (rightEnd % 3 != frame) rightEnd--; + blocks.emplace_back(genes[geneIdx - 1].end - 1 - 22, rightEnd, -1); + blockIdx++; + } + } else { + if (!isReverse) { //forward + blocks.emplace_back(genes[geneIdx].begin - 1, genes[geneIdx + 1].begin - 1 + 22, 1); + blockIdx++; + } else { + frame = (genes[geneIdx].end - 1) % 3; + rightEnd = genes[geneIdx + 1].begin - 1 + 22; + while (rightEnd % 3 != frame) rightEnd--; + blocks.emplace_back(genes[geneIdx].begin - 1, rightEnd, -1); + blockIdx++; + } + } + hasBeenExtendedToLeft = false; + + if (find(intergenicKmerList.begin(), intergenicKmerList.end(), rightKmerHash) == intergenicKmerList.end()) { + intergenicKmerList.push_back(rightKmerHash); + } + } + } + + // // For the last gene + // // Extend to the end of the genome + // isReverse = !(nodes[genes[numOfGene - 1].start_ndx].strand == 1); + // rightEnd = length - 1; + // if (isReverse) { + // frame = (genes[numOfGene - 1].end - 1) % 3; + // while (rightEnd % 3 != frame) rightEnd--; + // } + // // If left region is not covered, cover it. + // leftEnd = genes[numOfGene - 1].begin - 1; + // if (hasBeenExtendedToLeft) { + // leftEnd = genes[numOfGene - 2].end - 1 - 22; + // if (!isReverse) { + // frame = (genes[numOfGene - 1].begin - 1) % 3; + // while (leftEnd % 3 != frame) leftEnd++; + // } + // } + // blocks.emplace_back(leftEnd, rightEnd, isReverse ? -1 : 1); + // if (find(intergenicKmerList.begin(), intergenicKmerList.end(), rightKmerHash) == intergenicKmerList.end()) { + // intergenicKmerList.push_back(rightKmerHash); + // } + + //For the last gene + if (find(intergenicKmerList.begin(), intergenicKmerList.end(), leftKmerHash) != + intergenicKmerList.end()) { //extension to left + if (!isReverse) { //forward + frame = (genes[numOfGene - 1].begin - 1) % 3; + leftEnd = genes[numOfGene - 2].end - 1 - 22; + while (leftEnd % 3 != frame) leftEnd++; + blocks.emplace_back(leftEnd, length - 1, 1); + blockIdx++; + } else { // reverse + frame = (genes[numOfGene - 1].end - 1) % 3; + rightEnd = length - 1; + while (rightEnd % 3 != frame) rightEnd--; + blocks.emplace_back(genes[numOfGene - 2].end - 22 - 1, rightEnd, -1); + blockIdx++; + } + } else { //extension to right + if (hasBeenExtendedToLeft) { + if (!isReverse) { //forward + frame = (genes[numOfGene - 1].begin - 1) % 3; + leftEnd = genes[numOfGene - 2].end - 1 - 22; + while (leftEnd % 3 != frame) leftEnd++; + blocks.emplace_back(leftEnd, length - 1, 1); + blockIdx++; + } else { + frame = (genes[numOfGene - 1].end - 1) % 3; + rightEnd = length - 1; + while (rightEnd % 3 != frame) rightEnd--; + blocks.emplace_back(genes[numOfGene - 2].end - 22 - 1, rightEnd, -1); + blockIdx++; + } + } else { + if (!isReverse) { + blocks.emplace_back(genes[numOfGene - 1].begin, length - 1, 1); + blockIdx++; + } else { + frame = (genes[numOfGene - 1].end - 1) % 3; + rightEnd = length - 1; + while (rightEnd % 3 != frame) rightEnd--; + blocks.emplace_back(genes[numOfGene - 1].begin - 1, rightEnd, -1); + blockIdx++; + } + } + + //If current intergenic sequences is new, update intergenicKmerList. + if (find(intergenicKmerList.begin(), intergenicKmerList.end(), rightKmerHash) == intergenicKmerList.end()) { + intergenicKmerList.push_back(rightKmerHash); + } + } + + + free(newIntergenicKmer); + free(leftKmer); + free(rightKmer); + free(leftKmerReverse); + free(rightKmerReverse); +} diff --git a/src/commons/ProdigalWrapper.h b/src/commons/ProdigalWrapper.h index 15d5c219..8570d1c4 100644 --- a/src/commons/ProdigalWrapper.h +++ b/src/commons/ProdigalWrapper.h @@ -16,6 +16,10 @@ #include "prodigalsequence.h" #include "training.h" #include "printBinary.h" +#include "common.h" +#include +#include "xxhash.h" +#include "SeqIterator.h" #define MIN_SINGLE_GENOME 20000 #define IDEAL_SINGLE_GENOME 100000 @@ -60,5 +64,7 @@ class ProdigalWrapper{ void printGenes(); ProdigalWrapper(); ~ProdigalWrapper(); + void getExtendedORFs(struct _gene *genes, struct _node *nodes, std::vector &blocks, size_t numOfGene, + size_t length, size_t &numOfBlocks, std::vector &intergenicKmerList, const char *seq); }; #endif //ADCLASSIFIER2_PRODIGALWRAPPER_H diff --git a/src/commons/SeqIterator.cpp b/src/commons/SeqIterator.cpp index ddb1b132..1f42dbd0 100644 --- a/src/commons/SeqIterator.cpp +++ b/src/commons/SeqIterator.cpp @@ -10,8 +10,6 @@ const string SeqIterator::iRCT = ".............................................. "................................................................" "................................................................"; - - SeqIterator::~SeqIterator() { delete[] mask; delete[] mask_int; @@ -255,7 +253,7 @@ SeqIterator::SeqIterator(const LocalParameters &par) { } // It translates a DNA sequence to amino acid sequences with six frames -void SeqIterator::sixFrameTranslation(const char *seq, int len) { +void SeqIterator::sixFrameTranslation(const char *seq, int len, vector *aaFrames) { for (int i = 0; i < 6; i++) { aaFrames[i].clear(); } size_t end = len - 1; @@ -314,8 +312,15 @@ void SeqIterator::sixFrameTranslation(const char *seq, int len) { } } -void SeqIterator::fillQueryKmerBuffer(const char *seq, int seqLen, QueryKmerBuffer &kmerBuffer, size_t &posToWrite, uint32_t seqID, - uint32_t offset) { +void SeqIterator::fillQueryKmerBuffer( + const char *seq, + int seqLen, + QueryKmerBuffer &kmerBuffer, + size_t &posToWrite, + uint32_t seqID, + vector *aaFrames, + uint32_t offset) + { int forOrRev; uint64_t tempKmer = 0; int checkN; @@ -386,18 +391,18 @@ SeqIterator::addDNAInfo_QueryKmer(uint64_t &kmer, const char *seq, int forOrRev, } } -bool SeqIterator::translateBlock(const char *seq, PredictedBlock block) { - aaFrames[0].clear(); - if(aaFrames->capacity() < strlen(seq) / 3 + 1) { - aaFrames->reserve(strlen(seq) / 3 + 1); +bool SeqIterator::translateBlock(const char *seq, PredictedBlock block, vector & aaSeq) { + aaSeq.clear(); + if(aaSeq.capacity() < strlen(seq) / 3 + 1) { + aaSeq.reserve(strlen(seq) / 3 + 1); } if (block.strand == 1) { for (int i = block.start; i + 2 <= block.end; i = i + 3) { - aaFrames[0].push_back(nuc2aa[nuc2int(atcg[seq[i]])][nuc2int(atcg[seq[i + 1]])][nuc2int(atcg[seq[i + 2]])]); + aaSeq.push_back(nuc2aa[nuc2int(atcg[seq[i]])][nuc2int(atcg[seq[i + 1]])][nuc2int(atcg[seq[i + 2]])]); } } else { for (int i = block.end; i >= (int) block.start + 2; i = i - 3) { - aaFrames[0].push_back(nuc2aa[nuc2int(iRCT[atcg[seq[i]]])][nuc2int(iRCT[atcg[seq[i - 1]]])][ + aaSeq.push_back(nuc2aa[nuc2int(iRCT[atcg[seq[i]]])][nuc2int(iRCT[atcg[seq[i - 1]]])][ nuc2int(iRCT[atcg[seq[i - 2]]])]); } } @@ -427,19 +432,19 @@ char *SeqIterator::reverseCompliment(char *read, size_t length) const { // It extracts kmers from amino acid sequence with DNA information and fill the kmerBuffer with them. int SeqIterator::fillBufferWithKmerFromBlock(const PredictedBlock &block, const char *seq, TargetKmerBuffer &kmerBuffer, - size_t &posToWrite, int seqID, int taxIdAtRank) { + size_t &posToWrite, int seqID, int taxIdAtRank, const vector & aaSeq) { uint64_t tempKmer = 0; - int len = (int) aaFrames[0].size(); + int len = (int) aaSeq.size(); int checkN; for (int kmerCnt = 0; kmerCnt < len - kmerLength - spaceNum_int + 1; kmerCnt++) { tempKmer = 0; checkN = 0; for (uint32_t i = 0, j = 0; i < kmerLength + spaceNum; i++, j += mask[i]) { - if (-1 == aaFrames[0][kmerCnt + i]) { + if (-1 == aaSeq[kmerCnt + i]) { checkN = 1; break; } - tempKmer += aaFrames[0][kmerCnt + i] * powers[j] * mask[i]; + tempKmer += aaSeq[kmerCnt + i] * powers[j] * mask[i]; } if (checkN == 1) { kmerBuffer.buffer[posToWrite] = {UINT64_MAX, -1, 0, false}; @@ -485,227 +490,6 @@ size_t SeqIterator::getNumOfKmerForBlock(const PredictedBlock &block) { return len / 3 - 7; } -// It makes the blocks for translation -// Each block has a predicted gene part and an intergenic region. When another gene shows up, new block starts. -void SeqIterator::getExtendedORFs(struct _gene *genes, struct _node *nodes, vector &blocks, - size_t numOfGene, size_t length, - size_t &blockIdx, vector &intergenicKmerList, const char *seq) { - - //Exceptional case 1: 0 prdicted gene - if (numOfGene == 0) { - blocks.emplace_back(0, length - 1, 1); - blockIdx++; - return; - } - - //Exceptional case 2: Only 1 prdicted gene - int frame; - int rightEnd = 0; - int leftEnd = 0; - if (numOfGene == 1) { - if (nodes[genes[0].start_ndx].strand == 1) { //forward - frame = (genes[0].begin - 1) % 3; - leftEnd = 0; - while (leftEnd % 3 != frame) leftEnd++; // y - (y - x) % 3 .. which would be faster? - blocks.emplace_back(leftEnd, length - 1, 1); - blockIdx++; - } else { //reverse - frame = (genes[0].end - 1) % 3; - rightEnd = length - 1; - while (rightEnd % 3 != frame) rightEnd--; - blocks.emplace_back(0, rightEnd, -1); - blockIdx++; - } - return; - } - - - /* Main routine */ - - bool hasBeenExtendedToLeft = false; - int k = 23; - char *newIntergenicKmer = (char *) malloc(sizeof(char) * (k + 1)); - char *leftKmer = (char *) malloc(sizeof(char) * (k + 1)); - char *rightKmer = (char *) malloc(sizeof(char) * (k + 1)); - char *leftKmerReverse = (char *) malloc(sizeof(char) * (k + 1)); - char *rightKmerReverse = (char *) malloc(sizeof(char) * (k + 1)); - bool isReverse = false; - uint64_t leftKmerHash = 0, rightKmerHash = 0; - - - // Extend the first gene to cover intergenic regions - if (nodes[genes[0].start_ndx].strand == 1) { //forward - frame = (genes[0].begin - 1) % 3; - leftEnd = 0; - while (leftEnd % 3 != frame) leftEnd++; - blocks.emplace_back(leftEnd, genes[1].begin - 1 + 22, 1); - blockIdx++; - } else { - frame = (genes[0].end - 1) % 3; - rightEnd = genes[1].begin - 1 + 22; - while (rightEnd % 3 != frame) rightEnd--; - blocks.emplace_back(0, rightEnd, -1); - blockIdx++; - } - - // From the second gene to the second last gene - for (size_t geneIdx = 1; geneIdx < numOfGene - 1; geneIdx++) { - isReverse = false; - - // Make two k-mer hash; each from left and right of current gene. They are used for choosing extension direction. - strncpy(leftKmer, seq + genes[geneIdx].begin - 1 - k, k); - strncpy(rightKmer, seq + genes[geneIdx].end, k); - if (nodes[genes[geneIdx].start_ndx].strand == 1) { - leftKmerHash = XXH64(leftKmer, k, 0); - rightKmerHash = XXH64(rightKmer, k, 0); - } else { - isReverse = true; - for (int j = k - 1; j >= 0; j--) { - leftKmerReverse[k - j - 1] = iRCT[leftKmer[j]]; - rightKmerReverse[k - j - 1] = iRCT[rightKmer[j]]; - } - leftKmerHash = XXH64(leftKmerReverse, k, 0); - rightKmerHash = XXH64(rightKmerReverse, k, 0); - } - - // Extend genes to cover intergenic regions - if (find(intergenicKmerList.begin(), intergenicKmerList.end(), leftKmerHash) != - intergenicKmerList.end()) { //Extension to left - if (!hasBeenExtendedToLeft) { - if (!isReverse) { // Forward - blocks.emplace_back(genes[geneIdx].begin - 1, genes[geneIdx].end - 1, 1); - blockIdx++; - } else { // Reverse - blocks.emplace_back(genes[geneIdx].begin - 1, genes[geneIdx].end - 1, -1); - blockIdx++; - } - } else { - if (!isReverse) { //forward - frame = (genes[geneIdx].begin - 1) % 3; - leftEnd = genes[geneIdx - 1].end -1 -22; - while (leftEnd % 3 != frame) leftEnd++; - blocks.emplace_back(leftEnd, genes[geneIdx].end - 1, 1); - blockIdx++; - } else { // reverse - blocks.emplace_back(genes[geneIdx - 1].end - 22 - 1, genes[geneIdx].end - 1, -1); - blockIdx++; - } - } - hasBeenExtendedToLeft = true; - } else { // Extension to right - if (hasBeenExtendedToLeft) { - if (!isReverse) { //forward - frame = (genes[geneIdx].begin - 1) % 3; - leftEnd = genes[geneIdx - 1].end - 1 - 22; - while (leftEnd % 3 != frame) leftEnd++; - blocks.emplace_back(leftEnd, genes[geneIdx + 1].begin - 1 + 22, 1); - blockIdx++; - } else { - frame = (genes[geneIdx].end - 1) % 3; - rightEnd = genes[geneIdx + 1].begin - 1 + 22; - while (rightEnd % 3 != frame) rightEnd--; - blocks.emplace_back(genes[geneIdx - 1].end - 1 - 22, rightEnd, -1); - blockIdx++; - } - } else { - if (!isReverse) { //forward - blocks.emplace_back(genes[geneIdx].begin - 1, genes[geneIdx + 1].begin - 1 + 22, 1); - blockIdx++; - } else { - frame = (genes[geneIdx].end - 1) % 3; - rightEnd = genes[geneIdx + 1].begin - 1 + 22; - while (rightEnd % 3 != frame) rightEnd--; - blocks.emplace_back(genes[geneIdx].begin - 1, rightEnd, -1); - blockIdx++; - } - } - hasBeenExtendedToLeft = false; - - if (find(intergenicKmerList.begin(), intergenicKmerList.end(), rightKmerHash) == intergenicKmerList.end()) { - intergenicKmerList.push_back(rightKmerHash); - } - } - } - - // // For the last gene - // // Extend to the end of the genome - // isReverse = !(nodes[genes[numOfGene - 1].start_ndx].strand == 1); - // rightEnd = length - 1; - // if (isReverse) { - // frame = (genes[numOfGene - 1].end - 1) % 3; - // while (rightEnd % 3 != frame) rightEnd--; - // } - // // If left region is not covered, cover it. - // leftEnd = genes[numOfGene - 1].begin - 1; - // if (hasBeenExtendedToLeft) { - // leftEnd = genes[numOfGene - 2].end - 1 - 22; - // if (!isReverse) { - // frame = (genes[numOfGene - 1].begin - 1) % 3; - // while (leftEnd % 3 != frame) leftEnd++; - // } - // } - // blocks.emplace_back(leftEnd, rightEnd, isReverse ? -1 : 1); - // if (find(intergenicKmerList.begin(), intergenicKmerList.end(), rightKmerHash) == intergenicKmerList.end()) { - // intergenicKmerList.push_back(rightKmerHash); - // } - - //For the last gene - if (find(intergenicKmerList.begin(), intergenicKmerList.end(), leftKmerHash) != - intergenicKmerList.end()) { //extension to left - if (!isReverse) { //forward - frame = (genes[numOfGene - 1].begin - 1) % 3; - leftEnd = genes[numOfGene - 2].end - 1 - 22; - while (leftEnd % 3 != frame) leftEnd++; - blocks.emplace_back(leftEnd, length - 1, 1); - blockIdx++; - } else { // reverse - frame = (genes[numOfGene - 1].end - 1) % 3; - rightEnd = length - 1; - while (rightEnd % 3 != frame) rightEnd--; - blocks.emplace_back(genes[numOfGene - 2].end - 22 - 1, rightEnd, -1); - blockIdx++; - } - } else { //extension to right - if (hasBeenExtendedToLeft) { - if (!isReverse) { //forward - frame = (genes[numOfGene - 1].begin - 1) % 3; - leftEnd = genes[numOfGene - 2].end - 1 - 22; - while (leftEnd % 3 != frame) leftEnd++; - blocks.emplace_back(leftEnd, length - 1, 1); - blockIdx++; - } else { - frame = (genes[numOfGene - 1].end - 1) % 3; - rightEnd = length - 1; - while (rightEnd % 3 != frame) rightEnd--; - blocks.emplace_back(genes[numOfGene - 2].end - 22 - 1, rightEnd, -1); - blockIdx++; - } - } else { - if (!isReverse) { - blocks.emplace_back(genes[numOfGene - 1].begin, length - 1, 1); - blockIdx++; - } else { - frame = (genes[numOfGene - 1].end - 1) % 3; - rightEnd = length - 1; - while (rightEnd % 3 != frame) rightEnd--; - blocks.emplace_back(genes[numOfGene - 1].begin - 1, rightEnd, -1); - blockIdx++; - } - } - - //If current intergenic sequences is new, update intergenicKmerList. - if (find(intergenicKmerList.begin(), intergenicKmerList.end(), rightKmerHash) == intergenicKmerList.end()) { - intergenicKmerList.push_back(rightKmerHash); - } - } - - - free(newIntergenicKmer); - free(leftKmer); - free(rightKmer); - free(leftKmerReverse); - free(rightKmerReverse); -} bool SeqIterator::compareMinHashList(priority_queue list1, priority_queue &list2, size_t length1, size_t length2) { diff --git a/src/commons/SeqIterator.h b/src/commons/SeqIterator.h index b3be8638..c53093e0 100644 --- a/src/commons/SeqIterator.h +++ b/src/commons/SeqIterator.h @@ -33,23 +33,9 @@ KSEQ_INIT(kseq_buffer_t*, kseq_buffer_reader) using namespace std; -typedef struct PredictedBlock { - PredictedBlock(int start, int end, int strand) : start(start), end(end), strand(strand) {} - - void printPredictedBlock() { - cout << strand << " " << start << " " << end << endl; - } - - int start; - int end; - int strand; //true for forward -} PredictedBlock; - class SeqIterator { private: - static const string iRCT; - static const string atcg; - vector aaFrames[6]; + // vector aaFrames[6]; uint64_t powers[10]; int nuc2aa[8][8][8]; uint64_t nuc2num[4][4][4]; @@ -60,30 +46,29 @@ class SeqIterator { int bitsForCodon; int bitsFor8Codons; - void addDNAInfo_QueryKmer(uint64_t &kmer, const char *seq, int forOrRev, uint32_t kmerCnt, uint32_t frame, int readLength); void addDNAInfo_TargetKmer(uint64_t &kmer, const char *seq, const PredictedBlock &block, const int &kmerCnt); public: + static const string iRCT; + static const string atcg; + void fillQueryKmerBuffer(const char *seq, int seqLen, QueryKmerBuffer &kmerBuffer, size_t &posToWrite, - uint32_t seqID, uint32_t offset = 0); + uint32_t seqID, vector *aaFrames, uint32_t offset = 0); string reverseCompliment(string &read) const; char *reverseCompliment(char *read, size_t length) const; - void sixFrameTranslation(const char *seq, int seqLen); + void sixFrameTranslation(const char *seq, int seqLen, vector *aaFrames); - bool translateBlock(const char *seq, PredictedBlock block); + bool translateBlock(const char *seq, PredictedBlock block, vector & aaSeq); void generateIntergenicKmerList(struct _gene *genes, struct _node *nodes, int numberOfGenes, vector &intergenicKmerList, const char *seq); - void getExtendedORFs(struct _gene *genes, struct _node *nodes, vector &blocks, size_t numOfGene, - size_t length, size_t &numOfBlocks, vector &intergenicKmerList, const char *seq); - void getMinHashList(priority_queue &sortedHashQue, const char *seq); bool @@ -94,7 +79,7 @@ class SeqIterator { size_t getNumOfKmerForBlock(const PredictedBlock &block); int fillBufferWithKmerFromBlock(const PredictedBlock &block, const char *seq, TargetKmerBuffer &kmerBuffer, - size_t &posToWrite, int seqID, int taxIdAtRank); + size_t &posToWrite, int seqID, int taxIdAtRank, const vector & aaSeq); static void maskLowComplexityRegions(const char * seq, char * maskedSeq, ProbabilityMatrix & probMat, float maskProb, const BaseMatrix * subMat); diff --git a/src/commons/common.h b/src/commons/common.h index dc847fcb..2961060d 100644 --- a/src/commons/common.h +++ b/src/commons/common.h @@ -27,6 +27,18 @@ struct SequenceBlock{ size_t seqLength; }; +typedef struct PredictedBlock { + PredictedBlock(int start, int end, int strand) : start(start), end(end), strand(strand) {} + + void printPredictedBlock() { + std::cout << strand << " " << start << " " << end << std::endl; + } + + int start; + int end; + int strand; //true for forward +} PredictedBlock; + struct Query{ int queryId; int classification; From 60c62900319b74b3d8c82cace1936561cb88b600 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Thu, 22 Aug 2024 13:45:36 +0900 Subject: [PATCH 02/21] Working on optimizing match analysis process --- src/commons/Classifier.cpp | 53 +++++++++++++++-- src/commons/Classifier.h | 11 +++- src/commons/KmerExtractor.cpp | 106 +--------------------------------- src/commons/KmerExtractor.h | 10 +--- src/commons/Taxonomer.cpp | 25 ++++---- src/commons/Taxonomer.h | 17 +++--- 6 files changed, 85 insertions(+), 137 deletions(-) diff --git a/src/commons/Classifier.cpp b/src/commons/Classifier.cpp index ee49ce41..6531fcfe 100644 --- a/src/commons/Classifier.cpp +++ b/src/commons/Classifier.cpp @@ -23,7 +23,7 @@ Classifier::Classifier(LocalParameters & par) { } else { kmerMatcher = new KmerMatcher(par, taxonomy); } - taxonomer = new Taxonomer(par, taxonomy); + // taxonomer = new Taxonomer(par, taxonomy); reporter = new Reporter(par, taxonomy); } @@ -32,7 +32,7 @@ Classifier::~Classifier() { delete queryIndexer; delete kmerExtractor; delete kmerMatcher; - delete taxonomer; + // delete taxonomer; delete reporter; } @@ -113,7 +113,7 @@ void Classifier::startClassify(const LocalParameters &par) { kmerMatcher->sortMatches(&matchBuffer); // Classify queries based on the matches. - taxonomer->assignTaxonomy(matchBuffer.buffer, matchBuffer.startIndexOfReserve, queryList, par); + assignTaxonomy(matchBuffer.buffer, matchBuffer.startIndexOfReserve, queryList, par); // Write classification results reporter->writeReadClassification(queryList); @@ -148,8 +148,53 @@ void Classifier::startClassify(const LocalParameters &par) { reporter->closeReadClassificationFile(); // Write report files - reporter->writeReportFile(totalSeqCnt, taxonomer->getTaxCounts()); + reporter->writeReportFile(totalSeqCnt, taxCounts); // Memory deallocation free(matchBuffer.buffer); } + +void Classifier::assignTaxonomy(const Match *matchList, + size_t numOfMatches, + std::vector &queryList, + const LocalParameters &par) { + time_t beforeAnalyze = time(nullptr); + cout << "Analyzing matches ..." << endl; + + // Divide matches into blocks for multi threading + size_t seqNum = queryList.size(); + MatchBlock *matchBlocks = new MatchBlock[seqNum]; + size_t matchIdx = 0; + size_t blockIdx = 0; + uint32_t currentQuery; + while (matchIdx < numOfMatches) { + currentQuery = matchList[matchIdx].qInfo.sequenceID; + matchBlocks[blockIdx].id = currentQuery; + matchBlocks[blockIdx].start = matchIdx; + while ((currentQuery == matchList[matchIdx].qInfo.sequenceID) && (matchIdx < numOfMatches)) ++matchIdx; + matchBlocks[blockIdx].end = matchIdx - 1; + blockIdx++; + } + + // Process each block +#pragma omp parallel default(none), shared(cout, matchBlocks, matchList, seqNum, queryList, blockIdx, par) + { + Taxonomer taxonomer(par, taxonomy); +#pragma omp for schedule(dynamic, 1) + for (size_t i = 0; i < blockIdx; ++i) { + taxonomer.chooseBestTaxon(matchBlocks[i].id, + matchBlocks[i].start, + matchBlocks[i].end, + matchList, + queryList, + par); + } + } + + for (size_t i = 0; i < seqNum; i++) { + ++taxCounts[queryList[i].classification]; + } + delete[] matchBlocks; + cout << "Time spent for analyzing: " << double(time(nullptr) - beforeAnalyze) << endl; + +} \ No newline at end of file diff --git a/src/commons/Classifier.h b/src/commons/Classifier.h index 81a0b027..c2bd9962 100644 --- a/src/commons/Classifier.h +++ b/src/commons/Classifier.h @@ -45,17 +45,26 @@ class Classifier { QueryIndexer * queryIndexer; KmerExtractor * kmerExtractor; KmerMatcher * kmerMatcher; - Taxonomer * taxonomer; + // Taxonomer * taxonomer; Reporter * reporter; NcbiTaxonomy * taxonomy; + unordered_map taxCounts; + public: void startClassify(const LocalParameters &par); + void assignTaxonomy(const Match *matchList, + size_t numOfMatches, + std::vector & queryList, + const LocalParameters &par); + explicit Classifier(LocalParameters & par); virtual ~Classifier(); + unordered_map & getTaxCounts() { return taxCounts; } + }; diff --git a/src/commons/KmerExtractor.cpp b/src/commons/KmerExtractor.cpp index 81f90495..0559748c 100644 --- a/src/commons/KmerExtractor.cpp +++ b/src/commons/KmerExtractor.cpp @@ -58,7 +58,7 @@ void KmerExtractor::fillQueryKmerBufferParallel(KSeqWrapper *kseq, // Reserve memory for each read of the chunk for each thread std::vector> chunkReads_thread(par.threads); - for (size_t i = 0; i < par.threads; ++i) { + for (int i = 0; i < par.threads; ++i) { chunkReads_thread[i].resize(chunkSize); for (size_t j = 0; j < chunkSize; ++j) { chunkReads_thread[i][j].reserve(readLength); @@ -66,7 +66,7 @@ void KmerExtractor::fillQueryKmerBufferParallel(KSeqWrapper *kseq, } // Initialize atomic variable for active tasks - std::vector> busyThreads(par.threads); + std::vector> busyThreads(par.threads); for (size_t i = 0; i < par.threads; ++i) { busyThreads[i].store(false); } @@ -289,104 +289,4 @@ chunkSize, chunkReads1_thread, chunkReads2_thread, busyThreads, cout) delete[] maskedSeq1; delete[] maskedSeq2; } -} - - - -// void KmerExtractor::fillQueryKmerBufferParallel_paired2(KSeqWrapper *kseq1, -// KSeqWrapper *kseq2, -// QueryKmerBuffer &kmerBuffer, -// vector &queryList, -// const QuerySplit ¤tSplit, -// const LocalParameters &par) { -// size_t processedQueryNum = 0; - -// // Array to store reads of thread number -// vector reads1(par.threads); -// vector reads2(par.threads); - -// while (processedQueryNum < currentSplit.readCnt) { -// size_t currentQueryNum = min(currentSplit.readCnt - processedQueryNum, (size_t) par.threads); -// size_t count = 0; - -// // Fill reads in sequential -// while (count < currentQueryNum) { -// // Read query -// kseq1->ReadEntry(); -// kseq2->ReadEntry(); -// const KSeqWrapper::KSeqEntry & e1 = kseq1->entry; -// const KSeqWrapper::KSeqEntry & e2 = kseq2->entry; - -// // Get k-mer count -// int kmerCnt = LocalUtil::getQueryKmerNumber((int) e1.sequence.l, spaceNum); -// int kmerCnt2 = LocalUtil::getQueryKmerNumber((int) e2.sequence.l, spaceNum); - -// // Query Info -// queryList[processedQueryNum].queryLength = LocalUtil::getMaxCoveredLength((int) e1.sequence.l); -// queryList[processedQueryNum].queryLength2 = LocalUtil::getMaxCoveredLength((int) e2.sequence.l); -// queryList[processedQueryNum].name = string(e1.name.s); -// queryList[processedQueryNum].kmerCnt = (int) (kmerCnt + kmerCnt2); - -// // Store reads -// reads1[count] = string(kseq1->entry.sequence.s); -// reads2[count] = string(kseq2->entry.sequence.s); - -// processedQueryNum ++; -// count ++; -// } - -// // Process reads in parallel -// #pragma omp parallel default(none), shared(par, kmerBuffer, cout, processedQueryNum, queryList, currentQueryNum, currentSplit, count, reads1, reads2) -// { -// size_t posToWrite; -// size_t readLength = 1000; -// char *maskedSeq1 = new char[readLength]; -// char *maskedSeq2 = new char[readLength]; -// char *seq1 = nullptr; -// char *seq2 = nullptr; -// #pragma omp for schedule(dynamic, 1) -// for (size_t i = 0; i < currentQueryNum; i ++) { -// size_t queryIdx = processedQueryNum - currentQueryNum + i; -// // Get k-mer count -// int kmerCnt = LocalUtil::getQueryKmerNumber(reads1[i].length(), spaceNum); -// int kmerCnt2 = LocalUtil::getQueryKmerNumber(reads2[i].length(), spaceNum); - -// // Ignore short read -// if (kmerCnt2 < 1 || kmerCnt < 1) { continue; } - -// // Get masked sequence -// if (maskMode) { -// if (readLength < reads1[i].length() + 1 || readLength < reads2[i].length() + 1) { -// readLength = max(reads1[i].length() + 1, reads2[i].length() + 1); -// delete[] maskedSeq1; -// delete[] maskedSeq2; -// maskedSeq1 = new char[readLength]; -// maskedSeq2 = new char[readLength]; -// } -// SeqIterator::maskLowComplexityRegions(reads1[i].c_str(),maskedSeq1, *probMatrix, maskProb, subMat); -// SeqIterator::maskLowComplexityRegions(reads2[i].c_str(),maskedSeq2, *probMatrix, maskProb, subMat); -// seq1 = maskedSeq1; -// seq2 = maskedSeq2; -// } else { -// seq1 = const_cast(reads1[i].c_str()); -// seq2 = const_cast(reads2[i].c_str()); -// } - -// posToWrite = kmerBuffer.reserveMemory(kmerCnt + kmerCnt2); - -// // Process Read 1 -// seqIterator->sixFrameTranslation(seq1, (int) reads1[i].length()); -// seqIterator->fillQueryKmerBuffer(seq1, (int) reads1[i].length(), kmerBuffer, posToWrite, -// (uint32_t) queryIdx); - -// // Process Read 2 -// seqIterator->sixFrameTranslation(seq2, (int) reads2[i].length()); -// seqIterator->fillQueryKmerBuffer(seq2, (int) reads2[i].length(), kmerBuffer, posToWrite, -// (uint32_t) queryIdx, queryList[queryIdx].queryLength+3); -// } -// delete[] maskedSeq1; -// delete[] maskedSeq2; -// } -// } -// } - +} \ No newline at end of file diff --git a/src/commons/KmerExtractor.h b/src/commons/KmerExtractor.h index b77aa122..a2699139 100644 --- a/src/commons/KmerExtractor.h +++ b/src/commons/KmerExtractor.h @@ -5,6 +5,7 @@ #include "KSeqWrapper.h" #include "common.h" #include +#include class KmerExtractor { private: @@ -31,14 +32,7 @@ class KmerExtractor { vector &queryList, const QuerySplit & currentSplit, const LocalParameters &par); - - void fillQueryKmerBufferParallel_paired2(KSeqWrapper* kseq1, - KSeqWrapper* kseq2, - QueryKmerBuffer &kmerBuffer, - vector &queryList, - const QuerySplit & currentSplit, - const LocalParameters &par); - + public: explicit KmerExtractor(const LocalParameters & par); ~KmerExtractor(); diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index 60202c51..7a65481a 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -97,12 +97,11 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, speciesMatches.reserve(end - offset + 1); TaxonScore speciesScore(0, 0, 0, 0, 0); speciesScore = getBestSpeciesMatches(speciesMatches, - matchList, - end, - offset, - queryList[currentQuery].queryLength + queryList[currentQuery].queryLength2); + matchList, + end, + offset, + queryList[currentQuery].queryLength + queryList[currentQuery].queryLength2); - // If there is no proper species for current query, it is un-classified. if (speciesScore.score == 0 || speciesScore.score < par.minScore) { queryList[currentQuery].isClassified = false; @@ -128,18 +127,18 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, unordered_map taxCnt; filterRedundantMatches(speciesMatches, taxCnt); for (auto & tax : taxCnt) { - queryList[currentQuery].taxCnt[tax.first] = tax.second; + queryList[currentQuery].taxCnt[tax.first] = tax.second; } // If score is not enough, classify to the parent of the selected species if (speciesScore.score < par.minSpScore) { - queryList[currentQuery].isClassified = true; - queryList[currentQuery].classification = taxonomy->taxonNode( - taxonomy->getTaxIdAtRank(speciesScore.taxId, "species"))->parentTaxId; - queryList[currentQuery].score = speciesScore.score; - queryList[currentQuery].coverage = speciesScore.coverage; - queryList[currentQuery].hammingDist = speciesScore.hammingDist; - return; + queryList[currentQuery].isClassified = true; + queryList[currentQuery].classification = taxonomy->taxonNode( + taxonomy->getTaxIdAtRank(speciesScore.taxId, "species"))->parentTaxId; + queryList[currentQuery].score = speciesScore.score; + queryList[currentQuery].coverage = speciesScore.coverage; + queryList[currentQuery].hammingDist = speciesScore.hammingDist; + return; } // Lower rank classification diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h index 3cf4f9fa..fbe3d08f 100644 --- a/src/commons/Taxonomer.h +++ b/src/commons/Taxonomer.h @@ -45,6 +45,13 @@ struct MatchPath { }; +struct MatchBlock { + MatchBlock(size_t start, size_t end, int id) : start(start), end(end), id(id) {} + MatchBlock() : start(0), end(0), id(0) {} + size_t start; + size_t end; + uint32_t id; +}; class Taxonomer { private: @@ -67,18 +74,12 @@ class Taxonomer { // Internal int denominator; - struct MatchBlock { - MatchBlock(size_t start, size_t end, int id) : start(start), end(end), id(id) {} - MatchBlock() : start(0), end(0), id(0) {} - size_t start; - size_t end; - uint32_t id; - }; + // Output - unordered_map taxCounts; + unordered_map taxCounts; public: From a12882cd3bff63111ea4aaf888320d0c4b0d0f1e Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Thu, 22 Aug 2024 14:30:12 +0900 Subject: [PATCH 03/21] vector containers generated in functions became member variables to reuse --- src/commons/Taxonomer.cpp | 481 ++------------------------------------ src/commons/Taxonomer.h | 11 + 2 files changed, 28 insertions(+), 464 deletions(-) diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index 7a65481a..e2611ae6 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -36,6 +36,14 @@ Taxonomer::Taxonomer(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxon } else { denominator = 1000; } + + // Reserve memoory for internal containers + speciesMatches.reserve(1000); + curFrameMatches.reserve(1000); + matchPaths.reserve(1000); + maxSpecies.reserve(1000); + curPosMatches.reserve(1000); + nextPosMatches.reserve(1000); } Taxonomer::~Taxonomer() { @@ -93,8 +101,11 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, vector & queryList, const LocalParameters &par) { // Get the best species and its matches for the current query - vector speciesMatches; - speciesMatches.reserve(end - offset + 1); + if (speciesMatches.size() < end - offset + 1) { + speciesMatches.reserve(end - offset + 1); + } + speciesMatches.clear(); + TaxonScore speciesScore(0, 0, 0, 0, 0); speciesScore = getBestSpeciesMatches(speciesMatches, matchList, @@ -251,10 +262,8 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector & speciesMatches, size_t end, size_t offset, int queryLength) { - vector> matchesForEachSpecies; TaxonScore bestScore; - vector curFrameMatches; - vector matchPaths; + matchPaths.clear(); unordered_map species2score; unordered_map> species2matchPaths; float bestSpScore = 0; @@ -298,7 +307,7 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector & speciesMatches, return bestScore; } - vector maxSpecies; + maxSpecies.clear(); for (auto & spScore : species2score) { if (spScore.second >= bestSpScore * tieRatio) { maxSpecies.push_back(spScore.first); @@ -344,103 +353,6 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector & speciesMatches, return bestScore; } -// TaxonScore Taxonomer::getBestSpeciesMatches(vector & speciesMatches, -// const Match *matchList, -// size_t end, -// size_t offset, -// Query & currentQuery) { -// vector> matchesForEachSpecies; -// TaxonScore bestScore; -// vector curFrameMatches; -// vector matchPaths; -// unordered_map species2score; -// unordered_map> species2matchPaths; -// float bestSpScore = 0; -// unordered_map> speciesMatchRange; - -// size_t i = offset; -// while (i < end + 1) { -// TaxID currentSpecies = matchList[i].speciesId; -// size_t start = i; -// // For current species -// while ((i < end + 1) && currentSpecies == matchList[i].speciesId) { -// uint8_t curFrame = matchList[i].qInfo.frame; -// curFrameMatches.clear(); -// // For current frame -// while ((i < end + 1) && currentSpecies == matchList[i].speciesId && curFrame == matchList[i].qInfo.frame) { -// curFrameMatches.push_back(&matchList[i]); -// i ++; -// } -// if (curFrameMatches.size() > 1) { -// remainConsecutiveMatches(curFrameMatches, matchPaths, currentSpecies); -// } -// } -// speciesMatchRange[currentSpecies] = make_pair(start, i); - -// // Combine MatchPaths -// if (!matchPaths.empty()) { -// float score = combineMatchPaths(matchPaths, species2matchPaths[currentSpecies], currentQuery.queryLength + currentQuery.queryLength2); - -// score = min(score, 1.0f); -// if (score > 0.f) { -// species2score[currentSpecies] = score; -// } -// if (score > bestSpScore) { -// bestSpScore = score; -// } -// } -// matchPaths.clear(); -// } -// // If there are no meaningful species -// if (species2score.empty()) { -// bestScore.score = 0; -// return bestScore; -// } -// vector maxSpecies; -// for (auto & spScore : species2score) { -// if (spScore.second >= bestSpScore * tieRatio) { -// maxSpecies.push_back(spScore.first); -// } -// } - -// // More than one species --> LCA -// float coveredLength = 0.f; -// if (maxSpecies.size() > 1) { -// bestScore.LCA = true; -// bestScore.taxId = taxonomy->LCA(maxSpecies)->taxId; -// for (auto & sp : maxSpecies) { -// bestScore.score += species2score[sp]; -// coveredLength = 0; -// for (auto & matchPath : species2matchPaths[maxSpecies[0]]) { -// coveredLength += matchPath.end - matchPath.start + 1; -// } -// bestScore.coverage += coveredLength / (currentQuery.queryLength + currentQuery.queryLength2); -// } -// bestScore.score /= maxSpecies.size(); -// bestScore.coverage /= maxSpecies.size(); -// return bestScore; -// } - -// // One species -// bestScore.taxId = maxSpecies[0]; -// bestScore.score = species2score[maxSpecies[0]]; - -// int hammingDist = 0; -// for (auto & matchPath : species2matchPaths[maxSpecies[0]]) { -// coveredLength += matchPath.end - matchPath.start + 1; -// hammingDist += matchPath.hammingDist; -// } -// speciesMatches.reserve(speciesMatchRange[bestScore.taxId].second -// - speciesMatchRange[bestScore.taxId].first + 1); - -// for (size_t i = speciesMatchRange[bestScore.taxId].first; i < speciesMatchRange[bestScore.taxId].second; i++) { -// speciesMatches.push_back(matchList[i]); -// } -// bestScore.coverage = coveredLength / (currentQuery.queryLength + currentQuery.queryLength2); -// bestScore.hammingDist = hammingDist; - -// return bestScore; -// } float Taxonomer::combineMatchPaths(vector & matchPaths, vector & combinedMatchPaths, @@ -520,8 +432,8 @@ void Taxonomer::remainConsecutiveMatches(const vector & curFrameM TaxID speciesId) { size_t i = 0; size_t end = curFrameMatches.size(); - vector curPosMatches; - vector nextPosMatches; + curPosMatches.clear(); + nextPosMatches.clear(); map> linkedMatches; size_t currPos = curFrameMatches[0]->qInfo.pos; @@ -572,9 +484,6 @@ void Taxonomer::remainConsecutiveMatches(const vector & curFrameM if (isConsecutive(nextPosMatch, curPosMatch)){ linkedMatches[curPosMatch].push_back(nextPosMatch); } - // if (nextPosMatch->isConsecutive_DNA(curPosMatch)) { - // linkedMatches[curPosMatch].push_back(nextPosMatch); - // } } } @@ -681,360 +590,6 @@ depthScore Taxonomer::DFS(const vector &matches, return bestDepthScore; } -TaxonScore Taxonomer::scoreTaxon(vector &filteredMatches, - TaxID taxId, - int queryLength) { - // Calculate Hamming distance & covered length - int coveredPosCnt = 0; - uint16_t currHammings; - int aminoAcidNum = (int) queryLength / 3; - int currPos; - size_t matchNum = filteredMatches.size(); - size_t f = 0; - - // Get the smallest hamming distance at each position of query - auto *hammingsAtEachPos = new signed char[aminoAcidNum + 1]; - memset(hammingsAtEachPos, 24, (aminoAcidNum + 1)); - while (f < matchNum) { - currPos = filteredMatches[f]->qInfo.pos / 3; - currHammings = filteredMatches[f]->rightEndHamming; - if (GET_2_BITS(currHammings) < hammingsAtEachPos[currPos + unmaskedPos[0]]) - hammingsAtEachPos[currPos + unmaskedPos[0]] = GET_2_BITS(currHammings); - if (GET_2_BITS(currHammings >> 2) < hammingsAtEachPos[currPos + unmaskedPos[1]]) - hammingsAtEachPos[currPos + unmaskedPos[1]] = GET_2_BITS(currHammings >> 2); - if (GET_2_BITS(currHammings >> 4) < hammingsAtEachPos[currPos + unmaskedPos[2]]) - hammingsAtEachPos[currPos + unmaskedPos[2]] = GET_2_BITS(currHammings >> 4); - if (GET_2_BITS(currHammings >> 6) < hammingsAtEachPos[currPos + unmaskedPos[3]]) - hammingsAtEachPos[currPos + unmaskedPos[3]] = GET_2_BITS(currHammings >> 6); - if (GET_2_BITS(currHammings >> 8) < hammingsAtEachPos[currPos + unmaskedPos[4]]) - hammingsAtEachPos[currPos + unmaskedPos[4]] = GET_2_BITS(currHammings >> 8); - if (GET_2_BITS(currHammings >> 10) < hammingsAtEachPos[currPos + unmaskedPos[5]]) - hammingsAtEachPos[currPos + unmaskedPos[5]] = GET_2_BITS(currHammings >> 10); - if (GET_2_BITS(currHammings >> 12) < hammingsAtEachPos[currPos + unmaskedPos[6]]) - hammingsAtEachPos[currPos + unmaskedPos[6]] = GET_2_BITS(currHammings >> 12); - if (GET_2_BITS(currHammings >> 14) < hammingsAtEachPos[currPos + unmaskedPos[7]]) - hammingsAtEachPos[currPos + unmaskedPos[7]] = GET_2_BITS(currHammings >> 14); - f++; - } - - // Sum up hamming distances and count the number of position covered by the matches. - float hammingSum = 0; - for (int h = 0; h < aminoAcidNum; h++) { - if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist. - coveredPosCnt++; - } else if (hammingsAtEachPos[h] != 24) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively - hammingSum += 1.0f + (0.5f * hammingsAtEachPos[h]); - coveredPosCnt++; - } - } - delete[] hammingsAtEachPos; - - // Score current genus - int coveredLength = coveredPosCnt * 3; - if (coveredLength > queryLength) coveredLength = queryLength; - float score = ((float) coveredLength - hammingSum) / (float) queryLength; - float coverage = (float) (coveredLength) / (float) (queryLength); - - return {taxId, score, coverage, (int) hammingSum, 0}; -} - -TaxonScore Taxonomer::scoreTaxon(vector &filteredMatches, - TaxID taxId, - int readLength1, - int readLength2) { - - // Calculate Hamming distance & covered length - int aminoAcidNum_total = ((int) readLength1 / 3) + ((int) readLength2 / 3); - int aminoAcidNum_read1 = ((int) readLength1 / 3); - int currPos; - size_t matchNum = filteredMatches.size(); - size_t f = 0; - - // Get the smallest hamming distance at each position of query - auto *hammingsAtEachPos = new signed char[aminoAcidNum_total + 3]; - memset(hammingsAtEachPos, 24, (aminoAcidNum_total + 3)); - while (f < matchNum) { - uint8_t minHammingDist = 24; - uint16_t currHammings = 0; - currPos = (int) filteredMatches[f]->qInfo.pos / 3; - // Find the closest match at current position - while ((f < matchNum) && currPos == (int) filteredMatches[f]->qInfo.pos / 3) { - if (filteredMatches[f]->hamming < minHammingDist) { - minHammingDist = filteredMatches[f]->hamming; - currHammings = filteredMatches[f]->rightEndHamming; - } - f++; - } - // Update hamming distance at each position - if (GET_2_BITS(currHammings) < hammingsAtEachPos[currPos + unmaskedPos[0]]) - hammingsAtEachPos[currPos + unmaskedPos[0]] = GET_2_BITS(currHammings); - if (GET_2_BITS(currHammings >> 2) < hammingsAtEachPos[currPos + unmaskedPos[1]]) - hammingsAtEachPos[currPos + unmaskedPos[1]] = GET_2_BITS(currHammings >> 2); - if (GET_2_BITS(currHammings >> 4) < hammingsAtEachPos[currPos + unmaskedPos[2]]) - hammingsAtEachPos[currPos + unmaskedPos[2]] = GET_2_BITS(currHammings >> 4); - if (GET_2_BITS(currHammings >> 6) < hammingsAtEachPos[currPos + unmaskedPos[3]]) - hammingsAtEachPos[currPos + unmaskedPos[3]] = GET_2_BITS(currHammings >> 6); - if (GET_2_BITS(currHammings >> 8) < hammingsAtEachPos[currPos + unmaskedPos[4]]) - hammingsAtEachPos[currPos + unmaskedPos[4]] = GET_2_BITS(currHammings >> 8); - if (GET_2_BITS(currHammings >> 10) < hammingsAtEachPos[currPos + unmaskedPos[5]]) - hammingsAtEachPos[currPos + unmaskedPos[5]] = GET_2_BITS(currHammings >> 10); - if (GET_2_BITS(currHammings >> 12) < hammingsAtEachPos[currPos + unmaskedPos[6]]) - hammingsAtEachPos[currPos + unmaskedPos[6]] = GET_2_BITS(currHammings >> 12); - if (GET_2_BITS(currHammings >> 14) < hammingsAtEachPos[currPos + unmaskedPos[7]]) - hammingsAtEachPos[currPos + unmaskedPos[7]] = GET_2_BITS(currHammings >> 14); - f++; - } - - // Sum up hamming distances and count the number of position covered by the matches. - float hammingSum = 0; - int coveredPosCnt_read1 = 0; - int coveredPosCnt_read2 = 0; - for (int h = 0; h < aminoAcidNum_total; h++) { - // Read 1 - if (h < aminoAcidNum_read1) { - if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist. - coveredPosCnt_read1++; - } else if (hammingsAtEachPos[h] != 24) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively - hammingSum += 1.0f + (0.5f * (float) hammingsAtEachPos[h]); - coveredPosCnt_read1++; - } - } - // Read 2 - else { - if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist. - coveredPosCnt_read2++; - } else if (hammingsAtEachPos[h] != 24) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively - hammingSum += 1.0f + (0.5f * (float) hammingsAtEachPos[h]); - coveredPosCnt_read2++; - } - } - } - delete[] hammingsAtEachPos; - - // Score current genus - int coveredLength_read1 = coveredPosCnt_read1 * 3; - int coveredLength_read2 = coveredPosCnt_read2 * 3; - if (coveredLength_read1 > readLength1) coveredLength_read1 = readLength1; - if (coveredLength_read2 > readLength2) coveredLength_read2 = readLength2; - float score = - ((float) (coveredLength_read1 + coveredLength_read2) - hammingSum) / (float) (readLength1 + readLength2); - float coverage = (float) (coveredLength_read1 + coveredLength_read2) / (float) (readLength1 + readLength2); - -// matchesForEachGenus.push_back(move(filteredMatches)); - return {taxId, score, coverage, (int) hammingSum, 0}; -} - -TaxonScore Taxonomer::chooseSpecies(const vector &matches, - int queryLength, - vector &species, - unordered_map> & speciesMatchRange) { - // Score each species - std::unordered_map speciesScores; - size_t i = 0; - TaxID currentSpeices; - size_t numOfMatch = matches.size(); - size_t speciesBegin, speciesEnd; - while (i < numOfMatch) { - currentSpeices = matches[i].speciesId; - speciesBegin = i; - while ((i < numOfMatch) && currentSpeices == matches[i].speciesId) { - i++; - } - speciesEnd = i; - speciesScores[currentSpeices] = scoreSpecies(matches, speciesBegin, speciesEnd, queryLength); - speciesMatchRange[currentSpeices] = {(int) speciesBegin, (int) speciesEnd}; - speciesScores[currentSpeices].taxId = currentSpeices; - } - - // Get the best species - TaxonScore bestScore; - for (auto & sp : speciesScores) { - if (sp.second.score > bestScore.score) { - species.clear(); - species.push_back(sp.first); - bestScore = sp.second; - } else if (sp.second.coverage == bestScore.coverage) { - species.push_back(sp.first); - } - } - return bestScore; -} - -TaxonScore Taxonomer::chooseSpecies(const vector &matches, - int read1Length, - int read2Length, - vector &species, - unordered_map> & speciesMatchRange) { - // Score each species - std::unordered_map speciesScores; - - size_t i = 0; - TaxID currentSpeices; - size_t numOfMatch = matches.size(); - size_t speciesBegin, speciesEnd; - while (i < numOfMatch) { - currentSpeices = matches[i].speciesId; - speciesBegin = i; - while ((i < numOfMatch) && currentSpeices == matches[i].speciesId) { - i++; - } - speciesEnd = i; - speciesScores[currentSpeices] = scoreSpecies(matches, speciesBegin, speciesEnd, read1Length, read2Length); - speciesMatchRange[currentSpeices] = {(int) speciesBegin, (int) speciesEnd}; - speciesScores[currentSpeices].taxId = currentSpeices; - } - - // Get the best species - TaxonScore bestScore; - for (auto & sp : speciesScores) { - if (sp.second.score > bestScore.score) { - species.clear(); - species.push_back(sp.first); - bestScore = sp.second; - } else if (sp.second.coverage == bestScore.coverage) { - species.push_back(sp.first); - } - } - return bestScore; -} - -TaxonScore Taxonomer::scoreSpecies(const vector &matches, - size_t begin, - size_t end, - int queryLength) { - - // Get the largest hamming distance at each position of query - int aminoAcidNum = queryLength / 3; - auto *hammingsAtEachPos = new signed char[aminoAcidNum + 1]; - memset(hammingsAtEachPos, -1, (aminoAcidNum + 1)); - int currPos; - size_t walker = begin; - uint16_t currHammings; - while (walker < end) { - currPos = matches[walker].qInfo.pos / 3; - currHammings = matches[walker].rightEndHamming; - if (GET_2_BITS(currHammings) > hammingsAtEachPos[currPos + unmaskedPos[0]]) - hammingsAtEachPos[currPos + unmaskedPos[0]] = GET_2_BITS(currHammings); - if (GET_2_BITS(currHammings >> 2) > hammingsAtEachPos[currPos + unmaskedPos[1]]) - hammingsAtEachPos[currPos + unmaskedPos[1]] = GET_2_BITS(currHammings >> 2); - if (GET_2_BITS(currHammings >> 4) > hammingsAtEachPos[currPos + unmaskedPos[2]]) - hammingsAtEachPos[currPos + unmaskedPos[2]] = GET_2_BITS(currHammings >> 4); - if (GET_2_BITS(currHammings >> 6) > hammingsAtEachPos[currPos + unmaskedPos[3]]) - hammingsAtEachPos[currPos + unmaskedPos[3]] = GET_2_BITS(currHammings >> 6); - if (GET_2_BITS(currHammings >> 8) > hammingsAtEachPos[currPos + unmaskedPos[4]]) - hammingsAtEachPos[currPos + unmaskedPos[4]] = GET_2_BITS(currHammings >> 8); - if (GET_2_BITS(currHammings >> 10) > hammingsAtEachPos[currPos + unmaskedPos[5]]) - hammingsAtEachPos[currPos + unmaskedPos[5]] = GET_2_BITS(currHammings >> 10); - if (GET_2_BITS(currHammings >> 12) > hammingsAtEachPos[currPos + unmaskedPos[6]]) - hammingsAtEachPos[currPos + unmaskedPos[6]] = GET_2_BITS(currHammings >> 12); - if (GET_2_BITS(currHammings >> 14) > hammingsAtEachPos[currPos + unmaskedPos[7]]) - hammingsAtEachPos[currPos + unmaskedPos[7]] = GET_2_BITS(currHammings >> 14); - walker++; - } - - // Sum up hamming distances and count the number of position covered by the matches. - float hammingSum = 0; - int hammingDist = 0; - int coveredPosCnt = 0; - for (int h = 0; h < aminoAcidNum; h++) { - if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist. - coveredPosCnt++; - } else if (hammingsAtEachPos[h] != -1) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively - hammingSum += 1.0f + (0.5f * (float) hammingsAtEachPos[h]); - hammingDist += hammingsAtEachPos[h]; - coveredPosCnt++; - } - } - delete[] hammingsAtEachPos; - // Score - int coveredLength = coveredPosCnt * 3; - if (coveredLength >= queryLength) coveredLength = queryLength; - - float score = ((float)coveredLength - hammingSum) / (float) queryLength; - float coverage = (float) coveredLength / (float) (queryLength); - - return {0, score, coverage, hammingDist, 0}; -} - -TaxonScore Taxonomer::scoreSpecies(const vector &matches, - size_t begin, - size_t end, - int queryLength, - int queryLength2) { - - // Get the largest hamming distance at each position of query - int aminoAcidNum_total = queryLength / 3 + queryLength2 / 3; - int aminoAcidNum_read1 = queryLength / 3; - auto *hammingsAtEachPos = new signed char[aminoAcidNum_total + 3]; - memset(hammingsAtEachPos, -1, (aminoAcidNum_total + 3)); - - int currPos; - size_t walker = begin; - uint16_t currHammings; - - while (walker < end) { - currPos = matches[walker].qInfo.pos / 3; - currHammings = matches[walker].rightEndHamming; - if (GET_2_BITS(currHammings) > hammingsAtEachPos[currPos + unmaskedPos[0]]) - hammingsAtEachPos[currPos + unmaskedPos[0]] = GET_2_BITS(currHammings); - if (GET_2_BITS(currHammings >> 2) > hammingsAtEachPos[currPos + unmaskedPos[1]]) - hammingsAtEachPos[currPos + unmaskedPos[1]] = GET_2_BITS(currHammings >> 2); - if (GET_2_BITS(currHammings >> 4) > hammingsAtEachPos[currPos + unmaskedPos[2]]) - hammingsAtEachPos[currPos + unmaskedPos[2]] = GET_2_BITS(currHammings >> 4); - if (GET_2_BITS(currHammings >> 6) > hammingsAtEachPos[currPos + unmaskedPos[3]]) - hammingsAtEachPos[currPos + unmaskedPos[3]] = GET_2_BITS(currHammings >> 6); - if (GET_2_BITS(currHammings >> 8) > hammingsAtEachPos[currPos + unmaskedPos[4]]) - hammingsAtEachPos[currPos + unmaskedPos[4]] = GET_2_BITS(currHammings >> 8); - if (GET_2_BITS(currHammings >> 10) > hammingsAtEachPos[currPos + unmaskedPos[5]]) - hammingsAtEachPos[currPos + unmaskedPos[5]] = GET_2_BITS(currHammings >> 10); - if (GET_2_BITS(currHammings >> 12) > hammingsAtEachPos[currPos + unmaskedPos[6]]) - hammingsAtEachPos[currPos + unmaskedPos[6]] = GET_2_BITS(currHammings >> 12); - if (GET_2_BITS(currHammings >> 14) > hammingsAtEachPos[currPos + unmaskedPos[7]]) - hammingsAtEachPos[currPos + unmaskedPos[7]] = GET_2_BITS(currHammings >> 14); - walker++; - } - - // Sum up hamming distances and count the number of position covered by the matches. - float hammingSum = 0; - int hammingDist = 0; - int coveredPosCnt_read1 = 0; - int coveredPosCnt_read2 = 0; - for (int h = 0; h < aminoAcidNum_total; h++) { - // Read 1 - if (h < aminoAcidNum_read1) { - if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist. - coveredPosCnt_read1++; - } else if (hammingsAtEachPos[h] != -1) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively - hammingSum += 1.0f + (0.5f * (float) hammingsAtEachPos[h]); - hammingDist += hammingsAtEachPos[h]; - coveredPosCnt_read1++; - } - } - // Read 2 - else { - if (hammingsAtEachPos[h] == 0) { // Add 0 for 0 hamming dist. - coveredPosCnt_read2++; - } else if (hammingsAtEachPos[h] != -1) { // Add 1.5, 2, 2.5 for 1, 2, 3 hamming dist. respectively - hammingSum += 1.0f + (0.5f * (float) hammingsAtEachPos[h]); - hammingDist += hammingsAtEachPos[h]; - coveredPosCnt_read2++; - } - } - } - delete[] hammingsAtEachPos; - - // Score - int coveredLength_read1 = coveredPosCnt_read1 * 3; - int coveredLength_read2 = coveredPosCnt_read2 * 3; - if (coveredLength_read1 >= queryLength) coveredLength_read1 = queryLength; - if (coveredLength_read2 >= queryLength2) coveredLength_read2 = queryLength2; - - float score = ((float) (coveredLength_read1 + coveredLength_read2) - hammingSum) / (float) (queryLength + queryLength2); - float coverage = (float) (coveredLength_read1 + coveredLength_read2) / (float) (queryLength + queryLength2); - - return {0, score, coverage, hammingDist, 0}; -} - bool Taxonomer::isConsecutive(const Match * match1, const Match * match2) { // match1 87654321 -> 08765432 // match2 98765432 -> 08765432 @@ -1044,8 +599,6 @@ bool Taxonomer::isConsecutive(const Match * match1, const Match * match2) { bool Taxonomer::isConsecutive_diffFrame(const Match * match1, const Match * match2) { // int hamming1 = match1->hamming - GET_2_BITS(match1->rightEndHamming); // int hamming2 = match2->hamming - GET_2_BITS(match2->rightEndHamming >> 14); - // cout << match1->rightEndHamming << " " << match2->rightEndHamming << endl; - // cout << hamming1 << " " << hamming2 << endl; // match1 87654321 -> 08765432 // match2 98765432 -> 08765432 return (match1->hamming - GET_2_BITS(match1->rightEndHamming)) == (match2->hamming - GET_2_BITS(match2->rightEndHamming >> 14)); diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h index fbe3d08f..36452f01 100644 --- a/src/commons/Taxonomer.h +++ b/src/commons/Taxonomer.h @@ -73,6 +73,16 @@ class Taxonomer { // Internal int denominator; + vector speciesMatches; + + // getBestSpeciesMatches + vector curFrameMatches; + vector matchPaths; + vector maxSpecies; + + // remainConsecutiveMatches + vector curPosMatches; + vector nextPosMatches; @@ -86,6 +96,7 @@ class Taxonomer { Taxonomer(const LocalParameters & par, NcbiTaxonomy * taxonomy); ~Taxonomer(); + void assignTaxonomy(const Match *matchList, size_t numOfMatches, std::vector & queryList, From a50671c61fe45e59c88b9304c3ed1a99cb8fee17 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Thu, 22 Aug 2024 15:03:00 +0900 Subject: [PATCH 04/21] unordered maps and sets generated in functions became member variables to reuse --- src/commons/Taxonomer.cpp | 40 +++++++++++++++++++++++---------------- src/commons/Taxonomer.h | 13 ++++++++++--- 2 files changed, 34 insertions(+), 19 deletions(-) diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index e2611ae6..70f8bf05 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -38,12 +38,20 @@ Taxonomer::Taxonomer(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxon } // Reserve memoory for internal containers - speciesMatches.reserve(1000); - curFrameMatches.reserve(1000); - matchPaths.reserve(1000); - maxSpecies.reserve(1000); - curPosMatches.reserve(1000); - nextPosMatches.reserve(1000); + speciesMatches.reserve(4096); + curFrameMatches.reserve(4096); + matchPaths.reserve(4096); + maxSpecies.reserve(4096); + curPosMatches.reserve(4096); + nextPosMatches.reserve(4096); + + species2score.reserve(4096); + species2matchPaths.reserve(4096); + speciesMatchRange.reserve(4096); + taxCnt.reserve(4096); + cladeCnt.reserve(4096); + used.reserve(4096); + idx2depthScore.reserve(4096); } Taxonomer::~Taxonomer() { @@ -135,7 +143,7 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, } // Filter redundant matches - unordered_map taxCnt; + taxCnt.clear(); filterRedundantMatches(speciesMatches, taxCnt); for (auto & tax : taxCnt) { queryList[currentQuery].taxCnt[tax.first] = tax.second; @@ -195,7 +203,7 @@ void Taxonomer::filterRedundantMatches(vector & speciesMatches, TaxID Taxonomer::lowerRankClassification(const unordered_map & taxCnt, TaxID spTaxId, int queryLength) { unsigned int maxCnt = (queryLength - 1)/denominator + 1; - unordered_map cladeCnt; + cladeCnt.clear(); getSpeciesCladeCounts(taxCnt, cladeCnt, spTaxId); if (accessionLevel == 2) { // Don't do accession-level classification // Remove leaf nodes @@ -262,13 +270,13 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector & speciesMatches, size_t end, size_t offset, int queryLength) { - TaxonScore bestScore; matchPaths.clear(); - unordered_map species2score; - unordered_map> species2matchPaths; + species2score.clear(); + species2matchPaths.clear(); + speciesMatchRange.clear(); + + TaxonScore bestScore; float bestSpScore = 0; - unordered_map> speciesMatchRange; - size_t i = offset; while (i < end + 1) { TaxID currentSpecies = matchList[i].speciesId; @@ -501,9 +509,9 @@ void Taxonomer::remainConsecutiveMatches(const vector & curFrameM if (taxonomy->IsAncestor(eukaryotaTaxId, speciesId)) { MIN_DEPTH = minConsCntEuk - 1; } - unordered_set used; - unordered_map idx2depthScore; - + + used.clear(); + idx2depthScore.clear(); for (const auto& entry : linkedMatches) { if (!used.count(entry.first)) { used.insert(entry.first); diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h index 36452f01..84145d8e 100644 --- a/src/commons/Taxonomer.h +++ b/src/commons/Taxonomer.h @@ -75,18 +75,25 @@ class Taxonomer { int denominator; vector speciesMatches; + // chooseBestTaxon + unordered_map taxCnt; + // getBestSpeciesMatches vector curFrameMatches; vector matchPaths; vector maxSpecies; + unordered_map species2score; + unordered_map> species2matchPaths; + unordered_map> speciesMatchRange; // remainConsecutiveMatches vector curPosMatches; vector nextPosMatches; + unordered_set used; + unordered_map idx2depthScore; - - - + // lowerRankClassification + unordered_map cladeCnt; // Output unordered_map taxCounts; From a1145c6495419d2dd86e4440381db3893c0bbcbb Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Thu, 22 Aug 2024 15:22:04 +0900 Subject: [PATCH 05/21] skip lowerRankClassification to measure time --- src/commons/Taxonomer.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index 70f8bf05..db641d23 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -161,9 +161,11 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, } // Lower rank classification - TaxID result = lowerRankClassification(taxCnt, - speciesScore.taxId, - queryList[currentQuery].queryLength + queryList[currentQuery].queryLength2); + // TaxID result = lowerRankClassification(taxCnt, + // speciesScore.taxId, + // queryList[currentQuery].queryLength + queryList[currentQuery].queryLength2); + + TaxID result = speciesScore.taxId; // Store classification results queryList[currentQuery].isClassified = true; From 4bdd089ab40b95b1507028cbf56351dbeecf7c89 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Thu, 22 Aug 2024 15:40:01 +0900 Subject: [PATCH 06/21] measure time spent for spliting matches --- src/commons/Taxonomer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index db641d23..07e16524 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -79,7 +79,7 @@ void Taxonomer::assignTaxonomy(const Match *matchList, matchBlocks[blockIdx].end = matchIdx - 1; blockIdx++; } - + cout << "Time spent for spliting matches: " << double(time(nullptr) - beforeAnalyze) << endl; // Process each block #pragma omp parallel default(none), shared(cout, matchBlocks, matchList, seqNum, queryList, blockIdx, par) { From 8df0188e9d68ea5c71992e8c78c610e09b13eb6f Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Thu, 22 Aug 2024 16:17:46 +0900 Subject: [PATCH 07/21] measure time spent for spliting matches --- src/commons/Classifier.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/commons/Classifier.cpp b/src/commons/Classifier.cpp index 6531fcfe..eecf1837 100644 --- a/src/commons/Classifier.cpp +++ b/src/commons/Classifier.cpp @@ -175,7 +175,7 @@ void Classifier::assignTaxonomy(const Match *matchList, matchBlocks[blockIdx].end = matchIdx - 1; blockIdx++; } - + cout << "Time spent for spliting matches: " << double(time(nullptr) - beforeAnalyze) << endl; // Process each block #pragma omp parallel default(none), shared(cout, matchBlocks, matchList, seqNum, queryList, blockIdx, par) { From 6cc6ea58f6ea8af795aae50ce0a5a8d5174e8ef1 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Thu, 22 Aug 2024 16:43:58 +0900 Subject: [PATCH 08/21] Skip filterRedundantMatches to measure time --- src/commons/Taxonomer.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index 07e16524..d151b98d 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -143,11 +143,11 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, } // Filter redundant matches - taxCnt.clear(); - filterRedundantMatches(speciesMatches, taxCnt); - for (auto & tax : taxCnt) { - queryList[currentQuery].taxCnt[tax.first] = tax.second; - } + // taxCnt.clear(); + // filterRedundantMatches(speciesMatches, taxCnt); + // for (auto & tax : taxCnt) { + // queryList[currentQuery].taxCnt[tax.first] = tax.second; + // } // If score is not enough, classify to the parent of the selected species if (speciesScore.score < par.minSpScore) { @@ -162,8 +162,8 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, // Lower rank classification // TaxID result = lowerRankClassification(taxCnt, - // speciesScore.taxId, - // queryList[currentQuery].queryLength + queryList[currentQuery].queryLength2); + // speciesScore.taxId, + // queryList[currentQuery].queryLength + queryList[currentQuery].queryLength2); TaxID result = speciesScore.taxId; From d96e7fbf41c81f5d6833e90b77dd8b325d2b3346 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Fri, 23 Aug 2024 13:37:16 +0900 Subject: [PATCH 09/21] Use std::move() instead of vectorA=vectorB;vectorB.clear() --- src/commons/Classifier.cpp | 1 + src/commons/Taxonomer.cpp | 6 ++---- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/commons/Classifier.cpp b/src/commons/Classifier.cpp index eecf1837..1c509777 100644 --- a/src/commons/Classifier.cpp +++ b/src/commons/Classifier.cpp @@ -194,6 +194,7 @@ void Classifier::assignTaxonomy(const Match *matchList, for (size_t i = 0; i < seqNum; i++) { ++taxCounts[queryList[i].classification]; } + delete[] matchBlocks; cout << "Time spent for analyzing: " << double(time(nullptr) - beforeAnalyze) << endl; diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index d151b98d..13abb7c9 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -471,8 +471,7 @@ void Taxonomer::remainConsecutiveMatches(const vector & curFrameM } } // Update curPosMatches and nextPosMatches - curPosMatches = nextPosMatches; - nextPosMatches.clear(); + curPosMatches = std::move(nextPosMatches); currPos = nextPos; } } else { @@ -499,8 +498,7 @@ void Taxonomer::remainConsecutiveMatches(const vector & curFrameM } // Update curPosMatches and nextPosMatches - curPosMatches = nextPosMatches; - nextPosMatches.clear(); + curPosMatches = std::move(nextPosMatches); currPos = nextPos; } } From 43f1fcb5747515cfc405fa14630120a05a158c11 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Mon, 26 Aug 2024 18:18:47 +0900 Subject: [PATCH 10/21] Remove std::map>, and Use two parallel vectors for key and values and a vector for starting value idx for each key. --- src/commons/Taxonomer.cpp | 127 ++++++++++++++++++++++++-------------- src/commons/Taxonomer.h | 19 ++++-- 2 files changed, 95 insertions(+), 51 deletions(-) diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index 13abb7c9..28d7bbb0 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -44,6 +44,9 @@ Taxonomer::Taxonomer(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxon maxSpecies.reserve(4096); curPosMatches.reserve(4096); nextPosMatches.reserve(4096); + linkedMatchKeys.reserve(4096); + linkedMatchValues.reserve(4096); + linkedMatchValuesIdx.reserve(4096); species2score.reserve(4096); species2matchPaths.reserve(4096); @@ -143,11 +146,11 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, } // Filter redundant matches - // taxCnt.clear(); - // filterRedundantMatches(speciesMatches, taxCnt); - // for (auto & tax : taxCnt) { - // queryList[currentQuery].taxCnt[tax.first] = tax.second; - // } + taxCnt.clear(); + filterRedundantMatches(speciesMatches, taxCnt); + for (auto & tax : taxCnt) { + queryList[currentQuery].taxCnt[tax.first] = tax.second; + } // If score is not enough, classify to the parent of the selected species if (speciesScore.score < par.minSpScore) { @@ -161,11 +164,11 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, } // Lower rank classification - // TaxID result = lowerRankClassification(taxCnt, - // speciesScore.taxId, - // queryList[currentQuery].queryLength + queryList[currentQuery].queryLength2); + TaxID result = lowerRankClassification(taxCnt, + speciesScore.taxId, + queryList[currentQuery].queryLength + queryList[currentQuery].queryLength2); - TaxID result = speciesScore.taxId; + // TaxID result = speciesScore.taxId; // Store classification results queryList[currentQuery].isClassified = true; @@ -444,7 +447,9 @@ void Taxonomer::remainConsecutiveMatches(const vector & curFrameM size_t end = curFrameMatches.size(); curPosMatches.clear(); nextPosMatches.clear(); - map> linkedMatches; + linkedMatchKeys.clear(); + linkedMatchValues.clear(); + linkedMatchValuesIdx.clear(); size_t currPos = curFrameMatches[0]->qInfo.pos; uint64_t frame = curFrameMatches[0]->qInfo.frame; @@ -463,11 +468,19 @@ void Taxonomer::remainConsecutiveMatches(const vector & curFrameM if (currPos + 3 == nextPos) { // Compare curPosMatches and nextPosMatches for (auto &curPosMatch: curPosMatches) { + size_t startIdx = linkedMatchValues.size(); + bool found = false; for (auto &nextPosMatch: nextPosMatches) { if (isConsecutive(curPosMatch, nextPosMatch)){ - linkedMatches[curPosMatch].push_back(nextPosMatch); + linkedMatchValues.push_back(nextPosMatch); + found = true; + // linkedMatches[curPosMatch].push_back(nextPosMatch); } } + if (found) { + linkedMatchKeys.push_back(curPosMatch); + linkedMatchValuesIdx.push_back(startIdx); + } } } // Update curPosMatches and nextPosMatches @@ -489,11 +502,19 @@ void Taxonomer::remainConsecutiveMatches(const vector & curFrameM if (currPos + 3 == nextPos) { // Compare curPosMatches and nextPosMatches for (auto &curPosMatch: curPosMatches) { + size_t startIdx = linkedMatchValues.size(); + bool found = false; for (auto &nextPosMatch: nextPosMatches) { if (isConsecutive(nextPosMatch, curPosMatch)){ - linkedMatches[curPosMatch].push_back(nextPosMatch); + linkedMatchValues.push_back(nextPosMatch); + found = true; + // linkedMatches[curPosMatch].push_back(nextPosMatch); } } + if (found) { + linkedMatchKeys.push_back(curPosMatch); + linkedMatchValuesIdx.push_back(startIdx); + } } } @@ -512,85 +533,101 @@ void Taxonomer::remainConsecutiveMatches(const vector & curFrameM used.clear(); idx2depthScore.clear(); - for (const auto& entry : linkedMatches) { - if (!used.count(entry.first)) { - used.insert(entry.first); + + for (size_t k = 0; k < linkedMatchKeys.size(); k++) { + const Match * key = linkedMatchKeys[k]; + size_t startIdx = linkedMatchValuesIdx[k]; + size_t endIdx = (k + 1 < linkedMatchKeys.size()) ? linkedMatchValuesIdx[k + 1] : linkedMatchValues.size(); + + if (!used.count(key)) { + used.insert(key); depthScore bestPath{}; - for (size_t j = 0; j < entry.second.size(); j++) { - used.insert(entry.second[j]); + for (size_t j = startIdx; j < endIdx; j++) { + used.insert(linkedMatchValues[j]); depthScore curPath = DFS(curFrameMatches, - entry.second[j], - linkedMatches, - 1, + linkedMatchValues[j], + linkedMatchKeys, + linkedMatchValues, + linkedMatchValuesIdx, + 1, MIN_DEPTH, used, idx2depthScore, - entry.first->getScore(), - entry.first->hamming); + key->getScore(), + key->hamming); if (curPath.score > bestPath.score && curPath.depth > MIN_DEPTH) { bestPath = curPath; } } // Store the best path if (bestPath.depth > MIN_DEPTH) { - matchPaths.emplace_back(entry.first->qInfo.pos, // start coordinate on query - entry.first->qInfo.pos + bestPath.depth * 3 + 20, // end coordinate on query + matchPaths.emplace_back(key->qInfo.pos, // start coordinate on query + key->qInfo.pos + bestPath.depth * 3 + 20, // end coordinate on query bestPath.score, bestPath.hammingDist, - entry.first, + key, bestPath.endMatch); } } } } -depthScore Taxonomer::DFS(const vector &matches, - const Match * curMatch, - const map> &linkedMatches, - size_t depth, size_t MIN_DEPTH, - unordered_set &used, - unordered_map & match2depthScore, - float score, int hammingDist) { +depthScore Taxonomer::DFS( + const vector &matches, + const Match *curMatch, + const vector &linkedMatchesKeys, + const vector &linkedMatchesValues, + const vector &linkedMatchesIndices, + size_t depth, size_t MIN_DEPTH, + unordered_set &used, + unordered_map &match2depthScore, + float score, int hammingDist) +{ depth++; - depthScore bestDepthScore = depthScore{}; + depthScore bestDepthScore{}; depthScore returnDepthScore; depthScore curDepthScore; - float recievedScore = score; - if (linkedMatches.find(curMatch) == linkedMatches.end()) { // reached a leaf node + float receivedScore = score; + + auto it = find(linkedMatchesKeys.begin(), linkedMatchesKeys.end(), curMatch); + if (it == linkedMatchesKeys.end()) { // Reached a leaf node uint8_t lastEndHamming = (curMatch->rightEndHamming >> 14); if (lastEndHamming == 0) { score += 3.0f; } else { score += 2.0f - 0.5f * lastEndHamming; } - match2depthScore[curMatch] = depthScore(1, score - recievedScore, lastEndHamming, curMatch); + match2depthScore[curMatch] = depthScore(1, score - receivedScore, lastEndHamming, curMatch); return depthScore(depth, score, hammingDist + lastEndHamming, curMatch); - } else { // not a leaf node + } else { // Not a leaf node + size_t index = it - linkedMatchesKeys.begin(); + size_t startIdx = linkedMatchesIndices[index]; + size_t endIdx = (index + 1 < linkedMatchesIndices.size()) ? linkedMatchesIndices[index + 1] : linkedMatchesValues.size(); + uint8_t lastEndHamming = (curMatch->rightEndHamming >> 14); if (lastEndHamming == 0) { score += 3.0f; } else { score += 2.0f - 0.5f * lastEndHamming; } - for (const Match * nextMatch: linkedMatches.at(curMatch)) { + for (size_t i = startIdx; i < endIdx; ++i) { + const Match *nextMatch = linkedMatchesValues[i]; used.insert(nextMatch); - // Reuse the depth score of nextMatchIdx if it has been calculated - if (match2depthScore.find(nextMatch) != match2depthScore.end()){ + if (match2depthScore.find(nextMatch) != match2depthScore.end()) { returnDepthScore = match2depthScore[nextMatch]; curDepthScore = depthScore(returnDepthScore.depth + depth, returnDepthScore.score + score, returnDepthScore.hammingDist + hammingDist + lastEndHamming, returnDepthScore.endMatch); } else { - curDepthScore = DFS(matches, nextMatch, linkedMatches, depth, MIN_DEPTH, used, match2depthScore, score, hammingDist + lastEndHamming); + curDepthScore = DFS(matches, nextMatch, linkedMatchesKeys, linkedMatchesValues, linkedMatchesIndices, depth, MIN_DEPTH, used, match2depthScore, score, hammingDist + lastEndHamming); } - if (curDepthScore.score > bestDepthScore.score - && curDepthScore.depth > MIN_DEPTH) { + if (curDepthScore.score > bestDepthScore.score && curDepthScore.depth > MIN_DEPTH) { bestDepthScore = curDepthScore; } - } + } if (bestDepthScore.depth > MIN_DEPTH) { match2depthScore[curMatch] = depthScore(bestDepthScore.depth - depth + 1, - bestDepthScore.score - recievedScore, + bestDepthScore.score - receivedScore, bestDepthScore.hammingDist - hammingDist, bestDepthScore.endMatch); } diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h index 84145d8e..24995f41 100644 --- a/src/commons/Taxonomer.h +++ b/src/commons/Taxonomer.h @@ -89,6 +89,9 @@ class Taxonomer { // remainConsecutiveMatches vector curPosMatches; vector nextPosMatches; + vector linkedMatchKeys; + vector linkedMatchValues; + vector linkedMatchValuesIdx; unordered_set used; unordered_map idx2depthScore; @@ -98,6 +101,16 @@ class Taxonomer { // Output unordered_map taxCounts; + depthScore DFS(const vector &matches, + const Match *curMatch, + const vector &linkedMatchesKeys, + const vector &linkedMatchesValues, + const vector &linkedMatchesIndices, + size_t depth, size_t MIN_DEPTH, + unordered_set &used, + unordered_map &match2depthScore, + float score, int hammingDist); + public: Taxonomer(const LocalParameters & par, NcbiTaxonomy * taxonomy); @@ -135,12 +148,6 @@ class Taxonomer { void filterRedundantMatches(vector & speciesMatches, unordered_map & taxCnt); - depthScore DFS(const vector &matches, const Match * curMatchIdx, - const map> &linkedMatches, - size_t depth, size_t MIN_DEPTH, unordered_set &used, - unordered_map &match2depthScore, - float score, int hammingDist); - static bool isConsecutive(const Match * match1, const Match * match2); static bool isConsecutive_diffFrame(const Match * match1, const Match * match2); From c750443b29f579853946f0bfbc6f2ed7eb005279 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Tue, 27 Aug 2024 10:43:14 +0900 Subject: [PATCH 11/21] use unordered_set to check existing keys --- src/commons/Classifier.cpp | 1 - src/commons/Taxonomer.cpp | 9 ++++++--- src/commons/Taxonomer.h | 2 ++ 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/commons/Classifier.cpp b/src/commons/Classifier.cpp index 1c509777..4dc01285 100644 --- a/src/commons/Classifier.cpp +++ b/src/commons/Classifier.cpp @@ -175,7 +175,6 @@ void Classifier::assignTaxonomy(const Match *matchList, matchBlocks[blockIdx].end = matchIdx - 1; blockIdx++; } - cout << "Time spent for spliting matches: " << double(time(nullptr) - beforeAnalyze) << endl; // Process each block #pragma omp parallel default(none), shared(cout, matchBlocks, matchList, seqNum, queryList, blockIdx, par) { diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index 28d7bbb0..4bad2d06 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -45,6 +45,7 @@ Taxonomer::Taxonomer(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxon curPosMatches.reserve(4096); nextPosMatches.reserve(4096); linkedMatchKeys.reserve(4096); + linkedMatchKeySet.reserve(4096); linkedMatchValues.reserve(4096); linkedMatchValuesIdx.reserve(4096); @@ -82,7 +83,6 @@ void Taxonomer::assignTaxonomy(const Match *matchList, matchBlocks[blockIdx].end = matchIdx - 1; blockIdx++; } - cout << "Time spent for spliting matches: " << double(time(nullptr) - beforeAnalyze) << endl; // Process each block #pragma omp parallel default(none), shared(cout, matchBlocks, matchList, seqNum, queryList, blockIdx, par) { @@ -448,6 +448,7 @@ void Taxonomer::remainConsecutiveMatches(const vector & curFrameM curPosMatches.clear(); nextPosMatches.clear(); linkedMatchKeys.clear(); + linkedMatchKeySet.clear(); linkedMatchValues.clear(); linkedMatchValuesIdx.clear(); @@ -479,6 +480,7 @@ void Taxonomer::remainConsecutiveMatches(const vector & curFrameM } if (found) { linkedMatchKeys.push_back(curPosMatch); + linkedMatchKeySet.insert(curPosMatch); linkedMatchValuesIdx.push_back(startIdx); } } @@ -513,6 +515,7 @@ void Taxonomer::remainConsecutiveMatches(const vector & curFrameM } if (found) { linkedMatchKeys.push_back(curPosMatch); + linkedMatchKeySet.insert(curPosMatch); linkedMatchValuesIdx.push_back(startIdx); } } @@ -575,6 +578,7 @@ depthScore Taxonomer::DFS( const vector &matches, const Match *curMatch, const vector &linkedMatchesKeys, + const unordered_set &linkedMatchKeySet, const vector &linkedMatchesValues, const vector &linkedMatchesIndices, size_t depth, size_t MIN_DEPTH, @@ -588,8 +592,7 @@ depthScore Taxonomer::DFS( depthScore curDepthScore; float receivedScore = score; - auto it = find(linkedMatchesKeys.begin(), linkedMatchesKeys.end(), curMatch); - if (it == linkedMatchesKeys.end()) { // Reached a leaf node + if (linkedMatchKeySet.find(curMatch) == linkedMatchKeySet.end()) { // Reached a leaf node uint8_t lastEndHamming = (curMatch->rightEndHamming >> 14); if (lastEndHamming == 0) { score += 3.0f; diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h index 24995f41..93484a3b 100644 --- a/src/commons/Taxonomer.h +++ b/src/commons/Taxonomer.h @@ -90,6 +90,7 @@ class Taxonomer { vector curPosMatches; vector nextPosMatches; vector linkedMatchKeys; + unordered_set linkedMatchKeySet; vector linkedMatchValues; vector linkedMatchValuesIdx; unordered_set used; @@ -104,6 +105,7 @@ class Taxonomer { depthScore DFS(const vector &matches, const Match *curMatch, const vector &linkedMatchesKeys, + const unordered_set &linkedMatchKeySet, const vector &linkedMatchesValues, const vector &linkedMatchesIndices, size_t depth, size_t MIN_DEPTH, From feccf12e6b6da32a0baa02cc0c5fc99a77d6cd13 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Tue, 27 Aug 2024 10:58:47 +0900 Subject: [PATCH 12/21] use unordered_map --- src/commons/Taxonomer.cpp | 20 ++++++++++++-------- src/commons/Taxonomer.h | 4 ++-- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index 4bad2d06..2122023d 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -45,9 +45,9 @@ Taxonomer::Taxonomer(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxon curPosMatches.reserve(4096); nextPosMatches.reserve(4096); linkedMatchKeys.reserve(4096); - linkedMatchKeySet.reserve(4096); linkedMatchValues.reserve(4096); linkedMatchValuesIdx.reserve(4096); + keyIndexMap.reserve(4096); species2score.reserve(4096); species2matchPaths.reserve(4096); @@ -448,9 +448,9 @@ void Taxonomer::remainConsecutiveMatches(const vector & curFrameM curPosMatches.clear(); nextPosMatches.clear(); linkedMatchKeys.clear(); - linkedMatchKeySet.clear(); linkedMatchValues.clear(); linkedMatchValuesIdx.clear(); + keyIndexMap.clear(); size_t currPos = curFrameMatches[0]->qInfo.pos; uint64_t frame = curFrameMatches[0]->qInfo.frame; @@ -480,7 +480,6 @@ void Taxonomer::remainConsecutiveMatches(const vector & curFrameM } if (found) { linkedMatchKeys.push_back(curPosMatch); - linkedMatchKeySet.insert(curPosMatch); linkedMatchValuesIdx.push_back(startIdx); } } @@ -515,7 +514,6 @@ void Taxonomer::remainConsecutiveMatches(const vector & curFrameM } if (found) { linkedMatchKeys.push_back(curPosMatch); - linkedMatchKeySet.insert(curPosMatch); linkedMatchValuesIdx.push_back(startIdx); } } @@ -527,6 +525,10 @@ void Taxonomer::remainConsecutiveMatches(const vector & curFrameM } } + for (size_t i = 0; i < linkedMatchKeys.size(); ++i) { + keyIndexMap[linkedMatchKeys[i]] = i; + } + // Iterate linkedMatches to get filteredMatches // (ignore matches not enoughly consecutive) size_t MIN_DEPTH = minConsCnt - 1; @@ -550,6 +552,7 @@ void Taxonomer::remainConsecutiveMatches(const vector & curFrameM depthScore curPath = DFS(curFrameMatches, linkedMatchValues[j], linkedMatchKeys, + keyIndexMap, linkedMatchValues, linkedMatchValuesIdx, 1, @@ -578,7 +581,7 @@ depthScore Taxonomer::DFS( const vector &matches, const Match *curMatch, const vector &linkedMatchesKeys, - const unordered_set &linkedMatchKeySet, + const unordered_map & keyIndexMap, const vector &linkedMatchesValues, const vector &linkedMatchesIndices, size_t depth, size_t MIN_DEPTH, @@ -592,7 +595,8 @@ depthScore Taxonomer::DFS( depthScore curDepthScore; float receivedScore = score; - if (linkedMatchKeySet.find(curMatch) == linkedMatchKeySet.end()) { // Reached a leaf node + auto it = keyIndexMap.find(curMatch); + if (it == keyIndexMap.end()) { // Reached a leaf node uint8_t lastEndHamming = (curMatch->rightEndHamming >> 14); if (lastEndHamming == 0) { score += 3.0f; @@ -602,7 +606,7 @@ depthScore Taxonomer::DFS( match2depthScore[curMatch] = depthScore(1, score - receivedScore, lastEndHamming, curMatch); return depthScore(depth, score, hammingDist + lastEndHamming, curMatch); } else { // Not a leaf node - size_t index = it - linkedMatchesKeys.begin(); + size_t index = it->second; size_t startIdx = linkedMatchesIndices[index]; size_t endIdx = (index + 1 < linkedMatchesIndices.size()) ? linkedMatchesIndices[index + 1] : linkedMatchesValues.size(); @@ -622,7 +626,7 @@ depthScore Taxonomer::DFS( returnDepthScore.hammingDist + hammingDist + lastEndHamming, returnDepthScore.endMatch); } else { - curDepthScore = DFS(matches, nextMatch, linkedMatchesKeys, linkedMatchesValues, linkedMatchesIndices, depth, MIN_DEPTH, used, match2depthScore, score, hammingDist + lastEndHamming); + curDepthScore = DFS(matches, nextMatch, linkedMatchesKeys, keyIndexMap, linkedMatchesValues, linkedMatchesIndices, depth, MIN_DEPTH, used, match2depthScore, score, hammingDist + lastEndHamming); } if (curDepthScore.score > bestDepthScore.score && curDepthScore.depth > MIN_DEPTH) { bestDepthScore = curDepthScore; diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h index 93484a3b..8e29e193 100644 --- a/src/commons/Taxonomer.h +++ b/src/commons/Taxonomer.h @@ -90,7 +90,7 @@ class Taxonomer { vector curPosMatches; vector nextPosMatches; vector linkedMatchKeys; - unordered_set linkedMatchKeySet; + unordered_map keyIndexMap; vector linkedMatchValues; vector linkedMatchValuesIdx; unordered_set used; @@ -105,7 +105,7 @@ class Taxonomer { depthScore DFS(const vector &matches, const Match *curMatch, const vector &linkedMatchesKeys, - const unordered_set &linkedMatchKeySet, + const unordered_map & keyIndexMap, const vector &linkedMatchesValues, const vector &linkedMatchesIndices, size_t depth, size_t MIN_DEPTH, From fb87be73d4afeac6fe53c9f796e539f157282f31 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Tue, 27 Aug 2024 11:55:53 +0900 Subject: [PATCH 13/21] back to 43f1fcb --- src/commons/Taxonomer.cpp | 17 +++++------------ src/commons/Taxonomer.h | 20 +++++++++----------- 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index 2122023d..28d7bbb0 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -47,7 +47,6 @@ Taxonomer::Taxonomer(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxon linkedMatchKeys.reserve(4096); linkedMatchValues.reserve(4096); linkedMatchValuesIdx.reserve(4096); - keyIndexMap.reserve(4096); species2score.reserve(4096); species2matchPaths.reserve(4096); @@ -83,6 +82,7 @@ void Taxonomer::assignTaxonomy(const Match *matchList, matchBlocks[blockIdx].end = matchIdx - 1; blockIdx++; } + cout << "Time spent for spliting matches: " << double(time(nullptr) - beforeAnalyze) << endl; // Process each block #pragma omp parallel default(none), shared(cout, matchBlocks, matchList, seqNum, queryList, blockIdx, par) { @@ -450,7 +450,6 @@ void Taxonomer::remainConsecutiveMatches(const vector & curFrameM linkedMatchKeys.clear(); linkedMatchValues.clear(); linkedMatchValuesIdx.clear(); - keyIndexMap.clear(); size_t currPos = curFrameMatches[0]->qInfo.pos; uint64_t frame = curFrameMatches[0]->qInfo.frame; @@ -525,10 +524,6 @@ void Taxonomer::remainConsecutiveMatches(const vector & curFrameM } } - for (size_t i = 0; i < linkedMatchKeys.size(); ++i) { - keyIndexMap[linkedMatchKeys[i]] = i; - } - // Iterate linkedMatches to get filteredMatches // (ignore matches not enoughly consecutive) size_t MIN_DEPTH = minConsCnt - 1; @@ -552,7 +547,6 @@ void Taxonomer::remainConsecutiveMatches(const vector & curFrameM depthScore curPath = DFS(curFrameMatches, linkedMatchValues[j], linkedMatchKeys, - keyIndexMap, linkedMatchValues, linkedMatchValuesIdx, 1, @@ -581,7 +575,6 @@ depthScore Taxonomer::DFS( const vector &matches, const Match *curMatch, const vector &linkedMatchesKeys, - const unordered_map & keyIndexMap, const vector &linkedMatchesValues, const vector &linkedMatchesIndices, size_t depth, size_t MIN_DEPTH, @@ -595,8 +588,8 @@ depthScore Taxonomer::DFS( depthScore curDepthScore; float receivedScore = score; - auto it = keyIndexMap.find(curMatch); - if (it == keyIndexMap.end()) { // Reached a leaf node + auto it = find(linkedMatchesKeys.begin(), linkedMatchesKeys.end(), curMatch); + if (it == linkedMatchesKeys.end()) { // Reached a leaf node uint8_t lastEndHamming = (curMatch->rightEndHamming >> 14); if (lastEndHamming == 0) { score += 3.0f; @@ -606,7 +599,7 @@ depthScore Taxonomer::DFS( match2depthScore[curMatch] = depthScore(1, score - receivedScore, lastEndHamming, curMatch); return depthScore(depth, score, hammingDist + lastEndHamming, curMatch); } else { // Not a leaf node - size_t index = it->second; + size_t index = it - linkedMatchesKeys.begin(); size_t startIdx = linkedMatchesIndices[index]; size_t endIdx = (index + 1 < linkedMatchesIndices.size()) ? linkedMatchesIndices[index + 1] : linkedMatchesValues.size(); @@ -626,7 +619,7 @@ depthScore Taxonomer::DFS( returnDepthScore.hammingDist + hammingDist + lastEndHamming, returnDepthScore.endMatch); } else { - curDepthScore = DFS(matches, nextMatch, linkedMatchesKeys, keyIndexMap, linkedMatchesValues, linkedMatchesIndices, depth, MIN_DEPTH, used, match2depthScore, score, hammingDist + lastEndHamming); + curDepthScore = DFS(matches, nextMatch, linkedMatchesKeys, linkedMatchesValues, linkedMatchesIndices, depth, MIN_DEPTH, used, match2depthScore, score, hammingDist + lastEndHamming); } if (curDepthScore.score > bestDepthScore.score && curDepthScore.depth > MIN_DEPTH) { bestDepthScore = curDepthScore; diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h index 8e29e193..91082cdd 100644 --- a/src/commons/Taxonomer.h +++ b/src/commons/Taxonomer.h @@ -90,7 +90,6 @@ class Taxonomer { vector curPosMatches; vector nextPosMatches; vector linkedMatchKeys; - unordered_map keyIndexMap; vector linkedMatchValues; vector linkedMatchValuesIdx; unordered_set used; @@ -103,15 +102,14 @@ class Taxonomer { unordered_map taxCounts; depthScore DFS(const vector &matches, - const Match *curMatch, - const vector &linkedMatchesKeys, - const unordered_map & keyIndexMap, - const vector &linkedMatchesValues, - const vector &linkedMatchesIndices, - size_t depth, size_t MIN_DEPTH, - unordered_set &used, - unordered_map &match2depthScore, - float score, int hammingDist); + const Match *curMatch, + const vector &linkedMatchesKeys, + const vector &linkedMatchesValues, + const vector &linkedMatchesIndices, + size_t depth, size_t MIN_DEPTH, + unordered_set &used, + unordered_map &match2depthScore, + float score, int hammingDist); public: @@ -222,4 +220,4 @@ class Taxonomer { }; -#endif //METABULI_TAXONOMER_H +#endif //METABULI_TAXONOMER_H \ No newline at end of file From dd0ca5f67faa59fe0c863a5c70434cee54afc9f8 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Tue, 27 Aug 2024 14:34:19 +0900 Subject: [PATCH 14/21] Reducing use of unordered_map --- src/commons/Taxonomer.cpp | 120 +++++++++++++++++++++----------------- src/commons/Taxonomer.h | 19 +++--- 2 files changed, 77 insertions(+), 62 deletions(-) diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index 28d7bbb0..aea9f42b 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -37,24 +37,35 @@ Taxonomer::Taxonomer(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxon denominator = 1000; } - // Reserve memoory for internal containers + + // chooseBestTaxon + taxCnt.reserve(4096); + + // getBestSpeciesMatches speciesMatches.reserve(4096); curFrameMatches.reserve(4096); matchPaths.reserve(4096); + combinedMatchPaths.reserve(4096); maxSpecies.reserve(4096); + speciesList.reserve(4096); + speciesPathIdx.reserve(4096); + speciesCombPathIdx.reserve(4096); + speciesScores.reserve(4096); + + // remainConsecutiveMatches curPosMatches.reserve(4096); nextPosMatches.reserve(4096); linkedMatchKeys.reserve(4096); linkedMatchValues.reserve(4096); linkedMatchValuesIdx.reserve(4096); - - species2score.reserve(4096); - species2matchPaths.reserve(4096); - speciesMatchRange.reserve(4096); - taxCnt.reserve(4096); - cladeCnt.reserve(4096); used.reserve(4096); - idx2depthScore.reserve(4096); + idx2depthScore.reserve(4096); + + // lowerRankClassification + cladeCnt.reserve(4096); + + // Output + taxCounts.reserve(4096); } Taxonomer::~Taxonomer() { @@ -179,16 +190,16 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, queryList[currentQuery].newSpecies = false; } -void Taxonomer::filterRedundantMatches(vector & speciesMatches, +void Taxonomer::filterRedundantMatches(vector & speciesMatches, unordered_map & taxCnt) { // Sort matches by the coordinate on the query sort(speciesMatches.begin(), speciesMatches.end(), - [](const Match & a, const Match & b) { return a.qInfo.pos < b.qInfo.pos; }); + [](const Match * a, const Match * b) { return a->qInfo.pos < b->qInfo.pos; }); // Remove redundant matches size_t matchNum = speciesMatches.size(); for (size_t i = 0; i < matchNum;) { - size_t currQuotient = speciesMatches[i].qInfo.pos / 3; + size_t currQuotient = speciesMatches[i]->qInfo.pos / 3; uint8_t minHamming = speciesMatches[i].hamming; Match minHammingMatch = speciesMatches[i]; TaxID minHammingTaxId = minHammingMatch.targetId; @@ -276,16 +287,21 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector & speciesMatches, size_t offset, int queryLength) { matchPaths.clear(); - species2score.clear(); - species2matchPaths.clear(); - speciesMatchRange.clear(); + combinedMatchPaths.clear(); + speciesList.clear(); + speciesPathIdx.clear(); + speciesCombPathIdx.clear(); + speciesScores.clear(); + pair bestSpeciesRange; TaxonScore bestScore; float bestSpScore = 0; size_t i = offset; + size_t meaningfulSpecies = 0; while (i < end + 1) { TaxID currentSpecies = matchList[i].speciesId; size_t start = i; + size_t previousPathSize = matchPaths.size(); // For current species while ((i < end + 1) && currentSpecies == matchList[i].speciesId) { uint8_t curFrame = matchList[i].qInfo.frame; @@ -299,81 +315,77 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector & speciesMatches, remainConsecutiveMatches(curFrameMatches, matchPaths, currentSpecies); } } - speciesMatchRange[currentSpecies] = make_pair(start, i); + size_t pathSize = matchPaths.size(); // Combine MatchPaths - if (!matchPaths.empty()) { - float score = combineMatchPaths(matchPaths, species2matchPaths[currentSpecies], queryLength); + if (pathSize > previousPathSize) { + speciesList.push_back(currentSpecies); + speciesPathIdx.push_back(previousPathSize); + speciesCombPathIdx.push_back(combinedMatchPaths.size()); + float score = combineMatchPaths(matchPaths, previousPathSize, combinedMatchPaths, combinedMatchPaths.size(), queryLength); score = min(score, 1.0f); + speciesScores.push_back(score); if (score > 0.f) { - species2score[currentSpecies] = score; + meaningfulSpecies++; } if (score > bestSpScore) { bestSpScore = score; + bestSpeciesRange = make_pair(start, i); } } - matchPaths.clear(); + // matchPaths.clear(); } // If there are no meaningful species - if (species2score.empty()) { + if (meaningfulSpecies == 0) { bestScore.score = 0; return bestScore; } maxSpecies.clear(); - for (auto & spScore : species2score) { - if (spScore.second >= bestSpScore * tieRatio) { - maxSpecies.push_back(spScore.first); + float coveredLength = 0.f; + for (size_t i = 0; i < speciesList.size(); i++) { + if (speciesScores[i] >= bestSpScore * tieRatio) { + maxSpecies.push_back(speciesList[i]); + bestScore.score += speciesScores[i]; + // Calculate coverage + // size_t start = speciesCombPathIdx[i]; + // size_t end = (i + 1 < speciesCombPathIdx.size()) ? speciesCombPathIdx[i + 1] : combinedMatchPaths.size(); + // coveredLength = 0; + // for (size_t j = start; j < end; j++) { + // coveredLength += combinedMatchPaths[j].end - combinedMatchPaths[j].start + 1; + // } + // bestScore.coverage += coveredLength / queryLength; } } - - // More than one species --> LCA - float coveredLength = 0.f; + + // More than one species --> LCA if (maxSpecies.size() > 1) { bestScore.LCA = true; bestScore.taxId = taxonomy->LCA(maxSpecies)->taxId; - for (auto & sp : maxSpecies) { - bestScore.score += species2score[sp]; - coveredLength = 0; - for (auto & matchPath : species2matchPaths[maxSpecies[0]]) { - coveredLength += matchPath.end - matchPath.start + 1; - } - bestScore.coverage += coveredLength / queryLength; - } bestScore.score /= maxSpecies.size(); - bestScore.coverage /= maxSpecies.size(); + // bestScore.coverage /= maxSpecies.size(); return bestScore; } - // One species bestScore.taxId = maxSpecies[0]; - bestScore.score = species2score[maxSpecies[0]]; - int hammingDist = 0; - for (auto & matchPath : species2matchPaths[maxSpecies[0]]) { - coveredLength += matchPath.end - matchPath.start + 1; - hammingDist += matchPath.hammingDist; - } - speciesMatches.reserve(speciesMatchRange[bestScore.taxId].second - - speciesMatchRange[bestScore.taxId].first + 1); + speciesMatches.reserve(bestSpeciesRange.second - bestSpeciesRange.first + 1); - for (size_t j = speciesMatchRange[bestScore.taxId].first; j < speciesMatchRange[bestScore.taxId].second; j++) { - speciesMatches.push_back(matchList[j]); + for (size_t j = bestSpeciesRange.first; j < bestSpeciesRange.second; j++) { + speciesMatches.push_back(&matchList[j]); } - bestScore.coverage = coveredLength / queryLength; - bestScore.hammingDist = hammingDist; return bestScore; } float Taxonomer::combineMatchPaths(vector & matchPaths, + size_t matchPathStart, vector & combinedMatchPaths, + size_t combMatchPathStart, int readLength) { - combinedMatchPaths.clear(); - // Sort matchPaths by the their score - sort(matchPaths.begin(), matchPaths.end(), + sort(matchPaths.begin() + matchPathStart, matchPaths.end(), [](const MatchPath &a, const MatchPath &b) { if (a.score != b.score) { return a.score > b.score; @@ -389,13 +401,13 @@ float Taxonomer::combineMatchPaths(vector & matchPaths, // 2. Add the matchPath with the highest score that is not overlapped with the matchPath in combinedMatchPaths // 3. Repeat 2 until no matchPath can be added float score = 0; - for (size_t i = 0; i < matchPaths.size(); i++) { - if (combinedMatchPaths.empty()) { + for (size_t i = matchPathStart; i < matchPaths.size(); i++) { + if (combMatchPathStart == combinedMatchPaths.size()) { combinedMatchPaths.push_back(matchPaths[i]); score += matchPaths[i].score; } else { bool isOverlapped = false; - for (size_t j = 0; j < combinedMatchPaths.size(); j++) { + for (size_t j = combMatchPathStart; j < combinedMatchPaths.size(); j++) { if (isMatchPathOverlapped(matchPaths[i], combinedMatchPaths[j])) { // overlap! int overlappedLength = min(matchPaths[i].end, combinedMatchPaths[j].end) - max(matchPaths[i].start, combinedMatchPaths[j].start) + 1; diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h index 91082cdd..789a6358 100644 --- a/src/commons/Taxonomer.h +++ b/src/commons/Taxonomer.h @@ -42,7 +42,6 @@ struct MatchPath { int hammingDist; const Match * startMatch; const Match * endMatch; - }; struct MatchBlock { @@ -73,7 +72,7 @@ class Taxonomer { // Internal int denominator; - vector speciesMatches; + vector speciesMatches; // chooseBestTaxon unordered_map taxCnt; @@ -81,10 +80,12 @@ class Taxonomer { // getBestSpeciesMatches vector curFrameMatches; vector matchPaths; + vector combinedMatchPaths; vector maxSpecies; - unordered_map species2score; - unordered_map> species2matchPaths; - unordered_map> speciesMatchRange; + vector speciesList; + vector speciesPathIdx; + vector speciesCombPathIdx; + vector speciesScores; // remainConsecutiveMatches vector curPosMatches; @@ -134,8 +135,10 @@ class Taxonomer { TaxID speciesID); float combineMatchPaths(vector & matchPaths, - vector & combinedMatchPaths, - int readLength); + size_t matchPathStart, + vector & combMatchPaths, + size_t combMatchPathStart, + int readLength); bool isMatchPathOverlapped(const MatchPath & matchPath1, const MatchPath & matchPath2); @@ -152,7 +155,7 @@ class Taxonomer { static bool isConsecutive_diffFrame(const Match * match1, const Match * match2); - TaxonScore getBestSpeciesMatches(vector &speciesMatches, const Match *matchList, size_t end, + TaxonScore getBestSpeciesMatches(vector &speciesMatches, const Match *matchList, size_t end, size_t offset, int queryLength); // TaxonScore getBestSpeciesMatches(vector &speciesMatches, const Match *matchList, size_t end, From 9aa02a25c6d53c833727ecebe64d4bfb149c9e28 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Tue, 27 Aug 2024 14:38:43 +0900 Subject: [PATCH 15/21] from vector speciesMathces to vector speciesMatches --- src/commons/Taxonomer.cpp | 18 +++++++++--------- src/commons/Taxonomer.h | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index aea9f42b..316c6f77 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -200,16 +200,16 @@ void Taxonomer::filterRedundantMatches(vector & speciesMatches, size_t matchNum = speciesMatches.size(); for (size_t i = 0; i < matchNum;) { size_t currQuotient = speciesMatches[i]->qInfo.pos / 3; - uint8_t minHamming = speciesMatches[i].hamming; - Match minHammingMatch = speciesMatches[i]; + uint8_t minHamming = speciesMatches[i]->hamming; + Match minHammingMatch = *speciesMatches[i]; TaxID minHammingTaxId = minHammingMatch.targetId; - while ((i < matchNum) && (currQuotient == speciesMatches[i].qInfo.pos / 3)) { - if (speciesMatches[i].hamming < minHamming) { - minHamming = speciesMatches[i].hamming; - minHammingMatch = speciesMatches[i]; + while ((i < matchNum) && (currQuotient == speciesMatches[i]->qInfo.pos / 3)) { + if (speciesMatches[i]->hamming < minHamming) { + minHamming = speciesMatches[i]->hamming; + minHammingMatch = *speciesMatches[i]; minHammingTaxId = minHammingMatch.targetId; - } else if (speciesMatches[i].hamming == minHamming) { - minHammingTaxId = taxonomy->LCA(minHammingTaxId, speciesMatches[i].targetId); + } else if (speciesMatches[i]->hamming == minHamming) { + minHammingTaxId = taxonomy->LCA(minHammingTaxId, speciesMatches[i]->targetId); } i++; } @@ -281,7 +281,7 @@ TaxID Taxonomer::BFS(const unordered_map & cladeCnt, TaxID r } } -TaxonScore Taxonomer::getBestSpeciesMatches(vector & speciesMatches, +TaxonScore Taxonomer::getBestSpeciesMatches(vector & speciesMatches, const Match *matchList, size_t end, size_t offset, diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h index 789a6358..d47ff861 100644 --- a/src/commons/Taxonomer.h +++ b/src/commons/Taxonomer.h @@ -148,7 +148,7 @@ class Taxonomer { void trimMatchPath(MatchPath & path1, const MatchPath & path2, int overlapLength); - void filterRedundantMatches(vector & speciesMatches, + void filterRedundantMatches(vector & speciesMatches, unordered_map & taxCnt); static bool isConsecutive(const Match * match1, const Match * match2); From 7aebb7ad0a69e18d0593c1d74c095fe7262c7d54 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Tue, 27 Aug 2024 19:08:09 +0900 Subject: [PATCH 16/21] Removed std::unordered_set visited. --- src/commons/Taxonomer.cpp | 23 ++++++++--------------- src/commons/Taxonomer.h | 23 ++++++++++------------- 2 files changed, 18 insertions(+), 28 deletions(-) diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index 316c6f77..378f2b93 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -58,8 +58,7 @@ Taxonomer::Taxonomer(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxon linkedMatchKeys.reserve(4096); linkedMatchValues.reserve(4096); linkedMatchValuesIdx.reserve(4096); - used.reserve(4096); - idx2depthScore.reserve(4096); + match2depthScore.reserve(4096); // lowerRankClassification cladeCnt.reserve(4096); @@ -239,8 +238,8 @@ TaxID Taxonomer::lowerRankClassification(const unordered_map &taxCnt, - unordered_map & cladeCount, - TaxID speciesTaxID) { + unordered_map & cladeCount, + TaxID speciesTaxID) { for (auto it = taxCnt.begin(); it != taxCnt.end(); ++it) { TaxonNode const * taxon = taxonomy->taxonNode(it->first); cladeCount[taxon->taxId].taxCount = it->second; @@ -543,19 +542,16 @@ void Taxonomer::remainConsecutiveMatches(const vector & curFrameM MIN_DEPTH = minConsCntEuk - 1; } - used.clear(); - idx2depthScore.clear(); + match2depthScore.clear(); for (size_t k = 0; k < linkedMatchKeys.size(); k++) { const Match * key = linkedMatchKeys[k]; size_t startIdx = linkedMatchValuesIdx[k]; size_t endIdx = (k + 1 < linkedMatchKeys.size()) ? linkedMatchValuesIdx[k + 1] : linkedMatchValues.size(); - if (!used.count(key)) { - used.insert(key); + if (match2depthScore.find(key) == match2depthScore.end()) { depthScore bestPath{}; for (size_t j = startIdx; j < endIdx; j++) { - used.insert(linkedMatchValues[j]); depthScore curPath = DFS(curFrameMatches, linkedMatchValues[j], linkedMatchKeys, @@ -563,15 +559,14 @@ void Taxonomer::remainConsecutiveMatches(const vector & curFrameM linkedMatchValuesIdx, 1, MIN_DEPTH, - used, - idx2depthScore, + match2depthScore, key->getScore(), key->hamming); if (curPath.score > bestPath.score && curPath.depth > MIN_DEPTH) { bestPath = curPath; } } - // Store the best path + match2depthScore[key] = bestPath; if (bestPath.depth > MIN_DEPTH) { matchPaths.emplace_back(key->qInfo.pos, // start coordinate on query key->qInfo.pos + bestPath.depth * 3 + 20, // end coordinate on query @@ -590,7 +585,6 @@ depthScore Taxonomer::DFS( const vector &linkedMatchesValues, const vector &linkedMatchesIndices, size_t depth, size_t MIN_DEPTH, - unordered_set &used, unordered_map &match2depthScore, float score, int hammingDist) { @@ -623,7 +617,6 @@ depthScore Taxonomer::DFS( } for (size_t i = startIdx; i < endIdx; ++i) { const Match *nextMatch = linkedMatchesValues[i]; - used.insert(nextMatch); if (match2depthScore.find(nextMatch) != match2depthScore.end()) { returnDepthScore = match2depthScore[nextMatch]; curDepthScore = depthScore(returnDepthScore.depth + depth, @@ -631,7 +624,7 @@ depthScore Taxonomer::DFS( returnDepthScore.hammingDist + hammingDist + lastEndHamming, returnDepthScore.endMatch); } else { - curDepthScore = DFS(matches, nextMatch, linkedMatchesKeys, linkedMatchesValues, linkedMatchesIndices, depth, MIN_DEPTH, used, match2depthScore, score, hammingDist + lastEndHamming); + curDepthScore = DFS(matches, nextMatch, linkedMatchesKeys, linkedMatchesValues, linkedMatchesIndices, depth, MIN_DEPTH, match2depthScore, score, hammingDist + lastEndHamming); } if (curDepthScore.score > bestDepthScore.score && curDepthScore.depth > MIN_DEPTH) { bestDepthScore = curDepthScore; diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h index d47ff861..dd9b50c4 100644 --- a/src/commons/Taxonomer.h +++ b/src/commons/Taxonomer.h @@ -93,31 +93,28 @@ class Taxonomer { vector linkedMatchKeys; vector linkedMatchValues; vector linkedMatchValuesIdx; - unordered_set used; - unordered_map idx2depthScore; + unordered_map match2depthScore; // lowerRankClassification unordered_map cladeCnt; // Output - unordered_map taxCounts; + unordered_map taxCounts; - depthScore DFS(const vector &matches, - const Match *curMatch, - const vector &linkedMatchesKeys, - const vector &linkedMatchesValues, - const vector &linkedMatchesIndices, - size_t depth, size_t MIN_DEPTH, - unordered_set &used, - unordered_map &match2depthScore, - float score, int hammingDist); + depthScore DFS(const vector &matches, + const Match *curMatch, + const vector &linkedMatchesKeys, + const vector &linkedMatchesValues, + const vector &linkedMatchesIndices, + size_t depth, size_t MIN_DEPTH, + unordered_map &match2depthScore, + float score, int hammingDist); public: Taxonomer(const LocalParameters & par, NcbiTaxonomy * taxonomy); ~Taxonomer(); - void assignTaxonomy(const Match *matchList, size_t numOfMatches, std::vector & queryList, From a40311218cdf7f78292dc89560cd688b87c5a7da Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Tue, 27 Aug 2024 19:52:56 +0900 Subject: [PATCH 17/21] removed a vector curFrameMatches --- src/commons/Taxonomer.cpp | 47 +++++++++++++++++---------------------- src/commons/Taxonomer.h | 10 ++++----- 2 files changed, 26 insertions(+), 31 deletions(-) diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index 378f2b93..508e4e99 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -43,7 +43,6 @@ Taxonomer::Taxonomer(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxon // getBestSpeciesMatches speciesMatches.reserve(4096); - curFrameMatches.reserve(4096); matchPaths.reserve(4096); combinedMatchPaths.reserve(4096); maxSpecies.reserve(4096); @@ -304,14 +303,13 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector & speciesMatc // For current species while ((i < end + 1) && currentSpecies == matchList[i].speciesId) { uint8_t curFrame = matchList[i].qInfo.frame; - curFrameMatches.clear(); + size_t start = i; // For current frame while ((i < end + 1) && currentSpecies == matchList[i].speciesId && curFrame == matchList[i].qInfo.frame) { - curFrameMatches.push_back(&matchList[i]); i ++; } - if (curFrameMatches.size() > 1) { - remainConsecutiveMatches(curFrameMatches, matchPaths, currentSpecies); + if (i - start >= minConsCnt) { + remainConsecutiveMatches(matchList, start, i, matchPaths, currentSpecies); } } size_t pathSize = matchPaths.size(); @@ -451,28 +449,29 @@ void Taxonomer::trimMatchPath(MatchPath & path1, const MatchPath & path2, int ov } } -void Taxonomer::remainConsecutiveMatches(const vector & curFrameMatches, +void Taxonomer::remainConsecutiveMatches(const Match * matchList, + size_t start, + size_t end, vector & matchPaths, TaxID speciesId) { - size_t i = 0; - size_t end = curFrameMatches.size(); + size_t i = start; curPosMatches.clear(); nextPosMatches.clear(); linkedMatchKeys.clear(); linkedMatchValues.clear(); linkedMatchValuesIdx.clear(); - size_t currPos = curFrameMatches[0]->qInfo.pos; - uint64_t frame = curFrameMatches[0]->qInfo.frame; + size_t currPos = matchList[start].qInfo.pos; + uint64_t frame = matchList[start].qInfo.frame; if (frame < 3) { // Forward frame - while ( i < end && curFrameMatches[i]->qInfo.pos == currPos) { - curPosMatches.emplace_back(curFrameMatches[i]); + while ( i < end && matchList[i].qInfo.pos == currPos) { + curPosMatches.emplace_back(matchList + i); i++; } while (i < end) { - uint32_t nextPos = curFrameMatches[i]->qInfo.pos; - while (i < end && nextPos == curFrameMatches[i]->qInfo.pos) { - nextPosMatches.emplace_back(curFrameMatches[i]); + uint32_t nextPos = matchList[i].qInfo.pos; + while (i < end && nextPos == matchList[i].qInfo.pos) { + nextPosMatches.emplace_back(matchList + i); ++ i; } // Check if current position and next position are consecutive @@ -485,7 +484,6 @@ void Taxonomer::remainConsecutiveMatches(const vector & curFrameM if (isConsecutive(curPosMatch, nextPosMatch)){ linkedMatchValues.push_back(nextPosMatch); found = true; - // linkedMatches[curPosMatch].push_back(nextPosMatch); } } if (found) { @@ -499,14 +497,14 @@ void Taxonomer::remainConsecutiveMatches(const vector & curFrameM currPos = nextPos; } } else { - while ( i < end && curFrameMatches[i]->qInfo.pos == currPos) { - curPosMatches.emplace_back(curFrameMatches[i]); + while ( i < end && matchList[i].qInfo.pos == currPos) { + curPosMatches.emplace_back(matchList + i); i++; } while (i < end) { - uint32_t nextPos = curFrameMatches[i]->qInfo.pos; - while (i < end && nextPos == curFrameMatches[i]->qInfo.pos) { - nextPosMatches.emplace_back(curFrameMatches[i]); + uint32_t nextPos = matchList[i].qInfo.pos; + while (i < end && nextPos == matchList[i].qInfo.pos) { + nextPosMatches.emplace_back(matchList + i); ++ i; } // Check if current position and next position are consecutive @@ -519,7 +517,6 @@ void Taxonomer::remainConsecutiveMatches(const vector & curFrameM if (isConsecutive(nextPosMatch, curPosMatch)){ linkedMatchValues.push_back(nextPosMatch); found = true; - // linkedMatches[curPosMatch].push_back(nextPosMatch); } } if (found) { @@ -552,8 +549,7 @@ void Taxonomer::remainConsecutiveMatches(const vector & curFrameM if (match2depthScore.find(key) == match2depthScore.end()) { depthScore bestPath{}; for (size_t j = startIdx; j < endIdx; j++) { - depthScore curPath = DFS(curFrameMatches, - linkedMatchValues[j], + depthScore curPath = DFS(linkedMatchValues[j], linkedMatchKeys, linkedMatchValues, linkedMatchValuesIdx, @@ -579,7 +575,6 @@ void Taxonomer::remainConsecutiveMatches(const vector & curFrameM } depthScore Taxonomer::DFS( - const vector &matches, const Match *curMatch, const vector &linkedMatchesKeys, const vector &linkedMatchesValues, @@ -624,7 +619,7 @@ depthScore Taxonomer::DFS( returnDepthScore.hammingDist + hammingDist + lastEndHamming, returnDepthScore.endMatch); } else { - curDepthScore = DFS(matches, nextMatch, linkedMatchesKeys, linkedMatchesValues, linkedMatchesIndices, depth, MIN_DEPTH, match2depthScore, score, hammingDist + lastEndHamming); + curDepthScore = DFS(nextMatch, linkedMatchesKeys, linkedMatchesValues, linkedMatchesIndices, depth, MIN_DEPTH, match2depthScore, score, hammingDist + lastEndHamming); } if (curDepthScore.score > bestDepthScore.score && curDepthScore.depth > MIN_DEPTH) { bestDepthScore = curDepthScore; diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h index dd9b50c4..2a9ced39 100644 --- a/src/commons/Taxonomer.h +++ b/src/commons/Taxonomer.h @@ -78,7 +78,6 @@ class Taxonomer { unordered_map taxCnt; // getBestSpeciesMatches - vector curFrameMatches; vector matchPaths; vector combinedMatchPaths; vector maxSpecies; @@ -101,8 +100,7 @@ class Taxonomer { // Output unordered_map taxCounts; - depthScore DFS(const vector &matches, - const Match *curMatch, + depthScore DFS(const Match *curMatch, const vector &linkedMatchesKeys, const vector &linkedMatchesValues, const vector &linkedMatchesIndices, @@ -127,9 +125,11 @@ class Taxonomer { vector & queryList, const LocalParameters &par); - void remainConsecutiveMatches(const vector & curFrameMatches, + void remainConsecutiveMatches(const Match * matchList, + size_t start, + size_t end, vector & matchPaths, - TaxID speciesID); + TaxID speciesId); float combineMatchPaths(vector & matchPaths, size_t matchPathStart, From 5f754bf19948c12a0b7cf17640fbc5f9c6ff7755 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Tue, 27 Aug 2024 20:19:29 +0900 Subject: [PATCH 18/21] removed vectors (curPosMatches, nextPosMatches) in remainConsecutiveMatches --- src/commons/Taxonomer.cpp | 50 +++++++++++++++++++++++---------------- src/commons/Taxonomer.h | 2 -- 2 files changed, 29 insertions(+), 23 deletions(-) diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index 508e4e99..e1c07c6a 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -52,8 +52,6 @@ Taxonomer::Taxonomer(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxon speciesScores.reserve(4096); // remainConsecutiveMatches - curPosMatches.reserve(4096); - nextPosMatches.reserve(4096); linkedMatchKeys.reserve(4096); linkedMatchValues.reserve(4096); linkedMatchValuesIdx.reserve(4096); @@ -455,8 +453,6 @@ void Taxonomer::remainConsecutiveMatches(const Match * matchList, vector & matchPaths, TaxID speciesId) { size_t i = start; - curPosMatches.clear(); - nextPosMatches.clear(); linkedMatchKeys.clear(); linkedMatchValues.clear(); linkedMatchValuesIdx.clear(); @@ -464,70 +460,82 @@ void Taxonomer::remainConsecutiveMatches(const Match * matchList, size_t currPos = matchList[start].qInfo.pos; uint64_t frame = matchList[start].qInfo.frame; if (frame < 3) { // Forward frame - while ( i < end && matchList[i].qInfo.pos == currPos) { - curPosMatches.emplace_back(matchList + i); - i++; + size_t curPosMatchStart = i; + while (i < end && matchList[i].qInfo.pos == currPos) { + ++ i; } + size_t curPosMatchEnd = i; // exclusive + while (i < end) { uint32_t nextPos = matchList[i].qInfo.pos; + size_t nextPosMatchStart = i; while (i < end && nextPos == matchList[i].qInfo.pos) { - nextPosMatches.emplace_back(matchList + i); ++ i; } + size_t nextPosMatchEnd = i; // exclusive + // Check if current position and next position are consecutive if (currPos + 3 == nextPos) { // Compare curPosMatches and nextPosMatches - for (auto &curPosMatch: curPosMatches) { + for (size_t curIdx = curPosMatchStart; curIdx < curPosMatchEnd; ++curIdx) { size_t startIdx = linkedMatchValues.size(); bool found = false; - for (auto &nextPosMatch: nextPosMatches) { - if (isConsecutive(curPosMatch, nextPosMatch)){ - linkedMatchValues.push_back(nextPosMatch); + for (size_t nextIdx = nextPosMatchStart; nextIdx < nextPosMatchEnd; nextIdx++) { + if (isConsecutive(matchList + curIdx, matchList + nextIdx)){ + linkedMatchValues.push_back(matchList + nextIdx); found = true; } } if (found) { - linkedMatchKeys.push_back(curPosMatch); + linkedMatchKeys.push_back(matchList + curIdx); linkedMatchValuesIdx.push_back(startIdx); } } } // Update curPosMatches and nextPosMatches - curPosMatches = std::move(nextPosMatches); + curPosMatchStart = nextPosMatchStart; + curPosMatchEnd = nextPosMatchEnd; currPos = nextPos; } } else { + size_t curPosMatchStart = i; + while ( i < end && matchList[i].qInfo.pos == currPos) { curPosMatches.emplace_back(matchList + i); i++; } + size_t curPosMatchEnd = i; // exclusive + while (i < end) { uint32_t nextPos = matchList[i].qInfo.pos; + size_t nextPosMatchStart = i; while (i < end && nextPos == matchList[i].qInfo.pos) { nextPosMatches.emplace_back(matchList + i); ++ i; } + size_t nextPosMatchEnd = i; // exclusive + // Check if current position and next position are consecutive if (currPos + 3 == nextPos) { // Compare curPosMatches and nextPosMatches - for (auto &curPosMatch: curPosMatches) { + for (size_t curIdx = curPosMatchStart; curIdx < curPosMatchEnd; ++curIdx) { size_t startIdx = linkedMatchValues.size(); bool found = false; - for (auto &nextPosMatch: nextPosMatches) { - if (isConsecutive(nextPosMatch, curPosMatch)){ - linkedMatchValues.push_back(nextPosMatch); + for (size_t nextIdx = nextPosMatchStart; nextIdx < nextPosMatchEnd; nextIdx++) { + if (isConsecutive(matchList + nextIdx, matchList + curIdx)){ + linkedMatchValues.push_back(matchList + nextIdx); found = true; } } if (found) { - linkedMatchKeys.push_back(curPosMatch); + linkedMatchKeys.push_back(matchList + curIdx); linkedMatchValuesIdx.push_back(startIdx); } } - } // Update curPosMatches and nextPosMatches - curPosMatches = std::move(nextPosMatches); + curPosMatchStart = nextPosMatchStart; + curPosMatchEnd = nextPosMatchEnd; currPos = nextPos; } } diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h index 2a9ced39..ac5c2abe 100644 --- a/src/commons/Taxonomer.h +++ b/src/commons/Taxonomer.h @@ -87,8 +87,6 @@ class Taxonomer { vector speciesScores; // remainConsecutiveMatches - vector curPosMatches; - vector nextPosMatches; vector linkedMatchKeys; vector linkedMatchValues; vector linkedMatchValuesIdx; From 8dfd406acafa54d18062feb1058d2aa14475ff72 Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Tue, 27 Aug 2024 20:29:00 +0900 Subject: [PATCH 19/21] removed vectors (curPosMatches, nextPosMatches) in remainConsecutiveMatches --- src/commons/Taxonomer.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index e1c07c6a..5808a472 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -501,7 +501,6 @@ void Taxonomer::remainConsecutiveMatches(const Match * matchList, size_t curPosMatchStart = i; while ( i < end && matchList[i].qInfo.pos == currPos) { - curPosMatches.emplace_back(matchList + i); i++; } size_t curPosMatchEnd = i; // exclusive @@ -510,7 +509,6 @@ void Taxonomer::remainConsecutiveMatches(const Match * matchList, uint32_t nextPos = matchList[i].qInfo.pos; size_t nextPosMatchStart = i; while (i < end && nextPos == matchList[i].qInfo.pos) { - nextPosMatches.emplace_back(matchList + i); ++ i; } size_t nextPosMatchEnd = i; // exclusive From e990156549ae22dc3c036656e79df850c928525e Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Wed, 28 Aug 2024 14:00:55 +0900 Subject: [PATCH 20/21] stop using 'vector speciesMatches' + optimize filterRedundantMatches --- src/commons/Taxonomer.cpp | 101 ++++++++++++++++++++++++-------------- src/commons/Taxonomer.h | 22 ++++++--- 2 files changed, 80 insertions(+), 43 deletions(-) diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index 5808a472..911b975d 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -42,7 +42,6 @@ Taxonomer::Taxonomer(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxon taxCnt.reserve(4096); // getBestSpeciesMatches - speciesMatches.reserve(4096); matchPaths.reserve(4096); combinedMatchPaths.reserve(4096); maxSpecies.reserve(4096); @@ -60,12 +59,21 @@ Taxonomer::Taxonomer(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxon // lowerRankClassification cladeCnt.reserve(4096); + // filterRedundantMatches + arraySize_filterRedundantMatches = 4096; + bestMatchForQuotient = new const Match*[arraySize_filterRedundantMatches](); + bestMatchTaxIdForQuotient = new TaxID[arraySize_filterRedundantMatches]; + minHammingForQuotient = new uint8_t[arraySize_filterRedundantMatches]; + + // Output taxCounts.reserve(4096); } Taxonomer::~Taxonomer() { - + delete[] bestMatchForQuotient; + delete[] bestMatchTaxIdForQuotient; + delete[] minHammingForQuotient; } void Taxonomer::assignTaxonomy(const Match *matchList, @@ -119,13 +127,9 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, vector & queryList, const LocalParameters &par) { // Get the best species and its matches for the current query - if (speciesMatches.size() < end - offset + 1) { - speciesMatches.reserve(end - offset + 1); - } - speciesMatches.clear(); - TaxonScore speciesScore(0, 0, 0, 0, 0); - speciesScore = getBestSpeciesMatches(speciesMatches, + std::pair bestSpeciesRange; + speciesScore = getBestSpeciesMatches(bestSpeciesRange, matchList, end, offset, @@ -154,7 +158,10 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, // Filter redundant matches taxCnt.clear(); - filterRedundantMatches(speciesMatches, taxCnt); + filterRedundantMatches(matchList, + bestSpeciesRange, + taxCnt, + queryList[currentQuery].queryLength + queryList[currentQuery].queryLength2); for (auto & tax : taxCnt) { queryList[currentQuery].taxCnt[tax.first] = tax.second; } @@ -186,30 +193,41 @@ void Taxonomer::chooseBestTaxon(uint32_t currentQuery, queryList[currentQuery].newSpecies = false; } -void Taxonomer::filterRedundantMatches(vector & speciesMatches, - unordered_map & taxCnt) { - // Sort matches by the coordinate on the query - sort(speciesMatches.begin(), speciesMatches.end(), - [](const Match * a, const Match * b) { return a->qInfo.pos < b->qInfo.pos; }); +void Taxonomer::filterRedundantMatches(const Match *matchList, + const std::pair & bestSpeciesRange, + unordered_map & taxCnt, + int queryLength) { + // Determine the maximum quotient we need to handle + size_t maxQuotient = (queryLength + 3) / 3; - // Remove redundant matches - size_t matchNum = speciesMatches.size(); - for (size_t i = 0; i < matchNum;) { - size_t currQuotient = speciesMatches[i]->qInfo.pos / 3; - uint8_t minHamming = speciesMatches[i]->hamming; - Match minHammingMatch = *speciesMatches[i]; - TaxID minHammingTaxId = minHammingMatch.targetId; - while ((i < matchNum) && (currQuotient == speciesMatches[i]->qInfo.pos / 3)) { - if (speciesMatches[i]->hamming < minHamming) { - minHamming = speciesMatches[i]->hamming; - minHammingMatch = *speciesMatches[i]; - minHammingTaxId = minHammingMatch.targetId; - } else if (speciesMatches[i]->hamming == minHamming) { - minHammingTaxId = taxonomy->LCA(minHammingTaxId, speciesMatches[i]->targetId); + ensureArraySize(maxQuotient + 1); + + // std::fill_n(bestMatchTaxIdForQuotient, maxQuotient + 1, 0); + + for (size_t i = bestSpeciesRange.first; i < bestSpeciesRange.second; i ++) { + size_t currQuotient = matchList[i].qInfo.pos / 3; + uint8_t hamming = matchList[i].hamming; + + if (bestMatchForQuotient[currQuotient] == nullptr) { + bestMatchForQuotient[currQuotient] = matchList + i; + bestMatchTaxIdForQuotient[currQuotient] = matchList[i].targetId; + minHammingForQuotient[currQuotient] = hamming; + } else { + if (hamming < minHammingForQuotient[currQuotient]) { + bestMatchForQuotient[currQuotient] = matchList + i; + bestMatchTaxIdForQuotient[currQuotient] = matchList[i].targetId; + minHammingForQuotient[currQuotient] = hamming; + } else if (hamming == minHammingForQuotient[currQuotient]) { + bestMatchTaxIdForQuotient[currQuotient] = taxonomy->LCA( + bestMatchTaxIdForQuotient[currQuotient], matchList[i].targetId); } - i++; } - taxCnt[minHammingTaxId]++; + } + + for (size_t i = 0; i <= maxQuotient; ++i) { + if (bestMatchForQuotient[i] != nullptr) { + taxCnt[bestMatchTaxIdForQuotient[i]]++; + } } } @@ -277,7 +295,7 @@ TaxID Taxonomer::BFS(const unordered_map & cladeCnt, TaxID r } } -TaxonScore Taxonomer::getBestSpeciesMatches(vector & speciesMatches, +TaxonScore Taxonomer::getBestSpeciesMatches(std::pair & bestSpeciesRange, const Match *matchList, size_t end, size_t offset, @@ -289,7 +307,6 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector & speciesMatc speciesCombPathIdx.clear(); speciesScores.clear(); - pair bestSpeciesRange; TaxonScore bestScore; float bestSpScore = 0; size_t i = offset; @@ -364,11 +381,6 @@ TaxonScore Taxonomer::getBestSpeciesMatches(vector & speciesMatc // One species bestScore.taxId = maxSpecies[0]; - speciesMatches.reserve(bestSpeciesRange.second - bestSpeciesRange.first + 1); - - for (size_t j = bestSpeciesRange.first; j < bestSpeciesRange.second; j++) { - speciesMatches.push_back(&matchList[j]); - } return bestScore; } @@ -653,4 +665,19 @@ bool Taxonomer::isConsecutive_diffFrame(const Match * match1, const Match * matc // match1 87654321 -> 08765432 // match2 98765432 -> 08765432 return (match1->hamming - GET_2_BITS(match1->rightEndHamming)) == (match2->hamming - GET_2_BITS(match2->rightEndHamming >> 14)); +} + + +void Taxonomer::ensureArraySize(size_t newSize) { + if (newSize > arraySize_filterRedundantMatches) { + delete[] bestMatchForQuotient; + delete[] bestMatchTaxIdForQuotient; + delete[] minHammingForQuotient; + bestMatchForQuotient = new const Match*[newSize](); + bestMatchTaxIdForQuotient = new TaxID[newSize](); + minHammingForQuotient = new uint8_t[newSize]; + arraySize_filterRedundantMatches = newSize; + } + std::memset(bestMatchForQuotient, 0, newSize * sizeof(const Match*)); + std::memset(minHammingForQuotient, std::numeric_limits::max(), newSize * sizeof(uint8_t)); } \ No newline at end of file diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h index ac5c2abe..29f3f36c 100644 --- a/src/commons/Taxonomer.h +++ b/src/commons/Taxonomer.h @@ -72,7 +72,7 @@ class Taxonomer { // Internal int denominator; - vector speciesMatches; + // vector speciesMatches; // chooseBestTaxon unordered_map taxCnt; @@ -95,6 +95,13 @@ class Taxonomer { // lowerRankClassification unordered_map cladeCnt; + // filterRedundantMatches + const Match **bestMatchForQuotient; + TaxID *bestMatchTaxIdForQuotient; + uint8_t *minHammingForQuotient; + size_t arraySize_filterRedundantMatches; + + // Output unordered_map taxCounts; @@ -106,6 +113,8 @@ class Taxonomer { unordered_map &match2depthScore, float score, int hammingDist); + void ensureArraySize(size_t newSize); + public: Taxonomer(const LocalParameters & par, NcbiTaxonomy * taxonomy); @@ -143,18 +152,19 @@ class Taxonomer { void trimMatchPath(MatchPath & path1, const MatchPath & path2, int overlapLength); - void filterRedundantMatches(vector & speciesMatches, - unordered_map & taxCnt); + void filterRedundantMatches(const Match *matchList, + const std::pair & bestSpeciesRange, + unordered_map & taxCnt, + int queryLength); static bool isConsecutive(const Match * match1, const Match * match2); static bool isConsecutive_diffFrame(const Match * match1, const Match * match2); - TaxonScore getBestSpeciesMatches(vector &speciesMatches, const Match *matchList, size_t end, + TaxonScore getBestSpeciesMatches(std::pair & bestSpeciesRange, + const Match *matchList, size_t end, size_t offset, int queryLength); - // TaxonScore getBestSpeciesMatches(vector &speciesMatches, const Match *matchList, size_t end, - // size_t offset, Query & currentQuery); // TaxonScore getBestGenusMatches_spaced(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, // int readLength1, int readLength2); // TaxonScore getBestGenusMatches_spaced(vector &matchesForMajorityLCA, const Match *matchList, size_t end, size_t offset, From ac245735ba209d73943ae08a961c73ab52bcd6ea Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Wed, 28 Aug 2024 18:35:57 +0900 Subject: [PATCH 21/21] Now single thread is okay. Minor optimizations in remainConsecutiveMatches --- src/commons/KmerExtractor.cpp | 313 ++++++++++++++++++++-------------- src/commons/KmerExtractor.h | 24 +++ src/commons/KmerMatcher.cpp | 8 +- src/commons/Taxonomer.cpp | 25 ++- src/commons/Taxonomer.h | 1 + src/commons/common.h | 7 +- src/util/apply-threshold.cpp | 6 +- src/util/binning2report.cpp | 2 +- 8 files changed, 241 insertions(+), 145 deletions(-) diff --git a/src/commons/KmerExtractor.cpp b/src/commons/KmerExtractor.cpp index 0559748c..593ebe16 100644 --- a/src/commons/KmerExtractor.cpp +++ b/src/commons/KmerExtractor.cpp @@ -65,15 +65,24 @@ void KmerExtractor::fillQueryKmerBufferParallel(KSeqWrapper *kseq, } } + // Vector to check empty reads + std::vector> emptyReads(par.threads); + for (int i = 0; i < par.threads; ++i) { + emptyReads[i].resize(chunkSize); + for (size_t j = 0; j < chunkSize; ++j) { + emptyReads[i][j] = false; + } + } + // Initialize atomic variable for active tasks std::vector> busyThreads(par.threads); - for (size_t i = 0; i < par.threads; ++i) { + for (int i = 0; i < par.threads; ++i) { busyThreads[i].store(false); } // OpenMP parallel region with tasks #pragma omp parallel default(none) shared(par, readLength, kmerBuffer, \ -queryList, currentSplit, processedQueryNum, kseq, chunkSize, chunkReads_thread, busyThreads) +queryList, currentSplit, processedQueryNum, kseq, chunkSize, chunkReads_thread, busyThreads, emptyReads) { char *maskedSeq = new char[readLength]; char *seq = nullptr; @@ -89,11 +98,15 @@ queryList, currentSplit, processedQueryNum, kseq, chunkSize, chunkReads_thread, while (processedQueryNum < currentSplit.readCnt) { // Find an idle thread int threadId; - for (threadId = 0; threadId < par.threads; ++threadId) { - if (threadId == masterThread) { continue; } - if (!busyThreads[threadId].load()) { - busyThreads[threadId].store(true); - break; + if (par.threads == 1) { + threadId = 0; + } else { + for (threadId = 0; threadId < par.threads; ++threadId) { + if (threadId == masterThread) { continue; } + if (!busyThreads[threadId].load()) { + busyThreads[threadId].store(true); + break; + } } } @@ -104,52 +117,24 @@ queryList, currentSplit, processedQueryNum, kseq, chunkSize, chunkReads_thread, size_t chunkEnd = min(processedQueryNum + chunkSize, currentSplit.readCnt); count = 0; - for (size_t i = 0; i < chunkSize && processedQueryNum < chunkEnd; ++i, ++processedQueryNum) { - kseq->ReadEntry(); - queryList[processedQueryNum].name = string(kseq->entry.name.s); - queryList[processedQueryNum].queryLength = LocalUtil::getMaxCoveredLength((int) kseq->entry.sequence.l); - - // Check if the read is too short - int kmerCnt = LocalUtil::getQueryKmerNumber((int) kseq->entry.sequence.l, spaceNum); - if (kmerCnt < 1) { - chunkReads_thread[threadId][i] = ""; - queryList[processedQueryNum].kmerCnt = 0; - } else { - chunkReads_thread[threadId][i] = string(kseq->entry.sequence.s); - queryList[processedQueryNum].kmerCnt = kmerCnt; - } - count++; - } - - // Process each chunk by idle threads - #pragma omp task firstprivate(count, processedQueryNum, threadId) - { - for (size_t i = 0; i < count; ++i) { - size_t queryIdx = processedQueryNum - count + i; - if (chunkReads_thread[threadId][i].empty()) { continue; } - - // Get masked sequence - if (maskMode) { - if (readLength < chunkReads_thread[threadId][i].length() + 1) { - readLength = chunkReads_thread[threadId][i].length() + 1; - delete[] maskedSeq; - maskedSeq = new char[readLength]; - } - SeqIterator::maskLowComplexityRegions(chunkReads_thread[threadId][i].c_str(), maskedSeq, *probMatrix, maskProb, subMat); - seq = maskedSeq; - } else { - seq = const_cast(chunkReads_thread[threadId][i].c_str()); - } - - size_t posToWrite = kmerBuffer.reserveMemory(queryList[queryIdx].kmerCnt); - - // Process Read 1 - seqIterator->sixFrameTranslation(seq, (int) chunkReads_thread[threadId][i].length(), aaFrames_read); - seqIterator->fillQueryKmerBuffer(seq, (int) chunkReads_thread[threadId][i].length(), kmerBuffer, posToWrite, - (uint32_t) queryIdx, aaFrames_read); + loadChunkOfReads(kseq, + queryList, + processedQueryNum, + chunkSize, + chunkEnd, + chunkReads_thread[threadId], + emptyReads[threadId], + count, + false); + if (par.threads == 1) { + processSequence(count, processedQueryNum, chunkReads_thread[threadId], emptyReads[threadId], seq, maskedSeq, readLength, kmerBuffer, queryList, aaFrames_read, false); + } else { + #pragma omp task firstprivate(count, processedQueryNum, threadId) + { + processSequence(count, processedQueryNum, chunkReads_thread[threadId], emptyReads[threadId], seq, maskedSeq, readLength, kmerBuffer, queryList, aaFrames_read, false); + busyThreads[threadId].store(false); } - busyThreads[threadId].store(false); - } + } } } delete[] maskedSeq; @@ -168,7 +153,7 @@ void KmerExtractor::fillQueryKmerBufferParallel_paired(KSeqWrapper *kseq1, // Reserve memory for each read of the chunk for each thread std::vector> chunkReads1_thread(par.threads); std::vector> chunkReads2_thread(par.threads); - for (size_t i = 0; i < par.threads; ++i) { + for (int i = 0; i < par.threads; ++i) { chunkReads1_thread[i].resize(chunkSize); chunkReads2_thread[i].resize(chunkSize); for (size_t j = 0; j < chunkSize; ++j) { @@ -176,20 +161,30 @@ void KmerExtractor::fillQueryKmerBufferParallel_paired(KSeqWrapper *kseq1, chunkReads2_thread[i][j].reserve(readLength); } } + // Vector to check empty reads + std::vector> emptyReads(par.threads); + for (int i = 0; i < par.threads; ++i) { + emptyReads[i].resize(chunkSize); + for (size_t j = 0; j < chunkSize; ++j) { + emptyReads[i][j] = false; + } + } // Initialize atomic variable for active tasks std::vector> busyThreads(par.threads); - for (size_t i = 0; i < par.threads; ++i) { + for (int i = 0; i < par.threads; ++i) { busyThreads[i].store(false); } // OpenMP parallel region with tasks #pragma omp parallel default(none) shared(par, readLength, \ kmerBuffer, queryList, currentSplit, processedQueryNum, kseq1, kseq2, \ -chunkSize, chunkReads1_thread, chunkReads2_thread, busyThreads, cout) +chunkSize, chunkReads1_thread, chunkReads2_thread, busyThreads, cout, emptyReads) { - char *maskedSeq1 = new char[readLength]; - char *maskedSeq2 = new char[readLength]; + size_t maxReadLength1 = 1000; + size_t maxReadLength2 = 1000; + char *maskedSeq1 = new char[maxReadLength1]; + char *maskedSeq2 = new char[maxReadLength2]; char *seq1 = nullptr; char *seq2 = nullptr; vector aaFrames_read1[6]; @@ -202,91 +197,159 @@ chunkSize, chunkReads1_thread, chunkReads2_thread, busyThreads, cout) #pragma omp single nowait { int masterThread = omp_get_thread_num(); - size_t count = 0; + while (processedQueryNum < currentSplit.readCnt) { // Find an idle thread int threadId; - for (threadId = 0; threadId < par.threads; ++threadId) { - if (threadId == masterThread) { continue; } - if (!busyThreads[threadId].load()) { - busyThreads[threadId].store(true); - break; + if (par.threads == 1) { + threadId = 0; + } else { + for (threadId = 0; threadId < par.threads; ++threadId) { + if (threadId == masterThread) { continue; } + if (!busyThreads[threadId].load()) { + busyThreads[threadId].store(true); + break; + } } } - if (threadId == par.threads) { continue; } size_t chunkEnd = min(processedQueryNum + chunkSize, currentSplit.readCnt); + + size_t count = 0; + size_t processedQueryNumCopy = processedQueryNum; + loadChunkOfReads(kseq1, + queryList, + processedQueryNum, + chunkSize, + chunkEnd, + chunkReads1_thread[threadId], + emptyReads[threadId], + count, + false); + count = 0; + processedQueryNum = processedQueryNumCopy; + loadChunkOfReads(kseq2, + queryList, + processedQueryNum, + chunkSize, + chunkEnd, + chunkReads2_thread[threadId], + emptyReads[threadId], + count, + true); - for (size_t i = 0; i < chunkSize && processedQueryNum < chunkEnd; ++i, ++processedQueryNum) { - kseq1->ReadEntry(); - kseq2->ReadEntry(); - queryList[processedQueryNum].name = string(kseq1->entry.name.s); - queryList[processedQueryNum].queryLength = LocalUtil::getMaxCoveredLength((int) kseq1->entry.sequence.l); - queryList[processedQueryNum].queryLength2 = LocalUtil::getMaxCoveredLength((int) kseq2->entry.sequence.l); - - // Check if the read is too short - int kmerCnt1 = LocalUtil::getQueryKmerNumber((int) kseq1->entry.sequence.l, spaceNum); - int kmerCnt2 = LocalUtil::getQueryKmerNumber((int) kseq2->entry.sequence.l, spaceNum); - if (kmerCnt1 < 1 || kmerCnt2 < 1) { - chunkReads1_thread[threadId][i] = ""; - chunkReads2_thread[threadId][i] = ""; - queryList[processedQueryNum].kmerCnt = 0; - } else { - // cout << threadId << endl - chunkReads1_thread[threadId][i] = string(kseq1->entry.sequence.s); - chunkReads2_thread[threadId][i] = string(kseq2->entry.sequence.s); - queryList[processedQueryNum].kmerCnt = kmerCnt1 + kmerCnt2; + if (par.threads == 1) { + processSequence(count, processedQueryNum, chunkReads1_thread[threadId], emptyReads[threadId], seq1, maskedSeq1, maxReadLength1, kmerBuffer, queryList, aaFrames_read1, false); + processSequence(count, processedQueryNum, chunkReads2_thread[threadId], emptyReads[threadId], seq2, maskedSeq2, maxReadLength2, kmerBuffer, queryList, aaFrames_read2, true); + } + else { + #pragma omp task firstprivate(count, processedQueryNum, threadId) + { + processSequence(count, processedQueryNum, chunkReads1_thread[threadId], emptyReads[threadId], seq1, maskedSeq1, maxReadLength1, kmerBuffer, queryList, aaFrames_read1, false); + processSequence(count, processedQueryNum, chunkReads2_thread[threadId], emptyReads[threadId], seq2, maskedSeq2, maxReadLength2, kmerBuffer, queryList, aaFrames_read2, true); + busyThreads[threadId].store(false); } - count++; - } - + } + } + } + delete[] maskedSeq1; + delete[] maskedSeq2; + } +} - // Process each chunk by idle threads - #pragma omp task firstprivate(count, processedQueryNum, threadId) - { - for (size_t i = 0; i < count; ++i) { - size_t queryIdx = processedQueryNum - count + i; - if (chunkReads1_thread[threadId][i].empty() || chunkReads2_thread[threadId][i].empty()) { continue; } - - // Get masked sequence - if (maskMode) { - if (readLength < chunkReads1_thread[threadId][i].length() + 1 || readLength < chunkReads2_thread[threadId][i].length() + 1) { - readLength = max(chunkReads1_thread[threadId][i].length() + 1, chunkReads2_thread[threadId][i].length() + 1); - delete[] maskedSeq1; - delete[] maskedSeq2; - maskedSeq1 = new char[readLength]; - maskedSeq2 = new char[readLength]; - } - SeqIterator::maskLowComplexityRegions(chunkReads1_thread[threadId][i].c_str(),maskedSeq1, *probMatrix, maskProb, subMat); - SeqIterator::maskLowComplexityRegions(chunkReads2_thread[threadId][i].c_str(),maskedSeq2, *probMatrix, maskProb, subMat); - seq1 = maskedSeq1; - seq2 = maskedSeq2; - } else { - seq1 = const_cast(chunkReads1_thread[threadId][i].c_str()); - seq2 = const_cast(chunkReads2_thread[threadId][i].c_str()); - } +void KmerExtractor::processSequence(size_t count, + size_t processedQueryNum, + const vector & reads, + const vector & emptyReads, + char *seq, + char *maskedSeq, + size_t & maxReadLength, + QueryKmerBuffer &kmerBuffer, + const vector & queryList, + vector *aaFrames, + bool isReverse) { + for (size_t i = 0; i < count; ++i) { + size_t queryIdx = processedQueryNum - count + i; + if (emptyReads[i]) { continue; } + // Get masked sequence + if (maskMode) { + if (maxReadLength < reads[i].length() + 1) { + maxReadLength = reads[i].length() + 1; + delete[] maskedSeq; + maskedSeq = new char[maxReadLength]; + } + SeqIterator::maskLowComplexityRegions(reads[i].c_str(), maskedSeq, *probMatrix, maskProb, subMat); + seq = maskedSeq; + } else { + seq = const_cast(reads[i].c_str()); + } - size_t posToWrite = kmerBuffer.reserveMemory(queryList[queryIdx].kmerCnt); - - // Process Read 1 - seqIterator->sixFrameTranslation(seq1, (int) chunkReads1_thread[threadId][i].length(), aaFrames_read1); - seqIterator->fillQueryKmerBuffer(seq1, (int) chunkReads1_thread[threadId][i].length(), kmerBuffer, posToWrite, - (uint32_t) queryIdx, aaFrames_read1); + size_t posToWrite = 0; + if (isReverse) { + posToWrite = kmerBuffer.reserveMemory(queryList[queryIdx].kmerCnt2); + seqIterator->sixFrameTranslation(seq, (int) reads[i].length(), aaFrames); + seqIterator->fillQueryKmerBuffer(seq, (int) reads[i].length(), kmerBuffer, posToWrite, + (uint32_t) queryIdx, aaFrames, queryList[queryIdx].queryLength+3); + } else { + posToWrite = kmerBuffer.reserveMemory(queryList[queryIdx].kmerCnt); + seqIterator->sixFrameTranslation(seq, (int) reads[i].length(), aaFrames); + seqIterator->fillQueryKmerBuffer(seq, (int) reads[i].length(), kmerBuffer, posToWrite, + (uint32_t) queryIdx, aaFrames); + } + } +} - // Process Read 2 - seqIterator->sixFrameTranslation(seq2, (int) chunkReads2_thread[threadId][i].length(), aaFrames_read2); - seqIterator->fillQueryKmerBuffer(seq2, (int) chunkReads2_thread[threadId][i].length(), kmerBuffer, posToWrite, - (uint32_t) queryIdx, aaFrames_read2, queryList[queryIdx].queryLength+3); - } - busyThreads[threadId].store(false); - } +void KmerExtractor::loadChunkOfReads(KSeqWrapper *kseq, + vector & queryList, + size_t & processedQueryNum, + size_t chunkSize, + size_t chunkEnd, + vector & reads, + vector & emptyReads, + size_t & count, + bool isReverse) { + if (isReverse) { + for (size_t i = 0; i < chunkSize && processedQueryNum < chunkEnd; ++i, ++processedQueryNum) { + kseq->ReadEntry(); + queryList[processedQueryNum].queryLength2 = LocalUtil::getMaxCoveredLength((int) kseq->entry.sequence.l); + + if (emptyReads[i]) { continue; } + + // Check if the read is too short + int kmerCnt = LocalUtil::getQueryKmerNumber((int) kseq->entry.sequence.l, spaceNum); + if (kmerCnt < 1) { + reads[i] = ""; + emptyReads[i] = true; + queryList[processedQueryNum].kmerCnt2 = 0; + } else { + reads[i] = string(kseq->entry.sequence.s); + // emptyReads[i] = false; // already set to false + queryList[processedQueryNum].kmerCnt2 = kmerCnt; } + count++; + } + } else { + for (size_t i = 0; i < chunkSize && processedQueryNum < chunkEnd; ++i, ++processedQueryNum) { + kseq->ReadEntry(); + queryList[processedQueryNum].name = string(kseq->entry.name.s); + queryList[processedQueryNum].queryLength = LocalUtil::getMaxCoveredLength((int) kseq->entry.sequence.l); + + // Check if the read is too short + int kmerCnt = LocalUtil::getQueryKmerNumber((int) kseq->entry.sequence.l, spaceNum); + if (kmerCnt < 1) { + reads[i] = ""; + emptyReads[i] = true; + queryList[processedQueryNum].kmerCnt = 0; + } else { + reads[i] = string(kseq->entry.sequence.s); + emptyReads[i] = false; + queryList[processedQueryNum].kmerCnt = kmerCnt; + } + count++; } - delete[] maskedSeq1; - delete[] maskedSeq2; } } \ No newline at end of file diff --git a/src/commons/KmerExtractor.h b/src/commons/KmerExtractor.h index a2699139..650f03f3 100644 --- a/src/commons/KmerExtractor.h +++ b/src/commons/KmerExtractor.h @@ -32,6 +32,28 @@ class KmerExtractor { vector &queryList, const QuerySplit & currentSplit, const LocalParameters &par); + + void loadChunkOfReads(KSeqWrapper *kseq, + vector & queryList, + size_t & processedQueryNum, + size_t chunkSize, + size_t chunkEnd, + vector & reads, + vector & emptyReads, + size_t & count, + bool isReverse); + + void processSequence(size_t count, + size_t processedQueryNum, + const vector & reads, + const vector & emptyReads, + char *seq, + char *maskedSeq, + size_t & maxReadLength, + QueryKmerBuffer &kmerBuffer, + const vector & queryList, + vector *aaFrames, + bool isReverse); public: explicit KmerExtractor(const LocalParameters & par); @@ -44,6 +66,8 @@ class KmerExtractor { KSeqWrapper* kseq2 = nullptr); + + }; static inline bool compareForLinearSearch(const QueryKmer &a, const QueryKmer &b) { diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp index 46bdae08..9f9a2e9a 100644 --- a/src/commons/KmerMatcher.cpp +++ b/src/commons/KmerMatcher.cpp @@ -210,23 +210,23 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI size_t kmerInfoBufferIdx = 0; size_t diffIdxBufferIdx = 0; - //query variables + // Query variables uint64_t currentQuery = UINT64_MAX; uint64_t currentQueryAA = UINT64_MAX; QueryKmerInfo currentQueryInfo; - //target variables + // Target variables size_t diffIdxPos = 0; std::vector candidateTargetKmers; //vector for candidate target k-mer, some of which are selected after based on hamming distance std::vector candidateKmerInfos; uint64_t currentTargetKmer; - //Match buffer for each thread + // Match buffer for each thread int localBufferSize = 2'000'000; // 32 Mb auto *matches = new Match[localBufferSize]; // 16 * 2'000'000 = 32 Mb int matchCnt = 0; - //vectors for selected target k-mers + // Vectors for selected target k-mers std::vector selectedHammingSum; std::vector selectedMatches; std::vector selectedHammings; diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index 911b975d..85e537e0 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -344,7 +344,6 @@ TaxonScore Taxonomer::getBestSpeciesMatches(std::pair & bestSpec bestSpeciesRange = make_pair(start, i); } } - // matchPaths.clear(); } // If there are no meaningful species @@ -468,6 +467,12 @@ void Taxonomer::remainConsecutiveMatches(const Match * matchList, linkedMatchKeys.clear(); linkedMatchValues.clear(); linkedMatchValuesIdx.clear(); + + linkedMatchKeys.resize(end - start); + linkedMatchValuesIdx.resize(end - start); + size_t linkedMatchKeysIdx = 0; + + size_t currPos = matchList[start].qInfo.pos; uint64_t frame = matchList[start].qInfo.frame; @@ -499,8 +504,8 @@ void Taxonomer::remainConsecutiveMatches(const Match * matchList, } } if (found) { - linkedMatchKeys.push_back(matchList + curIdx); - linkedMatchValuesIdx.push_back(startIdx); + linkedMatchKeys[linkedMatchKeysIdx] = matchList + curIdx; + linkedMatchValuesIdx[linkedMatchKeysIdx++] = startIdx; } } } @@ -538,8 +543,8 @@ void Taxonomer::remainConsecutiveMatches(const Match * matchList, } } if (found) { - linkedMatchKeys.push_back(matchList + curIdx); - linkedMatchValuesIdx.push_back(startIdx); + linkedMatchKeys[linkedMatchKeysIdx] = matchList + curIdx; + linkedMatchValuesIdx[linkedMatchKeysIdx++] = startIdx; } } } @@ -559,16 +564,17 @@ void Taxonomer::remainConsecutiveMatches(const Match * matchList, match2depthScore.clear(); - for (size_t k = 0; k < linkedMatchKeys.size(); k++) { + for (size_t k = 0; k < linkedMatchKeysIdx; k++) { const Match * key = linkedMatchKeys[k]; size_t startIdx = linkedMatchValuesIdx[k]; - size_t endIdx = (k + 1 < linkedMatchKeys.size()) ? linkedMatchValuesIdx[k + 1] : linkedMatchValues.size(); + size_t endIdx = (k + 1 < linkedMatchKeysIdx) ? linkedMatchValuesIdx[k + 1] : linkedMatchValues.size(); if (match2depthScore.find(key) == match2depthScore.end()) { depthScore bestPath{}; for (size_t j = startIdx; j < endIdx; j++) { depthScore curPath = DFS(linkedMatchValues[j], linkedMatchKeys, + linkedMatchKeysIdx, linkedMatchValues, linkedMatchValuesIdx, 1, @@ -595,6 +601,7 @@ void Taxonomer::remainConsecutiveMatches(const Match * matchList, depthScore Taxonomer::DFS( const Match *curMatch, const vector &linkedMatchesKeys, + size_t linkedMatchKeysIdx, const vector &linkedMatchesValues, const vector &linkedMatchesIndices, size_t depth, size_t MIN_DEPTH, @@ -620,7 +627,7 @@ depthScore Taxonomer::DFS( } else { // Not a leaf node size_t index = it - linkedMatchesKeys.begin(); size_t startIdx = linkedMatchesIndices[index]; - size_t endIdx = (index + 1 < linkedMatchesIndices.size()) ? linkedMatchesIndices[index + 1] : linkedMatchesValues.size(); + size_t endIdx = (index + 1 < linkedMatchKeysIdx) ? linkedMatchesIndices[index + 1] : linkedMatchesValues.size(); uint8_t lastEndHamming = (curMatch->rightEndHamming >> 14); if (lastEndHamming == 0) { @@ -637,7 +644,7 @@ depthScore Taxonomer::DFS( returnDepthScore.hammingDist + hammingDist + lastEndHamming, returnDepthScore.endMatch); } else { - curDepthScore = DFS(nextMatch, linkedMatchesKeys, linkedMatchesValues, linkedMatchesIndices, depth, MIN_DEPTH, match2depthScore, score, hammingDist + lastEndHamming); + curDepthScore = DFS(nextMatch, linkedMatchesKeys, linkedMatchKeysIdx, linkedMatchesValues, linkedMatchesIndices, depth, MIN_DEPTH, match2depthScore, score, hammingDist + lastEndHamming); } if (curDepthScore.score > bestDepthScore.score && curDepthScore.depth > MIN_DEPTH) { bestDepthScore = curDepthScore; diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h index 29f3f36c..5c84c8c1 100644 --- a/src/commons/Taxonomer.h +++ b/src/commons/Taxonomer.h @@ -107,6 +107,7 @@ class Taxonomer { depthScore DFS(const Match *curMatch, const vector &linkedMatchesKeys, + size_t linkedMatchKeysIdx, const vector &linkedMatchesValues, const vector &linkedMatchesIndices, size_t depth, size_t MIN_DEPTH, diff --git a/src/commons/common.h b/src/commons/common.h index 2961060d..16f6ff09 100644 --- a/src/commons/common.h +++ b/src/commons/common.h @@ -48,6 +48,7 @@ struct Query{ int queryLength; int queryLength2; int kmerCnt; + int kmerCnt2; bool isClassified; bool newSpecies; // 36 byte @@ -58,13 +59,13 @@ struct Query{ bool operator==(int id) const { return queryId == id;} Query(int queryId, int classification, float score, float coverage, int hammingDist, int queryLength, - int queryLength2, int kmerCnt, bool isClassified, bool newSpecies, std::string name) + int queryLength2, int kmerCnt, int kmerCnt2, bool isClassified, bool newSpecies, std::string name) : queryId(queryId), classification(classification), score(score), coverage(coverage), - hammingDist(hammingDist), queryLength(queryLength), queryLength2(queryLength2), kmerCnt(kmerCnt), + hammingDist(hammingDist), queryLength(queryLength), queryLength2(queryLength2), kmerCnt(kmerCnt), kmerCnt2(kmerCnt2), isClassified(isClassified), newSpecies(newSpecies), name(std::move(name)) {} Query() : queryId(0), classification(0), score(0), coverage(0), hammingDist(0), queryLength(0), - queryLength2(0), kmerCnt(0), isClassified(false), newSpecies(false) {} + queryLength2(0), kmerCnt(0), kmerCnt2(0), isClassified(false), newSpecies(false) {} }; template diff --git a/src/util/apply-threshold.cpp b/src/util/apply-threshold.cpp index 4b3a2d49..f54c52c3 100644 --- a/src/util/apply-threshold.cpp +++ b/src/util/apply-threshold.cpp @@ -61,7 +61,7 @@ int applyThreshold(int argc, const char **argv, const Command &command) { // Low coverage or low score if (stof(columns[par.coverageCol]) < par.minCoverage || stof(columns[par.scoreCol]) < par.minScore) { newResults.emplace_back(lineCnt, 0, stof(columns[par.scoreCol]), stof(columns[par.coverageCol]), stoi(columns[6]), - stoi(columns[3]),0, 0, false, false, columns[1]); + stoi(columns[3]),0, 0, 0, false, false, columns[1]); //int queryId, int classification, float score, float coverage, int hammingDist, int queryLength, // int queryLength2, int kmerCnt, bool isClassified, bool newSpecies, std::string name taxonCounts[0]++; @@ -70,11 +70,11 @@ int applyThreshold(int argc, const char **argv, const Command &command) { else if (stof(columns[par.scoreCol]) < par.minSpScore && ncbiTaxonomy.taxonNode(ncbiTaxonomy.getTaxIdAtRank(stoi(columns[2]), "species"))->rankIdx == 4) { TaxID parentTaxId = ncbiTaxonomy.taxonNode(ncbiTaxonomy.getTaxIdAtRank(stoi(columns[2]), "species"))->parentTaxId; newResults.emplace_back(lineCnt, parentTaxId, stof(columns[4]), stof(columns[5]), stoi(columns[6]), - stoi(columns[3]),0, 0, true, false, columns[1]); + stoi(columns[3]),0, 0, 0, true, false, columns[1]); taxonCounts[parentTaxId]++; } else { newResults.emplace_back(lineCnt, stoi(columns[2]), stof(columns[4]), stof(columns[5]), stoi(columns[6]), - stoi(columns[3]),0, 0, stoi(columns[0]), false, columns[1]); + stoi(columns[3]),0, 0, 0, stoi(columns[0]), false, columns[1]); taxonCounts[stoi(columns[2])]++; } lineCnt++; diff --git a/src/util/binning2report.cpp b/src/util/binning2report.cpp index 33920ed1..bf55ccb6 100644 --- a/src/util/binning2report.cpp +++ b/src/util/binning2report.cpp @@ -44,7 +44,7 @@ int binning2report(int argc, const char **argv, const Command &command){ columns.push_back(eachItem); } binnings.emplace_back(lineCnt, stoi(columns[par.taxidCol]), 0, 0, 0, - 0,0, 0, stoi(columns[par.taxidCol]), false, columns[par.readIdCol]); + 0, 0, 0, 0, stoi(columns[par.taxidCol]), false, columns[par.readIdCol]); taxonCounts[stoi(columns[par.taxidCol])]++; lineCnt++; }