diff --git a/src/LocalCommandDeclarations.h b/src/LocalCommandDeclarations.h index ff5ae5c8..689d039a 100644 --- a/src/LocalCommandDeclarations.h +++ b/src/LocalCommandDeclarations.h @@ -15,5 +15,7 @@ extern int binning2report(int argc, const char **argv, const Command& command); extern int filterByGenus(int argc, const char **argv, const Command& command); extern int databaseReport(int argc, const char **argv, const Command& command); extern int mapping2taxon(int argc, const char **argv, const Command& command); +extern int expand_diffidx(int argc, const char **argv, const Command& command); +extern int makeAAoffset(int argc, const char **argv, const Command& command); #endif //ADCLASSIFIER2_LOCALCOMMANDDECLARATIONS_H diff --git a/src/commons/FileMerger.cpp b/src/commons/FileMerger.cpp index 5249c59b..f2b7a947 100644 --- a/src/commons/FileMerger.cpp +++ b/src/commons/FileMerger.cpp @@ -20,164 +20,6 @@ FileMerger::~FileMerger() { delete taxonomy; } -//void FileMerger::mergeTargetFiles(std::vector diffIdxFileNames, std::vector infoFileNames, vector & taxIdListAtRank, vector & taxIdList) { -// size_t writtenKmerCnt = 0; -// -// ///Files to write on & buffers to fill them -// FILE * mergedDiffFile = fopen(mergedDiffFileName, "wb"); -// FILE * mergedInfoFile = fopen(mergedInfoFileName, "wb"); -// FILE * diffIdxSplitFile = fopen(diffIdxSplitFileName, "wb"); -// uint16_t * diffBuffer = (uint16_t *)malloc(sizeof(uint16_t) * kmerBufSize); -// size_t diffBufferIdx = 0; -// size_t totalBufferIdx = 0; -// TargetKmerInfo * infoBuffer = (TargetKmerInfo *)malloc(sizeof(TargetKmerInfo) * kmerBufSize); -// size_t infoBufferIdx = 0; -// size_t totalInfoIdx = 0; -// -// ///Prepare files to merge -// size_t numOfSplitFiles = diffIdxFileNames.size(); -// size_t numOfincompletedFiles = numOfSplitFiles; -// size_t numOfKmerBeforeMerge = 0; -// uint64_t * lookingKmers = new uint64_t[numOfSplitFiles]; -//// uint64_t lookingKmers[numOfSplitFiles]; -//// TargetKmerInfo lookingInfos[numOfSplitFiles]; -// auto * lookingInfos = new TargetKmerInfo[numOfSplitFiles]; -// //size_t diffFileIdx[numOfSplitFiles]; -// auto * diffFileIdx = new size_t[numOfSplitFiles]; -// memset(diffFileIdx, 0, numOfSplitFiles * sizeof(size_t)); -// auto * infoFileIdx = new size_t[numOfSplitFiles]; -//// size_t infoFileIdx[numOfSplitFiles]; -// memset(infoFileIdx, 0, numOfSplitFiles * sizeof(size_t)); -// size_t maxIdxOfEachFiles[numOfSplitFiles]; -// struct MmapedData *diffFileList = new struct MmapedData[numOfSplitFiles]; -// struct MmapedData *infoFileList = new struct MmapedData[numOfSplitFiles]; -// for (size_t file = 0; file < numOfSplitFiles; file++) { -// diffFileList[file] = mmapData(diffIdxFileNames[file]); -// infoFileList[file] = mmapData(infoFileNames[file]); -// maxIdxOfEachFiles[file] = diffFileList[file].fileSize / sizeof(uint16_t); -// numOfKmerBeforeMerge += infoFileList[file].fileSize / sizeof(TargetKmerInfo); -// } -// -// ///To make differential index splits -// uint64_t AAofTempSplitOffset = UINT64_MAX; -// size_t sizeOfSplit = numOfKmerBeforeMerge / (SplitNum - 1); -// size_t offsetList[SplitNum + 1]; -// int offsetListIdx = 1; -// for(size_t os = 0; os < SplitNum; os++){ -// offsetList[os] = os * sizeOfSplit; -// } -// offsetList[SplitNum] = UINT64_MAX; -// -// DiffIdxSplit splitList[SplitNum]; -// memset(splitList, 0, sizeof(DiffIdxSplit) * SplitNum); -// int splitListIdx = 1; -// -// /// get the first k-mer to write -// for(size_t file = 0; file < numOfSplitFiles; file++){ -// lookingKmers[file] = getNextKmer(0, diffFileList[file], diffFileIdx[file]); -// lookingInfos[file] = infoFileList[file].data[0]; -// infoFileIdx[file] ++; -// } -// -// size_t idxOfMin = smallest(lookingKmers, lookingInfos, taxIdListAtRank, numOfSplitFiles); -// uint64_t lastWrittenKmer = 0; -// uint64_t entryKmer = lookingKmers[idxOfMin]; -// TargetKmerInfo entryInfo = lookingInfos[idxOfMin]; -// -// // write first k-mer -// getDiffIdx(lastWrittenKmer, entryKmer, mergedDiffFile, diffBuffer, diffBufferIdx, totalBufferIdx); -// lastWrittenKmer = entryKmer; -// writeInfo(&entryInfo, mergedInfoFile, infoBuffer, infoBufferIdx, totalInfoIdx); -// writtenKmerCnt++; -// int splitCheck = 0; -// int endFlag = 0; -// -// while(true){ -// // update entry k-mer -// entryKmer = lookingKmers[idxOfMin]; -// entryInfo = lookingInfos[idxOfMin]; -// -// ///update looking k-mers -// lookingKmers[idxOfMin] = getNextKmer(entryKmer, diffFileList[idxOfMin], diffFileIdx[idxOfMin]); -// lookingInfos[idxOfMin] = infoFileList[idxOfMin].data[infoFileIdx[idxOfMin]]; -// infoFileIdx[idxOfMin] ++; -// if( diffFileIdx[idxOfMin] > maxIdxOfEachFiles[idxOfMin] ){ -// lookingKmers[idxOfMin] = UINT64_MAX; -// numOfincompletedFiles--; -// if(numOfincompletedFiles == 0) break; -// } -// idxOfMin = smallest(lookingKmers, lookingInfos, taxIdListAtRank, numOfSplitFiles); -// -// int hasSeenOtherStrains = 0; -// while(taxIdListAtRank[entryInfo.sequenceID] == taxIdListAtRank[lookingInfos[idxOfMin].sequenceID]){ -// if(entryKmer != lookingKmers[idxOfMin]) break; -// -// hasSeenOtherStrains += (taxIdList[entryInfo.sequenceID] != taxIdList[lookingInfos[idxOfMin].sequenceID]); -// -// lookingKmers[idxOfMin] = getNextKmer(entryKmer, diffFileList[idxOfMin], diffFileIdx[idxOfMin]); -// lookingInfos[idxOfMin] = infoFileList[idxOfMin].data[infoFileIdx[idxOfMin]]; -// infoFileIdx[idxOfMin] ++; -// -// if(diffFileIdx[idxOfMin] > maxIdxOfEachFiles[idxOfMin] ){ -// lookingKmers[idxOfMin] = UINT64_MAX; -// numOfincompletedFiles--; -// if(numOfincompletedFiles == 0){ -// endFlag = 1; -// break; -// } -// } -// idxOfMin = smallest(lookingKmers, lookingInfos, taxIdListAtRank, numOfSplitFiles); -// } -// -// entryInfo.redundancy = (hasSeenOtherStrains > 0 || entryInfo.redundancy); -// getDiffIdx(lastWrittenKmer, entryKmer, mergedDiffFile, diffBuffer, diffBufferIdx, totalBufferIdx); -// lastWrittenKmer = entryKmer; -// writeInfo(&entryInfo, mergedInfoFile, infoBuffer, infoBufferIdx, totalInfoIdx); -// writtenKmerCnt++; -// -// if(AminoAcid(lastWrittenKmer) != AAofTempSplitOffset && splitCheck == 1){ -// splitList[splitListIdx++] = {lastWrittenKmer, totalBufferIdx, totalInfoIdx}; -// splitCheck = 0; -// } -// -// if(writtenKmerCnt == offsetList[offsetListIdx]){ -// AAofTempSplitOffset = AminoAcid(lastWrittenKmer); -// splitCheck = 1; -// offsetListIdx++; -// } -// -// if(endFlag == 1) break; -// } -// -// cre->flushInfoBuf(infoBuffer, mergedInfoFile, infoBufferIdx); -// cre->flushKmerBuf(diffBuffer, mergedDiffFile, diffBufferIdx); -// fwrite(splitList, sizeof(DiffIdxSplit), SplitNum, diffIdxSplitFile); -// for(int i = 0; i < SplitNum; i++){ -// cout< mergedMap; while (std::getline(ss, line)) { std::vector result = splitByDelimiter(line, "\t|\t", 2); @@ -1171,4 +1170,4 @@ TaxID IndexCreator::getMaxTaxID() { ss.close(); return maxTaxID; -} \ No newline at end of file +} diff --git a/src/commons/IndexCreator.h b/src/commons/IndexCreator.h index 5137e10b..c491cb20 100644 --- a/src/commons/IndexCreator.h +++ b/src/commons/IndexCreator.h @@ -185,11 +185,14 @@ class IndexCreator{ unordered_map & foundAcc2taxid); static void getSeqSegmentsWithHead(vector & seqSegments, const char * seqFileName); + IndexCreator(const LocalParameters & par); + IndexCreator() {taxonomy = nullptr;} + ~IndexCreator(); + int getNumOfFlush(); - void startIndexCreatingParallel(const LocalParameters & par); void createIndex(const LocalParameters & par); @@ -197,11 +200,17 @@ class IndexCreator{ void getDiffIdx(const uint64_t & lastKmer, const uint64_t & entryToWrite, FILE* handleKmerTable, uint16_t *kmerBuf, size_t & localBufIdx); + void getDiffIdx(const uint64_t & lastKmer, const uint64_t & entryToWrite, FILE* handleKmerTable, uint16_t *kmerBuf, size_t & localBufIdx, size_t & totalBufferIdx); + void writeInfo(TargetKmerInfo * entryToWrite, FILE * infoFile, TargetKmerInfo * infoBuffer, size_t & infoBufferIdx); + static void flushKmerBuf(uint16_t *buffer, FILE *handleKmerTable, size_t & localBufIdx); + static void flushInfoBuf(TargetKmerInfo * buffer, FILE * infoFile, size_t & localBufIdx ); + void makeAAoffsets(const LocalParameters & par); + }; #endif //ADKMER4_INDEXCREATOR_H diff --git a/src/commons/Kmer.h b/src/commons/Kmer.h index ce4aaff8..bf02c628 100644 --- a/src/commons/Kmer.h +++ b/src/commons/Kmer.h @@ -41,6 +41,7 @@ struct DiffIdxSplit{ DiffIdxSplit(uint64_t ADkmer, size_t diffIdxOffset, size_t infoIdxOffset) : ADkmer(ADkmer), diffIdxOffset(diffIdxOffset), infoIdxOffset(infoIdxOffset) { } DiffIdxSplit(const DiffIdxSplit & copy) {ADkmer = copy.ADkmer; diffIdxOffset = copy.diffIdxOffset; infoIdxOffset=copy.infoIdxOffset;} DiffIdxSplit() {}; + DiffIdxSplit& operator=(const DiffIdxSplit&) = default; uint64_t ADkmer; size_t diffIdxOffset; size_t infoIdxOffset; diff --git a/src/commons/KmerMatcher.cpp b/src/commons/KmerMatcher.cpp index e0ea45a0..8ee4aa3a 100644 --- a/src/commons/KmerMatcher.cpp +++ b/src/commons/KmerMatcher.cpp @@ -7,7 +7,7 @@ #include KmerMatcher::KmerMatcher(const LocalParameters & par, - NcbiTaxonomy * taxonomy) { + NcbiTaxonomy * taxonomy) : par(par) { // Parameters threads = par.threads; dbDir = par.filenames[1 + (par.seqMode == 2)]; @@ -19,7 +19,6 @@ KmerMatcher::KmerMatcher(const LocalParameters & par, this->taxonomy = taxonomy; loadTaxIdList(par); - } @@ -112,6 +111,8 @@ void KmerMatcher::loadTaxIdList(const LocalParameters & par) { bool KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer, Buffer * matchBuffer, const string & db){ + std::cout << "Comparing query and reference metamers..." << std::endl; + // Set database files if (db.empty()) { targetDiffIdxFileName = dbDir + "/diffIdx"; @@ -121,9 +122,831 @@ bool KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer, targetDiffIdxFileName = dbDir + "/" + db + "/diffIdx"; targetInfoFileName = dbDir + "/" + db + "/info"; diffIdxSplitFileName = dbDir + "/" + db + "/split"; - } + } + MmapedData diffIdxSplits = mmapData(diffIdxSplitFileName.c_str(), 3); size_t numOfDiffIdx = FileUtil::getFileSize(targetDiffIdxFileName) / sizeof(uint16_t); + size_t queryKmerNum = queryKmerBuffer->startIndexOfReserve; + QueryKmer * queryKmerList = queryKmerBuffer->buffer; + + // Find the first index of garbage query k-mer (UINT64_MAX) and discard from there + for (size_t checkN = queryKmerNum - 1; checkN > 0; checkN--) { + if (queryKmerList[checkN].ADkmer != UINT64_MAX) { + queryKmerNum = checkN + 1; + break; + } + } + + // Filter out meaningless target splits + size_t numOfDiffIdxSplits = diffIdxSplits.fileSize / sizeof(DiffIdxSplit); + size_t numOfDiffIdxSplits_use = numOfDiffIdxSplits; + for (size_t i = 1; i < numOfDiffIdxSplits; i++) { + if (diffIdxSplits.data[i].ADkmer == 0 || diffIdxSplits.data[i].ADkmer == UINT64_MAX) { + diffIdxSplits.data[i] = {UINT64_MAX, UINT64_MAX, UINT64_MAX}; + numOfDiffIdxSplits_use--; + } + } + + // Divide query k-mer list into blocks for multi threading. + // Each split has start and end points of query list + proper offset point of target k-mer list + std::vector querySplits; + uint64_t queryAA; + size_t quotient = queryKmerNum / threads; + size_t remainder = queryKmerNum % threads; + size_t startIdx = 0; + size_t endIdx = 0; // endIdx is inclusive + for (size_t i = 0; i < threads; i++) { + endIdx = startIdx + quotient - 1; + if (remainder > 0) { + endIdx++; + remainder--; + } + bool needLastTargetBlock = true; + queryAA = AMINO_ACID_PART(queryKmerList[startIdx].ADkmer); + for (size_t j = 0; j < numOfDiffIdxSplits_use; j ++) { + if (queryAA <= AMINO_ACID_PART(diffIdxSplits.data[j].ADkmer)) { + j = j - (j != 0); + querySplits.emplace_back(startIdx, endIdx, endIdx - startIdx + 1, diffIdxSplits.data[j]); + needLastTargetBlock = false; + break; + } + } + if (needLastTargetBlock) { + querySplits.emplace_back(startIdx, endIdx, endIdx - startIdx + 1, diffIdxSplits.data[numOfDiffIdxSplits_use - 2]); + } + startIdx = endIdx + 1; + } + + if (querySplits.size() != threads) { + threads = querySplits.size(); + } + + bool *splitCheckList = (bool *) malloc(sizeof(bool) * threads); + std::fill_n(splitCheckList, threads, false); + time_t beforeSearch = time(nullptr); + size_t totalOverFlowCnt = 0; +#pragma omp parallel default(none), shared(splitCheckList, totalOverFlowCnt, \ +querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffIdx, targetInfoFileName) +{ + // FILE + FILE * diffIdxFp = fopen(targetDiffIdxFileName.c_str(), "rb"); + FILE * kmerInfoFp = fopen(targetInfoFileName.c_str(), "rb"); + + // Target K-mer buffer + uint16_t * diffIdxBuffer = (uint16_t *) malloc(sizeof(uint16_t) * (BufferSize + 1)); // size = 32 Mb + TargetKmerInfo * kmerInfoBuffer = (TargetKmerInfo *) malloc(sizeof(TargetKmerInfo) * (BufferSize + 1)); // 64 Mb + size_t kmerInfoBufferIdx = 0; + size_t diffIdxBufferIdx = 0; + + // Query variables + uint64_t currentQuery = UINT64_MAX; + uint64_t currentQueryAA = UINT64_MAX; + QueryKmerInfo currentQueryInfo; + + // Target variables + size_t diffIdxPos = 0; + std::vector candidateTargetKmers; // vector for candidate target k-mer, some of which are selected after based on hamming distance + std::vector candidateKmerInfos; + std::vector hammingDists; + uint64_t currentTargetKmer; + + // Match buffer for each thread + size_t localBufferSize = 2'000'000; // 32 Mb + auto *matches = new Match[localBufferSize]; // 16 * 2'000'000 = 32 Mb + size_t matchCnt = 0; + + // Vectors for selected target k-mers + std::vector selectedHammingSum; + std::vector selectedMatches; + std::vector selectedHammings; + std::vector selectedDnaEncodings; + selectedHammingSum.resize(1024); + selectedMatches.resize(1024); + selectedHammings.resize(1024); + selectedDnaEncodings.resize(1024); + size_t selectedMatchCnt = 0; + + size_t posToWrite; + size_t idx; + bool hasOverflow = false; + +#pragma omp for schedule(dynamic, 1) + for (size_t i = 0; i < querySplits.size(); i++) { + if (totalOverFlowCnt > 0 || splitCheckList[i]) { + continue; + } + currentTargetKmer = querySplits[i].diffIdxSplit.ADkmer; + diffIdxBufferIdx = querySplits[i].diffIdxSplit.diffIdxOffset; + kmerInfoBufferIdx = querySplits[i].diffIdxSplit.infoIdxOffset + - (querySplits[i].diffIdxSplit.ADkmer != 0); + diffIdxPos = querySplits[i].diffIdxSplit.diffIdxOffset; + + fseek(kmerInfoFp, 4 * (long)(kmerInfoBufferIdx), SEEK_SET); + loadBuffer(kmerInfoFp, kmerInfoBuffer, kmerInfoBufferIdx, BufferSize); + fseek(diffIdxFp, 2 * (long) (diffIdxBufferIdx), SEEK_SET); + loadBuffer(diffIdxFp, diffIdxBuffer, diffIdxBufferIdx, BufferSize); + + if (querySplits[i].diffIdxSplit.ADkmer == 0 && querySplits[i].diffIdxSplit.diffIdxOffset == 0 + && querySplits[i].diffIdxSplit.infoIdxOffset == 0) { + currentTargetKmer = getNextTargetKmer(currentTargetKmer, diffIdxBuffer, + diffIdxBufferIdx, diffIdxPos); + } + + currentQuery = UINT64_MAX; + currentQueryAA = UINT64_MAX; + size_t lastMovedQueryIdx = 0; + for (size_t j = querySplits[i].start; j < querySplits[i].end + 1; j++) { + // Reuse the comparison data if queries are exactly identical + if (currentQuery == queryKmerList[j].ADkmer + && (currentQueryInfo.frame/3 == queryKmerList[j].info.frame/3)) { + // If local buffer is full, copy them to the shared buffer. + if (unlikely(matchCnt + selectedMatchCnt > localBufferSize)) { + // Check if the shared buffer is full. + posToWrite = matchBuffer->reserveMemory(matchCnt); + if (unlikely(posToWrite + matchCnt >= matchBuffer->bufferSize)) { + hasOverflow = true; + __sync_fetch_and_add(&totalOverFlowCnt, 1); + __sync_fetch_and_sub(& matchBuffer->startIndexOfReserve, matchCnt); + break; + } + moveMatches(matchBuffer->buffer + posToWrite, matches, matchCnt); + lastMovedQueryIdx = j; + } + for (size_t k = 0; k < selectedMatchCnt; k++) { + idx = selectedMatches[k]; + matches[matchCnt] = {queryKmerList[j].info, + candidateKmerInfos[idx].sequenceID, + taxId2speciesId[candidateKmerInfos[idx].sequenceID], + selectedDnaEncodings[k], + selectedHammings[k], + selectedHammingSum[k]}; + matchCnt++; + } + continue; + } + selectedMatchCnt = 0; + + // Reuse the candidate target k-mers to compare in DNA level if queries are the same at amino acid level but not at DNA level + if (currentQueryAA == AMINO_ACID_PART(queryKmerList[j].ADkmer)) { + compareDna(queryKmerList[j].ADkmer, candidateTargetKmers, hammingDists, selectedMatches, + selectedHammingSum, selectedHammings, selectedDnaEncodings, selectedMatchCnt, queryKmerList[j].info.frame); + // If local buffer is full, copy them to the shared buffer. + if (unlikely(matchCnt + selectedMatchCnt > localBufferSize)) { + // Check if the shared buffer is full. + posToWrite = matchBuffer->reserveMemory(matchCnt); + if (unlikely(posToWrite + matchCnt >= matchBuffer->bufferSize)) { + hasOverflow = true; + __sync_fetch_and_add(&totalOverFlowCnt, 1); + __sync_fetch_and_sub(& matchBuffer->startIndexOfReserve, matchCnt); + break; + } + moveMatches(matchBuffer->buffer + posToWrite, matches, matchCnt); + lastMovedQueryIdx = j; + } + for (size_t k = 0; k < selectedMatchCnt; k++) { + idx = selectedMatches[k]; + matches[matchCnt] = {queryKmerList[j].info, + candidateKmerInfos[idx].sequenceID, + taxId2speciesId[candidateKmerInfos[idx].sequenceID], + selectedDnaEncodings[k], + selectedHammings[k], + selectedHammingSum[k]}; + matchCnt++; + } + currentQuery = queryKmerList[j].ADkmer; + currentQueryAA = AMINO_ACID_PART(currentQuery); + currentQueryInfo = queryKmerList[j].info; + continue; + } + candidateTargetKmers.clear(); + candidateKmerInfos.clear(); + + // Get next query, and start to find + currentQuery = queryKmerList[j].ADkmer; + currentQueryAA = AMINO_ACID_PART(currentQuery); + currentQueryInfo = queryKmerList[j].info; + + // Skip target k-mers that are not matched in amino acid level + while (diffIdxPos != numOfDiffIdx + && (currentQueryAA > AMINO_ACID_PART(currentTargetKmer))) { + if (unlikely(BufferSize < diffIdxBufferIdx + 7)){ + loadBuffer(diffIdxFp, diffIdxBuffer, diffIdxBufferIdx, BufferSize, ((int)(BufferSize - diffIdxBufferIdx)) * -1 ); + } + currentTargetKmer = getNextTargetKmer(currentTargetKmer, diffIdxBuffer, + diffIdxBufferIdx, diffIdxPos); + kmerInfoBufferIdx ++; + } + + if (currentQueryAA != AMINO_ACID_PART(currentTargetKmer)) { + continue; + } + + // Load target k-mers that are matched in amino acid level + while (diffIdxPos != numOfDiffIdx && + currentQueryAA == AMINO_ACID_PART(currentTargetKmer)) { + candidateTargetKmers.push_back(currentTargetKmer); + candidateKmerInfos.push_back(getKmerInfo(BufferSize, kmerInfoFp, kmerInfoBuffer, kmerInfoBufferIdx)); + if (unlikely(BufferSize < diffIdxBufferIdx + 7)){ + loadBuffer(diffIdxFp, diffIdxBuffer, diffIdxBufferIdx, BufferSize, ((int)(BufferSize - diffIdxBufferIdx)) * -1 ); + } + currentTargetKmer = getNextTargetKmer(currentTargetKmer, diffIdxBuffer, + diffIdxBufferIdx, diffIdxPos); + kmerInfoBufferIdx ++; + } + + if (candidateTargetKmers.size() > selectedMatches.size()) { + selectedMatches.resize(candidateTargetKmers.size()); + selectedHammingSum.resize(candidateTargetKmers.size()); + selectedHammings.resize(candidateTargetKmers.size()); + selectedDnaEncodings.resize(candidateTargetKmers.size()); + } + + // Compare the current query and the loaded target k-mers and select + compareDna(currentQuery, candidateTargetKmers, hammingDists, selectedMatches, selectedHammingSum, + selectedHammings, selectedDnaEncodings, selectedMatchCnt, queryKmerList[j].info.frame); + + // If local buffer is full, copy them to the shared buffer. + if (unlikely(matchCnt + selectedMatchCnt > localBufferSize)) { + // Check if the shared buffer is full. + posToWrite = matchBuffer->reserveMemory(matchCnt); + if (unlikely(posToWrite + matchCnt >= matchBuffer->bufferSize)) { + hasOverflow = true; + __sync_fetch_and_add(&totalOverFlowCnt, 1); + __sync_fetch_and_sub(&matchBuffer->startIndexOfReserve, matchCnt); + break; + } + moveMatches(matchBuffer->buffer + posToWrite, matches, matchCnt); + lastMovedQueryIdx = j; + } + + for (size_t k = 0; k < selectedMatchCnt; k++) { + idx = selectedMatches[k]; + matches[matchCnt] = {queryKmerList[j].info, + candidateKmerInfos[idx].sequenceID, + taxId2speciesId[candidateKmerInfos[idx].sequenceID], + selectedDnaEncodings[k], + selectedHammings[k], + selectedHammingSum[k]}; + matchCnt++; + } + } // End of one split + + // Move matches in the local buffer to the shared buffer + posToWrite = matchBuffer->reserveMemory(matchCnt); + if (unlikely(posToWrite + matchCnt >= matchBuffer->bufferSize)) { + hasOverflow = true; + __sync_fetch_and_add(&totalOverFlowCnt, 1); + __sync_fetch_and_sub(& matchBuffer->startIndexOfReserve, matchCnt); + } else { + moveMatches(matchBuffer->buffer + posToWrite, matches, matchCnt); + } + + // Check whether current split is completed or not + if (!hasOverflow) { + splitCheckList[i] = true; + } + } // End of omp for (Iterating for splits) + delete[] matches; + fclose(diffIdxFp); + fclose(kmerInfoFp); + free(diffIdxBuffer); + free(kmerInfoBuffer); +} // End of omp parallel + + if (totalOverFlowCnt > 0) { + return false; + } + std::cout << "Time spent for the comparison: " << double(time(nullptr) - beforeSearch) << std::endl; + free(splitCheckList); + totalMatchCnt += matchBuffer->startIndexOfReserve; + return true; +} + +// bool KmerMatcher::matchKmers2(QueryKmerBuffer * queryKmerBuffer, +// Buffer * matchBuffer, +// const string & db){ +// // Set database files +// if (db.empty()) { +// targetDiffIdxFileName = dbDir + "/diffIdx"; +// targetInfoFileName = dbDir + "/info"; +// diffIdxSplitFileName = dbDir + "/split"; +// } else { // for the case of multiple databases +// targetDiffIdxFileName = dbDir + "/" + db + "/diffIdx"; +// targetInfoFileName = dbDir + "/" + db + "/info"; +// diffIdxSplitFileName = dbDir + "/" + db + "/split"; +// } +// MmapedData diffIdxSplits = mmapData(diffIdxSplitFileName.c_str(), 3); +// size_t numOfDiffIdx = FileUtil::getFileSize(targetDiffIdxFileName) / sizeof(uint16_t); + +// // // Print target k-mer information +// // MmapedData targetKmerInfo2 = mmapData(targetInfoFileName.c_str(), 3); +// // size_t numOfTargetKmer = targetKmerInfo2.fileSize / sizeof(TargetKmerInfo); +// // for (size_t i = 0; i < numOfTargetKmer; i++) { +// // cout << targetKmerInfo2.data[i].sequenceID << "\t" << (int) targetKmerInfo2.data[i].redundancy << endl; +// // } + +// size_t queryKmerNum = queryKmerBuffer->startIndexOfReserve; +// QueryKmer *queryKmerList = queryKmerBuffer->buffer; + +// std::cout << "Comparing query and reference metamers..." << std::endl; + +// // Find the first index of garbage query k-mer (UINT64_MAX) and discard from there +// for (size_t checkN = queryKmerNum - 1; checkN > 0; checkN--) { +// if (queryKmerList[checkN].ADkmer != UINT64_MAX) { +// queryKmerNum = checkN + 1; +// break; +// } +// } + +// // Filter out meaningless target splits +// size_t numOfDiffIdxSplits = diffIdxSplits.fileSize / sizeof(DiffIdxSplit); +// size_t numOfDiffIdxSplits_use = numOfDiffIdxSplits; +// for (size_t i = 1; i < numOfDiffIdxSplits; i++) { +// if (diffIdxSplits.data[i].ADkmer == 0 || diffIdxSplits.data[i].ADkmer == UINT64_MAX) { +// diffIdxSplits.data[i] = {UINT64_MAX, UINT64_MAX, UINT64_MAX}; +// numOfDiffIdxSplits_use--; +// } +// } + +// // Divide query k-mer list into blocks for multi threading. +// // Each split has start and end points of query list + proper offset point of target k-mer list +// std::vector querySplits; +// uint64_t queryAA; +// size_t quotient = queryKmerNum / threads; +// size_t remainder = queryKmerNum % threads; +// size_t startIdx = 0; +// size_t endIdx = 0; // endIdx is inclusive +// for (size_t i = 0; i < threads; i++) { +// endIdx = startIdx + quotient - 1; +// if (remainder > 0) { +// endIdx++; +// remainder--; +// } +// bool needLastTargetBlock = true; +// queryAA = AMINO_ACID_PART(queryKmerList[startIdx].ADkmer); +// for (size_t j = 0; j < numOfDiffIdxSplits_use; j ++) { +// if (queryAA <= AMINO_ACID_PART(diffIdxSplits.data[j].ADkmer)) { +// j = j - (j != 0); +// querySplits.emplace_back(startIdx, endIdx, endIdx - startIdx + 1, diffIdxSplits.data[j]); +// needLastTargetBlock = false; +// break; +// } +// } +// if (needLastTargetBlock) { +// querySplits.emplace_back(startIdx, endIdx, endIdx - startIdx + 1, diffIdxSplits.data[numOfDiffIdxSplits_use - 2]); +// } +// startIdx = endIdx + 1; +// } + +// if (querySplits.size() != threads) { +// threads = querySplits.size(); +// } + +// bool *splitCheckList = (bool *) malloc(sizeof(bool) * threads); +// std::fill_n(splitCheckList, threads, false); +// size_t completedSplitCnt = 0; + +// time_t beforeSearch = time(nullptr); + +// while (completedSplitCnt < threads) { +// bool hasOverflow = false; +// #pragma omp parallel default(none), shared(completedSplitCnt, splitCheckList, hasOverflow, \ +// querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffIdx, targetInfoFileName) +// { +// // FILE +// FILE * diffIdxFp = fopen(targetDiffIdxFileName.c_str(), "rb"); +// FILE * kmerInfoFp = fopen(targetInfoFileName.c_str(), "rb"); +// // #if !defined(O_DIRECT) +// // const int mode = (O_RDONLY | O_SYNC); +// // #else +// // const int mode = (O_RDONLY | O_DIRECT | O_SYNC); +// // #endif +// // // Open target table +// // int fdDiffIdx = open(targetDiffIdxFileName.c_str(), mode); +// // if (fdDiffIdx < 0) { +// // cout << "Open target table " << targetDiffIdxFileName << " failed\n"; +// // // EXIT(EXIT_FAILURE); +// // } + +// // int fdKmerInfo = open(targetInfoFileName.c_str(), mode); +// // if (fdKmerInfo < 0) { +// // cout << "Open target table " << targetInfoFileName << " failed\n"; +// // // EXIT(EXIT_FAILURE); +// // } + +// // #if !defined(O_DIRECT) && defined(F_NOCACHE) +// // fcntl(fdDiffIdx, F_NOCACHE, 1); +// // fcntl(fdKmerInfo, F_NOCACHE, 1); +// // #endif + +// // Target K-mer buffer +// uint16_t * diffIdxBuffer = (uint16_t *) malloc(sizeof(uint16_t) * (BufferSize + 1)); // size = 32 Mb +// TargetKmerInfo * kmerInfoBuffer = (TargetKmerInfo *) malloc(sizeof(TargetKmerInfo) * (BufferSize + 1)); // 64 Mb +// size_t kmerInfoBufferIdx = 0; +// size_t diffIdxBufferIdx = 0; +// size_t diffIdxOffset = 0; + +// // Match buffer for each thread +// size_t localBufferSize = 2'000'000; // 32 Mb +// auto *matches = new Match[localBufferSize]; // 16 * 2'000'000 = 32 Mb +// size_t matchCnt = 0; + +// // Query variables +// uint64_t currentQuery = UINT64_MAX; +// uint64_t currentQueryAA = UINT64_MAX; +// QueryKmerInfo currentQueryInfo; + +// // Target variables +// // To Do: Count the number of candidates +// size_t diffIdxPos = 0; +// uint64_t * candidateKmers = new uint64_t[localBufferSize]; // 16 MB +// TargetKmerInfo * candidateKmerInfos = new TargetKmerInfo[localBufferSize]; // 8 MB +// size_t candidateKmerCnt = 0; +// std::vector hammingDists; +// uint64_t currentTargetKmer; + +// // Vectors for selected target k-mers +// std::vector selectedHammingSum; +// std::vector selectedMatches; +// std::vector selectedHammings; +// std::vector selectedDnaEncodings; +// selectedHammingSum.resize(1024); +// selectedMatches.resize(1024); +// selectedHammings.resize(1024); +// selectedDnaEncodings.resize(1024); +// size_t selectedMatchCnt = 0; + +// size_t posToWrite; + +// size_t idx; +// #pragma omp for schedule(dynamic, 1) +// for (size_t i = 0; i < querySplits.size(); i++) { +// if (hasOverflow || splitCheckList[i]) { +// continue; +// } + +// currentTargetKmer = querySplits[i].diffIdxSplit.ADkmer; +// diffIdxBufferIdx = querySplits[i].diffIdxSplit.diffIdxOffset; +// kmerInfoBufferIdx = querySplits[i].diffIdxSplit.infoIdxOffset +// - (querySplits[i].diffIdxSplit.ADkmer != 0); +// diffIdxPos = querySplits[i].diffIdxSplit.diffIdxOffset; +// diffIdxOffset = querySplits[i].diffIdxSplit.diffIdxOffset; + +// // fseek(kmerInfoFp, 4 * (long)(kmerInfoBufferIdx), SEEK_SET); +// // loadBuffer(kmerInfoFp, kmerInfoBuffer, kmerInfoBufferIdx, BufferSize); +// // fseek(diffIdxFp, 2 * (long) (diffIdxBufferIdx), SEEK_SET); +// // loadBuffer(diffIdxFp, diffIdxBuffer, diffIdxBufferIdx, BufferSize); + +// loadBuffer2(fdKmerInfo, kmerInfoBuffer, kmerInfoBufferIdx, BufferSize, 4 * static_cast(kmerInfoBufferIdx)); + +// loadBuffer2(fdDiffIdx, diffIdxBuffer, diffIdxBufferIdx, BufferSize, 2 * static_cast(diffIdxBufferIdx)); + +// if (querySplits[i].diffIdxSplit.ADkmer == 0 && querySplits[i].diffIdxSplit.diffIdxOffset == 0 +// && querySplits[i].diffIdxSplit.infoIdxOffset == 0) { +// currentTargetKmer = getNextTargetKmer(currentTargetKmer, diffIdxBuffer, +// diffIdxBufferIdx, diffIdxPos); +// } + +// currentQuery = UINT64_MAX; +// currentQueryAA = UINT64_MAX; + +// size_t lastMovedQueryIdx = 0; +// for (size_t j = querySplits[i].start; j < querySplits[i].end + 1; j++) { +// querySplits[i].start++; + +// // Reuse the comparison data if queries are exactly identical +// if (currentQuery == queryKmerList[j].ADkmer +// && (currentQueryInfo.frame/3 == queryKmerList[j].info.frame/3)) { +// // If local buffer is full, copy them to the shared buffer. +// if (matchCnt + selectedMatchCnt > localBufferSize) { +// // Check if the shared buffer is full. +// posToWrite = matchBuffer->reserveMemory(matchCnt); +// if (posToWrite + matchCnt >= matchBuffer->bufferSize) { +// hasOverflow = true; +// querySplits[i].start = lastMovedQueryIdx + 1; +// __sync_fetch_and_sub(& matchBuffer->startIndexOfReserve, matchCnt); +// break; +// } else { // not full -> copy matches to the shared buffer +// moveMatches(matchBuffer->buffer + posToWrite, matches, matchCnt); +// lastMovedQueryIdx = j; +// } +// } +// for (size_t k = 0; k < selectedMatchCnt; k++) { +// idx = selectedMatches[k]; +// // Check if candidateKmerInfos[idx].sequenceID is valid +// // if (taxId2genusId.find(candidateKmerInfos[idx].sequenceID) == taxId2genusId.end() || +// // taxId2speciesId.find(candidateKmerInfos[idx].sequenceID) == taxId2speciesId.end()) { +// // cout << "Error: " << candidateKmerInfos[idx].sequenceID << " is not found in the taxonomy database." << endl; +// // } +// matches[matchCnt] = {queryKmerList[j].info, +// candidateKmerInfos[idx].sequenceID, +// taxId2speciesId[candidateKmerInfos[idx].sequenceID], +// selectedDnaEncodings[k], +// selectedHammings[k], +// selectedHammingSum[k]}; +// matchCnt++; +// } +// continue; +// } +// selectedMatchCnt = 0; + +// // Reuse the candidate target k-mers to compare in DNA level if queries are the same at amino acid level but not at DNA level +// if (currentQueryAA == AMINO_ACID_PART(queryKmerList[j].ADkmer)) { +// compareDna2(queryKmerList[j].ADkmer, candidateKmers, candidateKmerCnt, hammingDists, selectedMatches, +// selectedHammingSum, selectedHammings, selectedDnaEncodings, selectedMatchCnt, queryKmerList[j].info.frame); + +// // If local buffer is full, copy them to the shared buffer. +// if (matchCnt + selectedMatchCnt > localBufferSize) { +// // Check if the shared buffer is full. +// posToWrite = matchBuffer->reserveMemory(matchCnt); +// if (posToWrite + matchCnt >= matchBuffer->bufferSize) { +// hasOverflow = true; +// querySplits[i].start = lastMovedQueryIdx + 1; +// __sync_fetch_and_sub(& matchBuffer->startIndexOfReserve, matchCnt); +// break; +// } else { // not full -> copy matches to the shared buffer +// moveMatches(matchBuffer->buffer + posToWrite, matches, matchCnt); +// lastMovedQueryIdx = j; +// } +// } +// for (size_t k = 0; k < selectedMatchCnt; k++) { +// idx = selectedMatches[k]; +// matches[matchCnt] = {queryKmerList[j].info, +// candidateKmerInfos[idx].sequenceID, +// taxId2speciesId[candidateKmerInfos[idx].sequenceID], +// selectedDnaEncodings[k], +// selectedHammings[k], +// selectedHammingSum[k]}; +// matchCnt++; +// } +// currentQuery = queryKmerList[j].ADkmer; +// currentQueryAA = AMINO_ACID_PART(currentQuery); +// currentQueryInfo = queryKmerList[j].info; +// continue; +// } +// candidateKmerCnt = 0; + +// // Get next query, and start to find +// currentQuery = queryKmerList[j].ADkmer; +// currentQueryAA = AMINO_ACID_PART(currentQuery); +// currentQueryInfo = queryKmerList[j].info; + +// // Skip target k-mers that are not matched in amino acid level +// while (diffIdxPos != numOfDiffIdx +// && (currentQueryAA > AMINO_ACID_PART(currentTargetKmer))) { +// if (unlikely(BufferSize < diffIdxBufferIdx + 7)){ +// // loadBuffer2(diffIdxFp, diffIdxBuffer, diffIdxBufferIdx, BufferSize, ((int)(BufferSize - diffIdxBufferIdx)) * -1 ); +// // loadBuffer2(diffIdxFp, diffIdxBuffer, diffIdxBufferIdx, BufferSize, 2 * static_cast(diffIdxBufferIdx)); +// } +// currentTargetKmer = getNextTargetKmer(currentTargetKmer, diffIdxBuffer, +// diffIdxBufferIdx, diffIdxPos); +// kmerInfoBufferIdx ++; +// } + +// if (currentQueryAA != AMINO_ACID_PART(currentTargetKmer)) // Move to next query k-mer if there isn't any match. +// continue; + +// // Load target k-mers that are matched in amino acid level +// while (diffIdxPos != numOfDiffIdx && +// currentQueryAA == AMINO_ACID_PART(currentTargetKmer)) { +// candidateKmers[candidateKmerCnt] = currentTargetKmer; +// candidateKmerInfos[candidateKmerCnt++] = getKmerInfo(BufferSize, kmerInfoFp, kmerInfoBuffer, kmerInfoBufferIdx); +// // Print the target k-mer +// // if (par.printLog == 1) { +// // cout << queryKmerList[j].info.sequenceID << "\t" << queryKmerList[j].info.pos << "\t" +// // << (int) queryKmerList[j].info.frame << endl; +// // cout << "Query k-mer: "; +// // print_binary64(64, currentQuery); +// // cout << "\t"; +// // seqIterator.printKmerInDNAsequence(currentQuery); +// // cout << endl; +// // cout << "Target k-mer: "; +// // print_binary64(64, currentTargetKmer); +// // cout << "\t"; +// // seqIterator.printKmerInDNAsequence(currentTargetKmer); +// // cout << "\t" << kmerInfoBuffer[kmerInfoBufferIdx].sequenceID +// // << "\t" << taxId2speciesId[kmerInfoBuffer[kmerInfoBufferIdx].sequenceID] << endl; +// // cout << (int) getHammingDistanceSum(currentQuery, currentTargetKmer) << "\t"; +// // print_binary16(16, getHammings(currentQuery, currentTargetKmer)); cout << endl; +// // } +// if (unlikely(BufferSize < diffIdxBufferIdx + 7)){ +// loadBuffer(diffIdxFp, diffIdxBuffer, diffIdxBufferIdx, +// BufferSize, ((int)(BufferSize - diffIdxBufferIdx)) * -1 ); +// } + +// currentTargetKmer = getNextTargetKmer(currentTargetKmer, diffIdxBuffer, +// diffIdxBufferIdx, diffIdxPos); +// kmerInfoBufferIdx ++; +// } + +// if (candidateKmerCnt > selectedMatches.size()) { +// selectedMatches.resize(candidateKmerCnt); +// selectedHammingSum.resize(candidateKmerCnt); +// selectedHammings.resize(candidateKmerCnt); +// selectedDnaEncodings.resize(candidateKmerCnt); +// } + +// // Compare the current query and the loaded target k-mers and select +// compareDna2(currentQuery, candidateKmers, candidateKmerCnt, hammingDists, selectedMatches, selectedHammingSum, +// selectedHammings, selectedDnaEncodings, selectedMatchCnt, queryKmerList[j].info.frame); + +// // If local buffer is full, copy them to the shared buffer. +// if (matchCnt + selectedMatchCnt > localBufferSize) { +// // Check if the shared buffer is full. +// posToWrite = matchBuffer->reserveMemory(matchCnt); +// if (posToWrite + matchCnt >= matchBuffer->bufferSize) { // full -> write matches to file first +// hasOverflow = true; +// querySplits[i].start = lastMovedQueryIdx + 1; +// __sync_fetch_and_sub(&matchBuffer->startIndexOfReserve, matchCnt); +// break; +// } else { // not full -> copy matches to the shared buffer +// moveMatches(matchBuffer->buffer + posToWrite, matches, matchCnt); +// lastMovedQueryIdx = j; +// } +// } + +// for (size_t k = 0; k < selectedMatchCnt; k++) { +// idx = selectedMatches[k]; +// matches[matchCnt] = {queryKmerList[j].info, +// candidateKmerInfos[idx].sequenceID, +// taxId2speciesId[candidateKmerInfos[idx].sequenceID], +// selectedDnaEncodings[k], +// selectedHammings[k], +// selectedHammingSum[k]}; +// matchCnt++; +// } +// } // End of one split + +// // Move matches in the local buffer to the shared buffer +// posToWrite = matchBuffer->reserveMemory(matchCnt); +// if (posToWrite + matchCnt >= matchBuffer->bufferSize) { +// hasOverflow = true; +// querySplits[i].start = lastMovedQueryIdx + 1; +// __sync_fetch_and_sub(& matchBuffer->startIndexOfReserve, matchCnt); +// } else { +// moveMatches(matchBuffer->buffer + posToWrite, matches, matchCnt); +// } + +// // Check whether current split is completed or not +// if (querySplits[i].start - 1 == querySplits[i].end) { +// splitCheckList[i] = true; +// __sync_fetch_and_add(&completedSplitCnt, 1); +// } +// } // End of omp for (Iterating for splits) +// delete[] matches; +// delete[] candidateKmers; +// delete[] candidateKmerInfos; +// fclose(diffIdxFp); +// fclose(kmerInfoFp); +// free(diffIdxBuffer); +// free(kmerInfoBuffer); +// } // End of omp parallel + +// if (hasOverflow) { +// return false; +// } +// } // end of while(completeSplitCnt < threadNum) + +// std::cout << "Time spent for the comparison: " << double(time(nullptr) - beforeSearch) << std::endl; +// free(splitCheckList); + +// totalMatchCnt += matchBuffer->startIndexOfReserve; +// return true; +// } + +void KmerMatcher::sortMatches(Buffer * matchBuffer) { + time_t beforeSortMatches = time(nullptr); + std::cout << "Sorting matches ..." << std::endl; + SORT_PARALLEL(matchBuffer->buffer, + matchBuffer->buffer + matchBuffer->startIndexOfReserve, + compareMatches); + std::cout << "Time spent for sorting matches: " << double(time(nullptr) - beforeSortMatches) << std::endl; +} + +void KmerMatcher::moveMatches(Match *dest, Match *src, size_t & matchNum) { + memcpy(dest, src, sizeof(Match) * matchNum); + matchNum = 0; +} + +// It compares query k-mers to target k-mers. +// If a query has matches, the matches with the smallest hamming distance will be selected +void KmerMatcher::compareDna(uint64_t query, + std::vector &targetKmersToCompare, + std::vector & hammingDists, + std::vector &selectedMatches, + std::vector &selectedHammingSum, + std::vector &selectedHammings, + std::vector &selectedDnaEncodings, + size_t & selectedMatchIdx, + uint8_t frame) { + hammingDists.resize(targetKmersToCompare.size()); + uint8_t minHammingSum = UINT8_MAX; + + // Calculate hamming distance + for (size_t i = 0; i < targetKmersToCompare.size(); i++) { + hammingDists[i] = getHammingDistanceSum(query, targetKmersToCompare[i]); + minHammingSum = min(minHammingSum, hammingDists[i]); + } + + // Select target k-mers that passed hamming criteria + selectedMatchIdx = 0; + uint8_t maxHamming = min(minHammingSum * 2, 7); + for (size_t h = 0; h < targetKmersToCompare.size(); h++) { + if (hammingDists[h] <= maxHamming) { + selectedHammingSum[selectedMatchIdx] = hammingDists[h]; + selectedDnaEncodings[selectedMatchIdx] = GET_24_BITS_UINT(targetKmersToCompare[h]); + selectedHammings[selectedMatchIdx] = (frame < 3) + ? getHammings(query, targetKmersToCompare[h]) + : getHammings_reverse(query, targetKmersToCompare[h]); + selectedMatches[selectedMatchIdx++] = h; + } + } +} + + + +void KmerMatcher::compareDna2(uint64_t query, + const uint64_t * targetKmersToCompare, + size_t candidateCnt, + std::vector & hammingDists, + std::vector &selectedMatches, + std::vector &selectedHammingSum, + std::vector &selectedHammings, + std::vector &selectedDnaEncodings, + size_t & selectedMatchIdx, + uint8_t frame) { + hammingDists.resize(candidateCnt); + uint8_t minHammingSum = UINT8_MAX; + + // Calculate hamming distance + for (size_t i = 0; i < candidateCnt; i++) { + hammingDists[i] = getHammingDistanceSum(query, targetKmersToCompare[i]); + if (hammingDists[i] < minHammingSum) { + minHammingSum = hammingDists[i]; + } + } + + // Select target k-mers that passed hamming criteria + selectedMatchIdx = 0; + uint8_t maxHamming = min(minHammingSum * 2, 7); + for (size_t h = 0; h < candidateCnt; h++) { + if (hammingDists[h] <= maxHamming) { + selectedMatches[selectedMatchIdx] = h; + selectedHammingSum[selectedMatchIdx] = hammingDists[h]; + if (frame < 3) { // Frame of query k-mer + selectedHammings[selectedMatchIdx] = getHammings(query, targetKmersToCompare[h]); + } else { + selectedHammings[selectedMatchIdx] = getHammings_reverse(query, targetKmersToCompare[h]); + } + // Store right 24 bits of the target k-mer in selectedDnaEncodings + selectedDnaEncodings[selectedMatchIdx] = GET_24_BITS_UINT(targetKmersToCompare[h]); + selectedMatchIdx++; + } + } +} + + +bool KmerMatcher::compareMatches(const Match& a, const Match& b) { + if (a.qInfo.sequenceID != b.qInfo.sequenceID) + return a.qInfo.sequenceID < b.qInfo.sequenceID; + + if (a.speciesId != b.speciesId) + return a.speciesId < b.speciesId; + + if (a.qInfo.frame != b.qInfo.frame) + return a.qInfo.frame < b.qInfo.frame; + + if (a.qInfo.pos != b.qInfo.pos) + return a.qInfo.pos < b.qInfo.pos; + + if (a.hamming != b.hamming) + return a.hamming < b.hamming; + + return a.dnaEncoding < b.dnaEncoding; +} + +bool KmerMatcher::matchKmers_skipDecoding(QueryKmerBuffer * queryKmerBuffer, + Buffer * matchBuffer, + const string & db){ + // Set database files + if (db.empty()) { + targetDiffIdxFileName = dbDir + "/diffIdx"; + targetInfoFileName = dbDir + "/info"; + diffIdxSplitFileName = dbDir + "/split"; + } else { // for the case of multiple databases + targetDiffIdxFileName = dbDir + "/" + db + "/diffIdx"; + targetInfoFileName = dbDir + "/" + db + "/info"; + diffIdxSplitFileName = dbDir + "/" + db + "/split"; + } + + MmapedData diffIdxSplits = mmapData(diffIdxSplitFileName.c_str(), 3); + size_t numOfDiffIdx = FileUtil::getFileSize(targetDiffIdxFileName) / sizeof(uint16_t); + + // Load diffIdx count list + string diffIdxAAFileName = targetDiffIdxFileName + ".aa"; + MmapedData aminoacids = mmapData(diffIdxAAFileName.c_str(), 3); + size_t aaOffsetCnt = FileUtil::getFileSize(diffIdxAAFileName) / sizeof(uint64_t); + // cout << "aaOffsetCnt: " << aaOffsetCnt << endl; // // Print target k-mer information // MmapedData targetKmerInfo2 = mmapData(targetInfoFileName.c_str(), 3); @@ -184,25 +1007,63 @@ bool KmerMatcher::matchKmers(QueryKmerBuffer * queryKmerBuffer, } startIdx = endIdx + 1; } + munmap(diffIdxSplits.data, diffIdxSplits.fileSize + 1); if (querySplits.size() != threads) { threads = querySplits.size(); } + // Map querySplits to amino acid offset + vector offsets; + size_t aaOffset = 0; + bool isFound = false; + for (size_t i = 0; i < querySplits.size(); i++) { + for (; aaOffset < aaOffsetCnt; aaOffset++) { + if (AMINO_ACID_PART(querySplits[i].diffIdxSplit.ADkmer) < aminoacids.data[aaOffset]) { + offsets.push_back(aaOffset); + isFound = true; + break; + } + } + if (!isFound) { + offsets.push_back(offsets.back()); + } + isFound = false; + // cout << offsets.back() << endl; + } + munmap(aminoacids.data, aminoacids.fileSize + 1); + + bool *splitCheckList = (bool *) malloc(sizeof(bool) * threads); std::fill_n(splitCheckList, threads, false); size_t completedSplitCnt = 0; time_t beforeSearch = time(nullptr); + // development + size_t totalSkip = 0; + while (completedSplitCnt < threads) { bool hasOverflow = false; #pragma omp parallel default(none), shared(completedSplitCnt, splitCheckList, hasOverflow, \ -querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffIdx, targetInfoFileName) +querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffIdx, targetInfoFileName, \ +offsets, aaOffsetCnt, totalSkip) { // FILE FILE * diffIdxFp = fopen(targetDiffIdxFileName.c_str(), "rb"); FILE * kmerInfoFp = fopen(targetInfoFileName.c_str(), "rb"); + FILE * aaFp = fopen((targetDiffIdxFileName + ".aa").c_str(), "rb"); + FILE * cntFp = fopen((targetDiffIdxFileName + ".deltaCnt").c_str(), "rb"); + FILE * kmerCntFp = fopen((targetDiffIdxFileName + ".kmerCnt").c_str(), "rb"); + FILE * kmerFp = fopen((targetDiffIdxFileName + ".kmers").c_str(), "rb"); + + uint64_t * aaBuffer = (uint64_t *) malloc(sizeof(uint64_t) * (BufferSize + 1)); + uint64_t * nextKmers = (uint64_t *) malloc(sizeof(uint64_t) * (BufferSize + 1)); + uint32_t * cntBuffer = (uint32_t *) malloc(sizeof(uint32_t) * (BufferSize + 1)); + uint32_t * kmerCntBuffer = (uint32_t *) malloc(sizeof(uint32_t) * (BufferSize + 1)); + size_t aaOffsetIdx = 0; + size_t aaOffsetIdx2 = 0; + size_t totalOffsetIdx = 0; // Target K-mer buffer uint16_t * diffIdxBuffer = (uint16_t *) malloc(sizeof(uint16_t) * (BufferSize + 1)); // size = 32 Mb @@ -217,14 +1078,15 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI // Target variables size_t diffIdxPos = 0; - std::vector candidateTargetKmers; //vector for candidate target k-mer, some of which are selected after based on hamming distance + std::vector candidateTargetKmers; // vector for candidate target k-mer, some of which are selected after based on hamming distance std::vector candidateKmerInfos; + std::vector hammingDists; uint64_t currentTargetKmer; // Match buffer for each thread - int localBufferSize = 2'000'000; // 32 Mb + size_t localBufferSize = 2'000'000; // 32 Mb auto *matches = new Match[localBufferSize]; // 16 * 2'000'000 = 32 Mb - int matchCnt = 0; + size_t matchCnt = 0; // Vectors for selected target k-mers std::vector selectedHammingSum; @@ -238,20 +1100,34 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI size_t selectedMatchCnt = 0; size_t posToWrite; - size_t idx; + + size_t localSkip = 0; + #pragma omp for schedule(dynamic, 1) for (size_t i = 0; i < querySplits.size(); i++) { if (hasOverflow || splitCheckList[i]) { continue; } - + aaOffsetIdx = offsets[i]; + aaOffsetIdx2 = offsets[i]; + totalOffsetIdx = aaOffsetIdx; currentTargetKmer = querySplits[i].diffIdxSplit.ADkmer; diffIdxBufferIdx = querySplits[i].diffIdxSplit.diffIdxOffset; kmerInfoBufferIdx = querySplits[i].diffIdxSplit.infoIdxOffset - (querySplits[i].diffIdxSplit.ADkmer != 0); diffIdxPos = querySplits[i].diffIdxSplit.diffIdxOffset; + fseek(aaFp, 8 * (long) (aaOffsetIdx), SEEK_SET); + loadBuffer(aaFp, aaBuffer, aaOffsetIdx, BufferSize); + + fseek(kmerFp, 8 * (long) (aaOffsetIdx2), SEEK_SET); + loadBuffer(kmerFp, nextKmers, BufferSize); + fseek(cntFp, 4 * (long) (aaOffsetIdx2), SEEK_SET); + loadBuffer(cntFp, cntBuffer, BufferSize); + fseek(kmerCntFp, 4 * (long) (aaOffsetIdx2), SEEK_SET); + loadBuffer(kmerCntFp, kmerCntBuffer, aaOffsetIdx2, BufferSize); + fseek(kmerInfoFp, 4 * (long)(kmerInfoBufferIdx), SEEK_SET); loadBuffer(kmerInfoFp, kmerInfoBuffer, kmerInfoBufferIdx, BufferSize); fseek(diffIdxFp, 2 * (long) (diffIdxBufferIdx), SEEK_SET); @@ -268,39 +1144,29 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI size_t lastMovedQueryIdx = 0; for (size_t j = querySplits[i].start; j < querySplits[i].end + 1; j++) { - querySplits[i].start++; - // Reuse the comparison data if queries are exactly identical if (currentQuery == queryKmerList[j].ADkmer && (currentQueryInfo.frame/3 == queryKmerList[j].info.frame/3)) { // If local buffer is full, copy them to the shared buffer. - if (matchCnt + selectedMatchCnt > localBufferSize) { + if (unlikely(matchCnt + selectedMatchCnt > localBufferSize)) { // Check if the shared buffer is full. posToWrite = matchBuffer->reserveMemory(matchCnt); - if (posToWrite + matchCnt >= matchBuffer->bufferSize) { + if (unlikely(posToWrite + matchCnt >= matchBuffer->bufferSize)) { hasOverflow = true; - querySplits[i].start = lastMovedQueryIdx + 1; __sync_fetch_and_sub(& matchBuffer->startIndexOfReserve, matchCnt); break; - } else { // not full -> copy matches to the shared buffer - moveMatches(matchBuffer->buffer + posToWrite, matches, matchCnt); - lastMovedQueryIdx = j; - } + } + moveMatches(matchBuffer->buffer + posToWrite, matches, matchCnt); + lastMovedQueryIdx = j; } - for (int k = 0; k < selectedMatchCnt; k++) { + for (size_t k = 0; k < selectedMatchCnt; k++) { idx = selectedMatches[k]; - // Check if candidateKmerInfos[idx].sequenceID is valid - // if (taxId2genusId.find(candidateKmerInfos[idx].sequenceID) == taxId2genusId.end() || - // taxId2speciesId.find(candidateKmerInfos[idx].sequenceID) == taxId2speciesId.end()) { - // cout << "Error: " << candidateKmerInfos[idx].sequenceID << " is not found in the taxonomy database." << endl; - // } matches[matchCnt] = {queryKmerList[j].info, candidateKmerInfos[idx].sequenceID, taxId2speciesId[candidateKmerInfos[idx].sequenceID], selectedDnaEncodings[k], selectedHammings[k], - selectedHammingSum[k], - (bool) candidateKmerInfos[idx].redundancy}; + selectedHammingSum[k]}; matchCnt++; } continue; @@ -309,32 +1175,29 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI // Reuse the candidate target k-mers to compare in DNA level if queries are the same at amino acid level but not at DNA level if (currentQueryAA == AMINO_ACID_PART(queryKmerList[j].ADkmer)) { - compareDna(queryKmerList[j].ADkmer, candidateTargetKmers, selectedMatches, + compareDna(queryKmerList[j].ADkmer, candidateTargetKmers, hammingDists, selectedMatches, selectedHammingSum, selectedHammings, selectedDnaEncodings, selectedMatchCnt, queryKmerList[j].info.frame); // If local buffer is full, copy them to the shared buffer. - if (matchCnt + selectedMatchCnt > localBufferSize) { + if (unlikely(matchCnt + selectedMatchCnt > localBufferSize)) { // Check if the shared buffer is full. posToWrite = matchBuffer->reserveMemory(matchCnt); - if (posToWrite + matchCnt >= matchBuffer->bufferSize) { + if (unlikely(posToWrite + matchCnt >= matchBuffer->bufferSize)) { hasOverflow = true; - querySplits[i].start = lastMovedQueryIdx + 1; __sync_fetch_and_sub(& matchBuffer->startIndexOfReserve, matchCnt); break; - } else { // not full -> copy matches to the shared buffer - moveMatches(matchBuffer->buffer + posToWrite, matches, matchCnt); - lastMovedQueryIdx = j; - } + } + moveMatches(matchBuffer->buffer + posToWrite, matches, matchCnt); + lastMovedQueryIdx = j; } - for (int k = 0; k < selectedMatchCnt; k++) { + for (size_t k = 0; k < selectedMatchCnt; k++) { idx = selectedMatches[k]; matches[matchCnt] = {queryKmerList[j].info, candidateKmerInfos[idx].sequenceID, taxId2speciesId[candidateKmerInfos[idx].sequenceID], selectedDnaEncodings[k], selectedHammings[k], - selectedHammingSum[k], - (bool) candidateKmerInfos[idx].redundancy}; + selectedHammingSum[k]}; matchCnt++; } currentQuery = queryKmerList[j].ADkmer; @@ -353,17 +1216,50 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI // Skip target k-mers that are not matched in amino acid level while (diffIdxPos != numOfDiffIdx && (currentQueryAA > AMINO_ACID_PART(currentTargetKmer))) { + // seqIterator->printKmerInDNAsequence(currentTargetKmer); cout << " " << diffIdxPos << " " << diffIdxBufferIdx << " "; + // seqIterator->printAAKmer(AMINO_ACID_PART(aaBuffer[aaOffsetIdx]), 24); cout << "\n"; + if (AMINO_ACID_PART(currentTargetKmer) == aaBuffer[aaOffsetIdx]) { + size_t temp = aaOffsetIdx2; + diffIdxBufferIdx += getElement(BufferSize, cntFp, cntBuffer, aaOffsetIdx2); + diffIdxPos += getElement(BufferSize, cntFp, cntBuffer, aaOffsetIdx2); + localSkip += getElement(BufferSize, cntFp, kmerCntBuffer, aaOffsetIdx2); + aaOffsetIdx2 = temp; + kmerInfoBufferIdx += getElement(BufferSize, kmerCntFp, kmerCntBuffer, aaOffsetIdx2); + aaOffsetIdx2 = temp; + currentTargetKmer = getElement(BufferSize, kmerFp, nextKmers, aaOffsetIdx2); + aaOffsetIdx++; + aaOffsetIdx2++; + totalOffsetIdx ++; + if (unlikely(aaOffsetIdx == BufferSize)) { + loadBuffer(aaFp, aaBuffer, aaOffsetIdx, BufferSize); + } + if (unlikely(BufferSize < diffIdxBufferIdx + 7)){ + loadBuffer(diffIdxFp, diffIdxBuffer, diffIdxBufferIdx, BufferSize, ((int)(BufferSize - diffIdxBufferIdx)) * -1 ); + } + continue; + } else { + while (totalOffsetIdx < aaOffsetCnt && AMINO_ACID_PART(currentTargetKmer) >= aaBuffer[aaOffsetIdx]) { + aaOffsetIdx++; + aaOffsetIdx2++; + totalOffsetIdx++; + if (unlikely(aaOffsetIdx == BufferSize)) { + loadBuffer(aaFp, aaBuffer, aaOffsetIdx, BufferSize); + } + } + } if (unlikely(BufferSize < diffIdxBufferIdx + 7)){ loadBuffer(diffIdxFp, diffIdxBuffer, diffIdxBufferIdx, BufferSize, ((int)(BufferSize - diffIdxBufferIdx)) * -1 ); } currentTargetKmer = getNextTargetKmer(currentTargetKmer, diffIdxBuffer, diffIdxBufferIdx, diffIdxPos); + kmerInfoBufferIdx ++; } - if (currentQueryAA != AMINO_ACID_PART(currentTargetKmer)) // Move to next query k-mer if there isn't any match. + if (currentQueryAA != AMINO_ACID_PART(currentTargetKmer)) { continue; - + } + // Load target k-mers that are matched in amino acid level while (diffIdxPos != numOfDiffIdx && currentQueryAA == AMINO_ACID_PART(currentTargetKmer)) { @@ -405,54 +1301,45 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI } // Compare the current query and the loaded target k-mers and select - compareDna(currentQuery, candidateTargetKmers, selectedMatches, selectedHammingSum, + compareDna(currentQuery, candidateTargetKmers, hammingDists, selectedMatches, selectedHammingSum, selectedHammings, selectedDnaEncodings, selectedMatchCnt, queryKmerList[j].info.frame); // If local buffer is full, copy them to the shared buffer. - if (matchCnt + selectedMatchCnt > localBufferSize) { + if (unlikely(matchCnt + selectedMatchCnt > localBufferSize)) { // Check if the shared buffer is full. posToWrite = matchBuffer->reserveMemory(matchCnt); - if (posToWrite + matchCnt >= matchBuffer->bufferSize) { // full -> write matches to file first + if (unlikely(posToWrite + matchCnt >= matchBuffer->bufferSize)) { // full -> write matches to file first hasOverflow = true; - querySplits[i].start = lastMovedQueryIdx + 1; __sync_fetch_and_sub(&matchBuffer->startIndexOfReserve, matchCnt); break; - } else { // not full -> copy matches to the shared buffer - moveMatches(matchBuffer->buffer + posToWrite, matches, matchCnt); - lastMovedQueryIdx = j; - } + } + moveMatches(matchBuffer->buffer + posToWrite, matches, matchCnt); + lastMovedQueryIdx = j; } - for (int k = 0; k < selectedMatchCnt; k++) { + for (size_t k = 0; k < selectedMatchCnt; k++) { idx = selectedMatches[k]; - // Check if candidateKmerInfos[idx].sequenceID is valid - // if (taxId2genusId.find(candidateKmerInfos[idx].sequenceID) == taxId2genusId.end() || - // taxId2speciesId.find(candidateKmerInfos[idx].sequenceID) == taxId2speciesId.end()) { - // cout << "Error: " << candidateKmerInfos[idx].sequenceID << " is not found in the taxonomy database." << endl; - // } matches[matchCnt] = {queryKmerList[j].info, candidateKmerInfos[idx].sequenceID, taxId2speciesId[candidateKmerInfos[idx].sequenceID], selectedDnaEncodings[k], selectedHammings[k], - selectedHammingSum[k], - (bool) candidateKmerInfos[idx].redundancy}; + selectedHammingSum[k]}; matchCnt++; } } // End of one split // Move matches in the local buffer to the shared buffer posToWrite = matchBuffer->reserveMemory(matchCnt); - if (posToWrite + matchCnt >= matchBuffer->bufferSize) { + if (unlikely(posToWrite + matchCnt >= matchBuffer->bufferSize)) { hasOverflow = true; - querySplits[i].start = lastMovedQueryIdx + 1; __sync_fetch_and_sub(& matchBuffer->startIndexOfReserve, matchCnt); } else { moveMatches(matchBuffer->buffer + posToWrite, matches, matchCnt); } // Check whether current split is completed or not - if (querySplits[i].start - 1 == querySplits[i].end) { + if (!hasOverflow) { splitCheckList[i] = true; __sync_fetch_and_add(&completedSplitCnt, 1); } @@ -460,97 +1347,27 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI delete[] matches; fclose(diffIdxFp); fclose(kmerInfoFp); + fclose(aaFp); + fclose(cntFp); + fclose(kmerCntFp); + fclose(kmerFp); free(diffIdxBuffer); free(kmerInfoBuffer); + free(aaBuffer); + free(nextKmers); + free(cntBuffer); + free(kmerCntBuffer); + __sync_fetch_and_add(&totalSkip, localSkip); } // End of omp parallel if (hasOverflow) { - // std::cout << "overflow!!!" << std::endl; return false; } } // end of while(completeSplitCnt < threadNum) - + cout << "Total skipped diffIdx: " << totalSkip << endl; std::cout << "Time spent for the comparison: " << double(time(nullptr) - beforeSearch) << std::endl; free(splitCheckList); - + cout << "Match count: " << matchBuffer->startIndexOfReserve << endl; totalMatchCnt += matchBuffer->startIndexOfReserve; return true; -} - -void KmerMatcher::sortMatches(Buffer * matchBuffer) { - time_t beforeSortMatches = time(nullptr); - std::cout << "Sorting matches ..." << std::endl; - SORT_PARALLEL(matchBuffer->buffer, - matchBuffer->buffer + matchBuffer->startIndexOfReserve, - compareMatches); - std::cout << "Time spent for sorting matches: " << double(time(nullptr) - beforeSortMatches) << std::endl; -} - -void KmerMatcher::moveMatches(Match *dest, Match *src, int &matchNum) { - memcpy(dest, src, sizeof(Match) * matchNum); - matchNum = 0; -} - -// It compares query k-mers to target k-mers. -// If a query has matches, the matches with the smallest hamming distance will be selected -void KmerMatcher::compareDna(uint64_t query, - std::vector &targetKmersToCompare, - std::vector &selectedMatches, - std::vector &selectedHammingSum, - std::vector &selectedHammings, - std::vector &selectedDnaEncodings, - size_t & selectedMatchIdx, - uint8_t frame) { - size_t size = targetKmersToCompare.size(); - auto *hammingSums = new uint8_t[size + 1]; - uint8_t currentHammingSum; - uint8_t minHammingSum = UINT8_MAX; - - // Calculate hamming distance - for (size_t i = 0; i < size; i++) { - currentHammingSum = getHammingDistanceSum(query, targetKmersToCompare[i]); - if (currentHammingSum < minHammingSum) { - minHammingSum = currentHammingSum; - } - hammingSums[i] = currentHammingSum; - } - - // Select target k-mers that passed hamming criteria - selectedMatchIdx = 0; - uint8_t maxHamming = min(minHammingSum * 2, 7); - for (size_t h = 0; h < size; h++) { - if (hammingSums[h] <= maxHamming) { - selectedMatches[selectedMatchIdx] = h; - selectedHammingSum[selectedMatchIdx] = hammingSums[h]; - if (frame < 3) { // Frame of query k-mer - selectedHammings[selectedMatchIdx] = getHammings(query, targetKmersToCompare[h]); - } else { - selectedHammings[selectedMatchIdx] = getHammings_reverse(query, targetKmersToCompare[h]); - } - // Store right 24 bits of the target k-mer in selectedDnaEncodings - selectedDnaEncodings[selectedMatchIdx] = GET_24_BITS_UINT(targetKmersToCompare[h]); - selectedMatchIdx++; - } - } - delete[] hammingSums; -} - - -bool KmerMatcher::compareMatches(const Match& a, const Match& b) { - if (a.qInfo.sequenceID != b.qInfo.sequenceID) - return a.qInfo.sequenceID < b.qInfo.sequenceID; - - if (a.speciesId != b.speciesId) - return a.speciesId < b.speciesId; - - if (a.qInfo.frame != b.qInfo.frame) - return a.qInfo.frame < b.qInfo.frame; - - if (a.qInfo.pos != b.qInfo.pos) - return a.qInfo.pos < b.qInfo.pos; - - if (a.hamming != b.hamming) - return a.hamming < b.hamming; - - return a.dnaEncoding < b.dnaEncoding; } \ No newline at end of file diff --git a/src/commons/KmerMatcher.h b/src/commons/KmerMatcher.h index 282a6053..a31b16a6 100644 --- a/src/commons/KmerMatcher.h +++ b/src/commons/KmerMatcher.h @@ -11,6 +11,7 @@ #include "unordered_map" #include #include +#include #define BufferSize 16'777'216 // 16 * 1024 * 1024 // 16 M @@ -27,6 +28,7 @@ using namespace std; class KmerMatcher { protected: + const LocalParameters ∥ NcbiTaxonomy *taxonomy; size_t threads; std::string dbDir; @@ -62,31 +64,66 @@ class KmerMatcher { inline size_t AminoAcidPart(size_t kmer) const { return (kmer)&MARKER; } - template - static void loadBuffer(FILE *fp, T *buffer, size_t &bufferIdx, size_t size) { - fread(buffer, sizeof(T), size, fp); + +template +static size_t loadBuffer(FILE *fp, T *buffer, size_t size) { + return fread(buffer, sizeof(T), size, fp); +} + +template +static size_t loadBuffer(FILE *fp, T *buffer, size_t &bufferIdx, size_t size) { + bufferIdx = 0; + return fread(buffer, sizeof(T), size, fp); +} + +template +static size_t loadBuffer(FILE *fp, T *buffer, size_t &bufferIdx, size_t size, + int cnt) { + bufferIdx = 0; + fseek(fp, cnt * sizeof(T), SEEK_CUR); + return fread(buffer, sizeof(T), size, fp); +} + +template +static void loadBuffer2(int fd, T *buffer, size_t &bufferIdx, size_t size, off_t offset) { + ssize_t bytesRead = pread(fd, buffer, size * sizeof(T), offset); + if (bytesRead == -1) { + cerr << "Error reading file" << std::endl; + } bufferIdx = 0; - } +} - template - static void loadBuffer(FILE *fp, T *buffer, size_t &bufferIdx, size_t size, - int cnt) { - fseek(fp, cnt * sizeof(T), SEEK_CUR); - fread(buffer, sizeof(T), size, fp); +template +static void loadBuffer2(int fd, T *buffer, size_t &bufferIdx, size_t size, off_t offset, int cnt) { + off_t newOffset = offset + cnt * sizeof(T); + ssize_t bytesRead = pread(fd, buffer, size * sizeof(T), newOffset); + if (bytesRead == -1) { + cerr << "Error reading file" << std::endl; + } bufferIdx = 0; - } +} - static uint64_t getNextTargetKmer(uint64_t lookingTarget, - const uint16_t *diffIdxBuffer, - size_t &diffBufferIdx, size_t &totalPos); + + // static TargetKmerInfo getKmerInfo(size_t bufferSize, FILE *kmerInfoFp, + // TargetKmerInfo *infoBuffer, + // size_t &infoBufferIdx); - static TargetKmerInfo getKmerInfo(size_t bufferSize, FILE *kmerInfoFp, - TargetKmerInfo *infoBuffer, - size_t &infoBufferIdx); + void moveMatches(Match *dest, Match *src, size_t & matchNum); - void moveMatches(Match *dest, Match *src, int &matchNum); + void compareDna(uint64_t query, + std::vector &targetKmersToCompare, + std::vector & hammingDists, + std::vector &selectedMatches, + std::vector &selectedHammingSum, + std::vector &rightEndHammings, + std::vector &selectedDnaEncodings, + size_t & selectedMatchIdx, + uint8_t frame); - void compareDna(uint64_t query, std::vector &targetKmersToCompare, + void compareDna2(uint64_t query, + const uint64_t * targetKmersToCompare, + size_t candidateCnt, + std::vector & hammingDists, std::vector &selectedMatches, std::vector &selectedHammingSum, std::vector &rightEndHammings, @@ -104,6 +141,30 @@ class KmerMatcher { void loadTaxIdList(const LocalParameters & par); + template + inline T getKmerInfo(size_t bufferSize, + FILE *kmerInfoFp, + T *infoBuffer, + size_t &infoBufferIdx) { + if (unlikely(infoBufferIdx >= bufferSize)) { + loadBuffer(kmerInfoFp, infoBuffer, infoBufferIdx, bufferSize, + static_cast(infoBufferIdx - bufferSize)); + } + return infoBuffer[infoBufferIdx]; + } + + template + inline T getElement(size_t bufferSize, + FILE *kmerInfoFp, + T *infoBuffer, + size_t &infoBufferIdx) { + if (unlikely(infoBufferIdx >= bufferSize)) { + loadBuffer(kmerInfoFp, infoBuffer, infoBufferIdx, bufferSize, + static_cast(infoBufferIdx - bufferSize)); + } + return infoBuffer[infoBufferIdx]; + } + public: KmerMatcher(const LocalParameters &par, NcbiTaxonomy *taxonomy); @@ -112,9 +173,23 @@ class KmerMatcher { bool matchKmers(QueryKmerBuffer *queryKmerBuffer, Buffer *matchBuffer, const string &db = string()); + + // bool matchKmers2(QueryKmerBuffer *queryKmerBuffer, + // Buffer *matchBuffer, + // const string &db = string()); + + bool matchKmers_skipDecoding(QueryKmerBuffer *queryKmerBuffer, + Buffer *matchBuffer, + const string &db = string()); + void sortMatches(Buffer *matchBuffer); + static uint64_t getNextTargetKmer(uint64_t lookingTarget, + const uint16_t *diffIdxBuffer, + size_t &diffBufferIdx, size_t &totalPos); + + // Getters size_t getTotalMatchCnt() const { return totalMatchCnt; } }; @@ -123,33 +198,19 @@ inline uint64_t KmerMatcher::getNextTargetKmer(uint64_t lookingTarget, const uint16_t *diffIdxBuffer, size_t &diffBufferIdx, size_t &totalPos) { - uint16_t fragment; - uint16_t check = 32768; // 2^15 uint64_t diffIn64bit = 0; - fragment = diffIdxBuffer[diffBufferIdx++]; + uint16_t fragment = diffIdxBuffer[diffBufferIdx++]; totalPos++; - while (!(fragment & check)) { // 27 % + while (!(fragment & 0x8000)) { diffIn64bit |= fragment; diffIn64bit <<= 15u; fragment = diffIdxBuffer[diffBufferIdx++]; totalPos++; } - fragment &= ~check; // not; 8.47 % - diffIn64bit |= fragment; // or : 23.6% + diffIn64bit |= (fragment & 0x7FFF); return diffIn64bit + lookingTarget; } -inline TargetKmerInfo KmerMatcher::getKmerInfo(size_t bufferSize, - FILE *kmerInfoFp, - TargetKmerInfo *infoBuffer, - size_t &infoBufferIdx) { - if (unlikely(infoBufferIdx >= bufferSize)) { - loadBuffer(kmerInfoFp, infoBuffer, infoBufferIdx, bufferSize, - (int)(infoBufferIdx - bufferSize)); - } - return infoBuffer[infoBufferIdx]; -} - inline uint8_t KmerMatcher::getHammingDistanceSum(uint64_t kmer1, uint64_t kmer2) { // 12345678 uint8_t hammingSum = 0; @@ -157,14 +218,10 @@ inline uint8_t KmerMatcher::getHammingDistanceSum(uint64_t kmer1, hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 3U)][GET_3_BITS(kmer2 >> 3U)]; hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 6U)][GET_3_BITS(kmer2 >> 6U)]; hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 9U)][GET_3_BITS(kmer2 >> 9U)]; - hammingSum += - hammingLookup[GET_3_BITS(kmer1 >> 12U)][GET_3_BITS(kmer2 >> 12U)]; - hammingSum += - hammingLookup[GET_3_BITS(kmer1 >> 15U)][GET_3_BITS(kmer2 >> 15U)]; - hammingSum += - hammingLookup[GET_3_BITS(kmer1 >> 18U)][GET_3_BITS(kmer2 >> 18U)]; - hammingSum += - hammingLookup[GET_3_BITS(kmer1 >> 21U)][GET_3_BITS(kmer2 >> 21U)]; + hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 12U)][GET_3_BITS(kmer2 >> 12U)]; + hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 15U)][GET_3_BITS(kmer2 >> 15U)]; + hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 18U)][GET_3_BITS(kmer2 >> 18U)]; + hammingSum += hammingLookup[GET_3_BITS(kmer1 >> 21U)][GET_3_BITS(kmer2 >> 21U)]; return hammingSum; } diff --git a/src/commons/Match.h b/src/commons/Match.h index 471e32ff..fbd2aac6 100644 --- a/src/commons/Match.h +++ b/src/commons/Match.h @@ -13,18 +13,16 @@ struct Match { // 24 byte TaxID speciesId, uint32_t dnaEncoding, uint16_t eachHamming, - uint8_t hamming, - bool redundancy): + uint8_t hamming): qInfo(qInfo), targetId(targetId), speciesId(speciesId), dnaEncoding(dnaEncoding), - rightEndHamming(eachHamming), hamming(hamming), redundancy(redundancy) { } + rightEndHamming(eachHamming), hamming(hamming) { } - QueryKmerInfo qInfo; // 8 - TaxID targetId; // 4 taxonomy id infact - TaxID speciesId; // 4 - uint32_t dnaEncoding; // 4 - uint16_t rightEndHamming; // 2 - uint8_t hamming; // 1 - bool redundancy; // 1 + QueryKmerInfo qInfo; // 8 // Query K-mer information + TaxID targetId; // 4 // axonomy id infact + TaxID speciesId; // 4 // Used to group matches by species + uint32_t dnaEncoding; // 4 // Used to check if two matches are consecutive + uint16_t rightEndHamming; // 2 // Used to calculate score + uint8_t hamming; // 1 // Used to filter redundant matches void printMatch() const { std::cout << qInfo.sequenceID << " " << qInfo.pos << " " << qInfo.frame << " " diff --git a/src/commons/SeqIterator.cpp b/src/commons/SeqIterator.cpp index 1f42dbd0..58f610ff 100644 --- a/src/commons/SeqIterator.cpp +++ b/src/commons/SeqIterator.cpp @@ -1222,3 +1222,18 @@ void SeqIterator::printKmerInDNAsequence(uint64_t kmer) { } } } + + +void SeqIterator::printAAKmer(uint64_t kmer, int shifts) { + kmer >>= shifts; + vector aa8mer(8); + for (int i = 0; i < 8; i++) { + int quotient = kmer / powers[7 - i]; + kmer = kmer - (quotient * powers[7 - i]); + aa8mer[7 - i] = quotient; + } + string aminoacid = "ARNDCQEGHILKMFPSTWYVX"; + for (int i = 0; i < 8; i++) { + cout << aminoacid[aa8mer[i]]; + } +} \ No newline at end of file diff --git a/src/commons/SeqIterator.h b/src/commons/SeqIterator.h index c53093e0..5ec388f6 100644 --- a/src/commons/SeqIterator.h +++ b/src/commons/SeqIterator.h @@ -86,6 +86,8 @@ class SeqIterator { void printKmerInDNAsequence(uint64_t kmer); + void printAAKmer(uint64_t kmer, int shits = 28); + explicit SeqIterator(const LocalParameters &par); ~SeqIterator(); }; diff --git a/src/commons/Taxonomer.cpp b/src/commons/Taxonomer.cpp index 26c6f319..3476b7f5 100644 --- a/src/commons/Taxonomer.cpp +++ b/src/commons/Taxonomer.cpp @@ -26,8 +26,8 @@ Taxonomer::Taxonomer(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxon minCoveredPos = par.minCoveredPos; accessionLevel = par.accessionLevel; minSSMatch = par.minSSMatch; - minConsCnt = par.minConsCnt; - minConsCntEuk = par.minConsCntEuk; + minConsCnt = (size_t) par.minConsCnt; + minConsCntEuk = (size_t) par.minConsCntEuk; eukaryotaTaxId = par.eukaryotaTaxId; tieRatio = par.tieRatio; @@ -362,7 +362,7 @@ TaxonScore Taxonomer::getBestSpeciesMatches(std::pair & bestSpec } maxSpecies.clear(); - float coveredLength = 0.f; + // float coveredLength = 0.f; for (size_t i = 0; i < speciesList.size(); i++) { if (speciesScores[i] >= bestSpScore * tieRatio) { maxSpecies.push_back(speciesList[i]); @@ -481,17 +481,13 @@ void Taxonomer::getMatchPaths(const Match * matchList, size_t end, vector & filteredMatchPaths, TaxID speciesId) { - // cout << "getMatchPaths2" << endl; - // for (size_t i = start; i < end; i ++) { - // matchList[i].printMatch(); - // } size_t i = start; size_t currPos = matchList[start].qInfo.pos; uint64_t frame = matchList[start].qInfo.frame; - size_t MIN_DEPTH = minConsCnt - 1; + int MIN_DEPTH = (int) minConsCnt - 1; if (taxonomy->IsAncestor(eukaryotaTaxId, speciesId)) { - MIN_DEPTH = minConsCntEuk - 1; + MIN_DEPTH = (int) minConsCntEuk - 1; } connectedToNext.resize(end - start + 1); @@ -832,13 +828,13 @@ bool Taxonomer::isConsecutive(const Match * match1, const Match * match2) { return (match1->dnaEncoding >> 3) == (match2->dnaEncoding & 0x1FFFFF); } -bool Taxonomer::isConsecutive_diffFrame(const Match * match1, const Match * match2) { - // int hamming1 = match1->hamming - GET_2_BITS(match1->rightEndHamming); - // int hamming2 = match2->hamming - GET_2_BITS(match2->rightEndHamming >> 14); - // match1 87654321 -> 08765432 - // match2 98765432 -> 08765432 - return (match1->hamming - GET_2_BITS(match1->rightEndHamming)) == (match2->hamming - GET_2_BITS(match2->rightEndHamming >> 14)); -} +// bool Taxonomer::isConsecutive_diffFrame(const Match * match1, const Match * match2) { +// // int hamming1 = match1->hamming - GET_2_BITS(match1->rightEndHamming); +// // int hamming2 = match2->hamming - GET_2_BITS(match2->rightEndHamming >> 14); +// // match1 87654321 -> 08765432 +// // match2 98765432 -> 08765432 +// return (match1->hamming - GET_2_BITS(match1->rightEndHamming)) == (match2->hamming - GET_2_BITS(match2->rightEndHamming >> 14)); +// } void Taxonomer::ensureArraySize(size_t newSize) { diff --git a/src/commons/Taxonomer.h b/src/commons/Taxonomer.h index c36f7406..fda5d29c 100644 --- a/src/commons/Taxonomer.h +++ b/src/commons/Taxonomer.h @@ -79,8 +79,8 @@ class Taxonomer { int minCoveredPos; int accessionLevel; int minSSMatch; - int minConsCnt; - int minConsCntEuk; + size_t minConsCnt; + size_t minConsCntEuk; int eukaryotaTaxId; float tieRatio; @@ -180,7 +180,7 @@ class Taxonomer { static bool isConsecutive(const Match * match1, const Match * match2); - static bool isConsecutive_diffFrame(const Match * match1, const Match * match2); + // static bool isConsecutive_diffFrame(const Match * match1, const Match * match2); TaxonScore getBestSpeciesMatches(std::pair & bestSpeciesRange, const Match *matchList, size_t end, diff --git a/src/commons/common.h b/src/commons/common.h index 16f6ff09..c9c18a05 100644 --- a/src/commons/common.h +++ b/src/commons/common.h @@ -9,6 +9,7 @@ #define likely(x) __builtin_expect((x),1) #define unlikely(x) __builtin_expect((x),0) #define kmerLength 8 +#define AA(kmer) ((kmer) & ~16777215) struct KmerCnt { KmerCnt(size_t length, size_t kmerCnt, size_t totalCnt) : length(length), kmerCnt(kmerCnt), totalCnt(totalCnt) {} @@ -110,4 +111,17 @@ int loadDbParameters(LocalParameters & par); int searchAccession2TaxID(const std::string & name, const std::unordered_map & acc2taxid); +template +size_t loadBuffer(FILE *fp, T *buffer, size_t &bufferIdx, size_t number, int cnt) { + bufferIdx = 0; + fseek(fp, cnt * sizeof(T), SEEK_CUR); + return fread(buffer, sizeof(T), number, fp); +} + +template +size_t loadBuffer(FILE *fp, T *buffer, size_t &bufferIdx, size_t number) { + bufferIdx = 0; + return fread(buffer, sizeof(T), number, fp); +} + #endif //ADCLASSIFIER2_COMMON_H diff --git a/src/metabuli.cpp b/src/metabuli.cpp index d3cc3a1c..b523abce 100644 --- a/src/metabuli.cpp +++ b/src/metabuli.cpp @@ -42,6 +42,20 @@ std::vector commands = { {{"Directory where the DB will be generated", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::empty}, {"A list of FASTA files", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}, {"Mapping file (accession to tax ID)", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}, + {"makeAAoffset", makeAAoffset, &localPar.build, COMMAND_EXPERT, + "Make AA offset file", + nullptr, + "Jaebeom Kim ", + "", + CITATION_SPACEPHARER, + {{"Differential index", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::empty}}}, + {"expand_diffidx", expand_diffidx, &localPar.build, COMMAND_EXPERT, + "expand differential index", + nullptr, + "Jaebeom Kim ", + "", + CITATION_SPACEPHARER, + {{"Differential index", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::empty}}}, {"database-report", databaseReport, &localPar.databaseReport, COMMAND_DB, "It generates a report of taxa in a database.", nullptr, diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index 8d453154..2ef0e37e 100644 --- a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -8,4 +8,6 @@ set(util_source_files util/database-report.cpp util/mapping2taxon.cpp util/gradeByCladeSize.cpp + util/expand_diffidx.cpp + util/makeAAoffset.cpp PARENT_SCOPE) \ No newline at end of file diff --git a/src/util/expand_diffidx.cpp b/src/util/expand_diffidx.cpp new file mode 100644 index 00000000..894c7271 --- /dev/null +++ b/src/util/expand_diffidx.cpp @@ -0,0 +1,124 @@ +#include "LocalParameters.h" +#include +#include +#include +#include +#include "KmerMatcher.h" +#include "common.h" +#include "SeqIterator.h" + +using namespace std; + +int expand_diffidx(int argc, const char **argv, const Command &command){ + LocalParameters &par = LocalParameters::getLocalInstance(); + // setExpandDiffIdxDefault(par); + par.parseParameters(argc, argv, command, false, Parameters::PARSE_ALLOW_EMPTY, 0); + string diffIdxFileName = par.filenames[0]; + string expandedDiffIdxFileName = diffIdxFileName + ".expanded"; + size_t bufferSize = 16'777'216; // 1000'000'000; + + // Diff idx + FILE * diffIdxFp = fopen(diffIdxFileName.c_str(), "rb"); + uint16_t * diffIdxBuffer = (uint16_t *) malloc(sizeof(uint16_t) * (bufferSize + 1)); // size = 32 Mb + size_t diffIdxBufferIdx = 0; + size_t diffIdxPos = 0; + size_t numOfDiffIdx = FileUtil::getFileSize(diffIdxFileName) / sizeof(uint16_t); + // size_t diffIdxCnt = 0; + + + // Expanded idx + FILE * expandedDiffIdxFp = fopen(expandedDiffIdxFileName.c_str(), "wb"); + uint64_t * expandedIdxBuffer = (uint64_t *) malloc(sizeof(uint64_t) * (bufferSize + 1)); // size = 80 Mb + size_t expandedIdxBufferIdx = 0; + // fseek(diffIdxFp, 2 * (long) (diffIdxBufferIdx), SEEK_SET); + // loadBuffer(diffIdxFp, diffIdxBuffer, diffIdxBufferIdx, bufferSize); + + bool complete = false; + uint64_t currentTargetKmer = 0; + uint64_t currentTargetKmerAA = 0; + uint64_t currentAACnt = 0; + uint64_t AAkmerCnt = 0; + uint64_t currentAAOffset = 0; + size_t max = 0; + size_t max2 = 0; + + uint64_t MARKER = 16777215; + MARKER = ~ MARKER; + + SeqIterator * seqIterator = new SeqIterator(par); + size_t loadedItems; + bool isFirst = true; + size_t total = 0; + size_t totalAACnt = 0; + + // List + vector aminoAcids; + vector diffIdxCnts; + + while (!complete) { + // Load diff idx buffer + // fseek(diffIdxFp, 2 * (long) (diffIdxBufferIdx), SEEK_SET); + if (isFirst) { + loadedItems = loadBuffer(diffIdxFp, diffIdxBuffer, diffIdxBufferIdx, bufferSize); + isFirst = false; + } else { + loadedItems = loadBuffer(diffIdxFp, diffIdxBuffer, diffIdxBufferIdx, bufferSize, ((int)(bufferSize - diffIdxBufferIdx)) * -1 ); + } + // size_t loadedBytes = isFirst + // ? loadBuffer(diffIdxFp, diffIdxBuffer, diffIdxBufferIdx, bufferSize) + // : loadBuffer(diffIdxFp, diffIdxBuffer, diffIdxBufferIdx, bufferSize, ((int)(bufferSize - diffIdxBufferIdx)) * -1 ); + // if (isFirst) { + // isFirst = false; + // } + // loadBuffer(diffIdxFp, + // diffIdxBuffer, + // diffIdxBufferIdx, + // bufferSize); + + // loadBuffer(diffIdxFp, diffIdxBuffer, diffIdxBufferIdx, BufferSize, ((int)(BufferSize - diffIdxBufferIdx)) * -1 ); + + if (loadedItems != bufferSize) { + complete = true; + } + // Expand diff idx in buffer + // size_t before = diffIdxBufferIdx; + while (loadedItems >= diffIdxBufferIdx + 7) { + // ((kmer) & MARKER) + size_t before = diffIdxBufferIdx; + currentTargetKmer = KmerMatcher::getNextTargetKmer(currentTargetKmer, + diffIdxBuffer, + diffIdxBufferIdx, + diffIdxPos); + size_t after = diffIdxBufferIdx; + if (currentTargetKmerAA != (currentTargetKmer & MARKER)) { + // New amino acid -> process previous one + size_t diffIdxCntOfCurrentAA = before - currentAAOffset; + if (diffIdxCntOfCurrentAA) { + aminoAcids.push_back(currentTargetKmerAA); + diffIdxCnts.push_back(diffIdxCntOfCurrentAA); + } + } + // diffIdxCnt += 1; + // if (expandedIdxBufferIdx == bufferSize) { + // fwrite(expandedIdxBuffer, sizeof(uint64_t), bufferSize, expandedDiffIdxFp); + // expandedIdxBufferIdx = 0; + // } + // expandedIdxBuffer[expandedIdxBufferIdx++] = currentTargetKmer; + } + if (diffIdxPos == numOfDiffIdx) { + break; + } + } + cout << totalAACnt << endl; + cout << "Total diffIdx cnt > X: " << total << "\n"; + cout << AAkmerCnt << " " << max << " " << max2 << "\n"; + fwrite(expandedIdxBuffer, sizeof(uint64_t), expandedIdxBufferIdx, expandedDiffIdxFp); + + + + fclose(diffIdxFp); + fclose(expandedDiffIdxFp); + free(diffIdxBuffer); + free(expandedIdxBuffer); + return 0; +} \ No newline at end of file diff --git a/src/util/filter_by_genus.cpp b/src/util/filter_by_genus.cpp index b21b3a4b..25116642 100644 --- a/src/util/filter_by_genus.cpp +++ b/src/util/filter_by_genus.cpp @@ -73,4 +73,6 @@ int filterByGenus(int argc, const char **argv, const Command &command) { // Print line cout << line << endl; } + + return 0; } \ No newline at end of file diff --git a/src/util/makeAAoffset.cpp b/src/util/makeAAoffset.cpp new file mode 100644 index 00000000..648728a9 --- /dev/null +++ b/src/util/makeAAoffset.cpp @@ -0,0 +1,130 @@ +#include "LocalParameters.h" +#include +#include +#include +#include +#include "KmerMatcher.h" +#include "common.h" +#include "SeqIterator.h" + +using namespace std; + +int makeAAoffset(int argc, const char **argv, const Command &command){ + LocalParameters &par = LocalParameters::getLocalInstance(); + // setExpandDiffIdxDefault(par); + par.parseParameters(argc, argv, command, false, Parameters::PARSE_ALLOW_EMPTY, 0); + string diffIdxFileName = par.filenames[0]; + string offsetFileName = diffIdxFileName + ".aa"; + string kmersFileName = diffIdxFileName + ".kmers"; + string cntFileName = diffIdxFileName + ".deltaCnt"; + string kmerCntFileName = diffIdxFileName + ".kmerCnt"; + size_t bufferSize = 16'777'216; // 1000'000'000; + + // Diff idx + FILE * diffIdxFp = fopen(diffIdxFileName.c_str(), "rb"); + uint16_t * diffIdxBuffer = (uint16_t *) malloc(sizeof(uint16_t) * (bufferSize + 1)); // size = 32 Mb + size_t diffIdxBufferIdx = 0; + size_t diffIdxPos = 0; + size_t numOfDiffIdx = FileUtil::getFileSize(diffIdxFileName) / sizeof(uint16_t); + // size_t diffIdxCnt = 0; + + + // Offset list files + FILE * offsetListFp = fopen(offsetFileName.c_str(), "wb"); + FILE * kmerListFp = fopen(kmersFileName.c_str(), "wb"); + FILE * cntListFp = fopen(cntFileName.c_str(), "wb"); + FILE * kmerCntFp = fopen(kmerCntFileName.c_str(), "wb"); + uint64_t * aminoAcids = (uint64_t *) malloc(sizeof(uint64_t) * (bufferSize + 1)); + uint64_t * kmers = (uint64_t *) malloc(sizeof(uint64_t) * (bufferSize + 1)); + uint32_t * diffIdxCnts = (uint32_t *) malloc(sizeof(uint32_t) * (bufferSize + 1)); + uint32_t * kmerCnts = (uint32_t *) malloc(sizeof(uint32_t) * (bufferSize + 1)); + size_t offsetIdx = 0; + + + uint64_t currentTargetKmer = 0; + uint64_t currentTargetKmerAA = 0; + uint64_t currentAACnt = 0; + uint64_t AAkmerCnt = 0; + uint64_t currentAAOffset = 0; + + uint64_t MARKER = 16777215; + MARKER = ~ MARKER; + + SeqIterator * seqIterator = new SeqIterator(par); + + size_t loadedItems; + bool isFirst = true; + bool complete = false; + while (!complete) { + // Load diff idx buffer + if (isFirst) { + loadedItems = loadBuffer(diffIdxFp, diffIdxBuffer, diffIdxBufferIdx, bufferSize); + isFirst = false; + } else { + loadedItems = loadBuffer(diffIdxFp, diffIdxBuffer, diffIdxBufferIdx, bufferSize, ((int)(bufferSize - diffIdxBufferIdx)) * -1 ); + } + + if (loadedItems != bufferSize) { + complete = true; + } + + while (loadedItems >= diffIdxBufferIdx + 7) { + size_t before = diffIdxPos; + currentTargetKmer = KmerMatcher::getNextTargetKmer(currentTargetKmer, + diffIdxBuffer, + diffIdxBufferIdx, + diffIdxPos); + // seqIterator->printKmerInDNAsequence(currentTargetKmer); + // cout << " " << diffIdxPos << " " << diffIdxBufferIdx << "\n"; + size_t after = diffIdxPos; + if (currentTargetKmerAA != (currentTargetKmer & MARKER)) { + // New amino acid -> process previous one + if (after - currentAAOffset >= 10) { + if (offsetIdx == bufferSize) { + fwrite(aminoAcids, sizeof(uint64_t), bufferSize, offsetListFp); + fwrite(kmers, sizeof(uint64_t), bufferSize, kmerListFp); + fwrite(diffIdxCnts, sizeof(uint32_t), bufferSize, cntListFp); + fwrite(kmerCnts, sizeof(uint32_t), bufferSize, kmerCntFp); + offsetIdx = 0; + } + kmerCnts[offsetIdx] = AAkmerCnt + 1; + aminoAcids[offsetIdx] = currentTargetKmerAA; + kmers[offsetIdx] = currentTargetKmer; + diffIdxCnts[offsetIdx] = after - currentAAOffset; + + // seqIterator->printAAKmer(currentTargetKmerAA, 24); + // cout << " "; + // seqIterator->printAAKmer(currentTargetKmer, 24); + // cout << " "; + // seqIterator->printKmerInDNAsequence(currentTargetKmer); + // cout << " " << AAkmerCnt + 1 << " " << after - currentAAOffset << "\n"; + // cout << currentTargetKmerAA << " " << currentTargetKmer << " " << AAkmerCnt + 1 << " " << after - currentAAOffset << endl; + offsetIdx++; + } + currentTargetKmerAA = currentTargetKmer & MARKER; + currentAAOffset = after; + AAkmerCnt = 0; + } else { + AAkmerCnt++; + } + } + if (diffIdxPos == numOfDiffIdx) { + break; + } + } + fwrite(aminoAcids, sizeof(uint64_t), offsetIdx, offsetListFp); + fwrite(kmers, sizeof(uint64_t), offsetIdx, kmerListFp); + fwrite(diffIdxCnts, sizeof(uint32_t), offsetIdx, cntListFp); + fwrite(kmerCnts, sizeof(uint32_t), offsetIdx, kmerCntFp); + + fclose(offsetListFp); + fclose(cntListFp); + fclose(diffIdxFp); + fclose(kmerListFp); + fclose(kmerCntFp); + free(diffIdxBuffer); + free(aminoAcids); + free(diffIdxCnts); + free(kmers); + return 0; +} \ No newline at end of file diff --git a/src/workflow/updateDB.cpp b/src/workflow/updateDB.cpp index f4895674..083b47d6 100644 --- a/src/workflow/updateDB.cpp +++ b/src/workflow/updateDB.cpp @@ -35,9 +35,9 @@ int updateDB(int argc, const char **argv, const Command &command){ const char * seqFileName = argv[0]; const char * taxIdFileName = argv[1]; - const char * outdatedFileName = argv[2]; + // const char * outdatedFileName = argv[2]; const char * outdatedTaxIdList = argv[3];///before _diffIdx or _info - const char * updatedFileName = argv[4]; + // const char * updatedFileName = argv[4]; ifstream seqFile; seqFile.open(seqFileName); @@ -143,4 +143,5 @@ int updateDB(int argc, const char **argv, const Command &command){ // // // return 0; + return 0; } \ No newline at end of file