Skip to content

Commit

Permalink
Merge pull request #85 from jaebeom-kim/windows
Browse files Browse the repository at this point in the history
DB search is now 30% faster on Windows (under Cygwin)
  • Loading branch information
jaebeom-kim authored Sep 10, 2024
2 parents 7d8f497 + 946c6a6 commit cc0493c
Show file tree
Hide file tree
Showing 19 changed files with 1,399 additions and 374 deletions.
2 changes: 2 additions & 0 deletions src/LocalCommandDeclarations.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,7 @@ extern int binning2report(int argc, const char **argv, const Command& command);
// Command entry points: each function declared below takes the argument
// count/vector plus a reference to its Command descriptor and returns an int.
// NOTE(review): the int is presumably an exit status — confirm against the definitions.
extern int filterByGenus(int argc, const char **argv, const Command& command);
extern int databaseReport(int argc, const char **argv, const Command& command);
extern int mapping2taxon(int argc, const char **argv, const Command& command);
extern int expand_diffidx(int argc, const char **argv, const Command& command);
extern int makeAAoffset(int argc, const char **argv, const Command& command);

#endif //ADCLASSIFIER2_LOCALCOMMANDDECLARATIONS_H
158 changes: 0 additions & 158 deletions src/commons/FileMerger.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,164 +20,6 @@ FileMerger::~FileMerger() {
delete taxonomy;
}

//void FileMerger::mergeTargetFiles(std::vector<char*> diffIdxFileNames, std::vector<char*> infoFileNames, vector<int> & taxIdListAtRank, vector<int> & taxIdList) {
// size_t writtenKmerCnt = 0;
//
// ///Files to write on & buffers to fill them
// FILE * mergedDiffFile = fopen(mergedDiffFileName, "wb");
// FILE * mergedInfoFile = fopen(mergedInfoFileName, "wb");
// FILE * diffIdxSplitFile = fopen(diffIdxSplitFileName, "wb");
// uint16_t * diffBuffer = (uint16_t *)malloc(sizeof(uint16_t) * kmerBufSize);
// size_t diffBufferIdx = 0;
// size_t totalBufferIdx = 0;
// TargetKmerInfo * infoBuffer = (TargetKmerInfo *)malloc(sizeof(TargetKmerInfo) * kmerBufSize);
// size_t infoBufferIdx = 0;
// size_t totalInfoIdx = 0;
//
// ///Prepare files to merge
// size_t numOfSplitFiles = diffIdxFileNames.size();
// size_t numOfincompletedFiles = numOfSplitFiles;
// size_t numOfKmerBeforeMerge = 0;
// uint64_t * lookingKmers = new uint64_t[numOfSplitFiles];
//// uint64_t lookingKmers[numOfSplitFiles];
//// TargetKmerInfo lookingInfos[numOfSplitFiles];
// auto * lookingInfos = new TargetKmerInfo[numOfSplitFiles];
// //size_t diffFileIdx[numOfSplitFiles];
// auto * diffFileIdx = new size_t[numOfSplitFiles];
// memset(diffFileIdx, 0, numOfSplitFiles * sizeof(size_t));
// auto * infoFileIdx = new size_t[numOfSplitFiles];
//// size_t infoFileIdx[numOfSplitFiles];
// memset(infoFileIdx, 0, numOfSplitFiles * sizeof(size_t));
// size_t maxIdxOfEachFiles[numOfSplitFiles];
// struct MmapedData<uint16_t> *diffFileList = new struct MmapedData<uint16_t>[numOfSplitFiles];
// struct MmapedData<TargetKmerInfo> *infoFileList = new struct MmapedData<TargetKmerInfo>[numOfSplitFiles];
// for (size_t file = 0; file < numOfSplitFiles; file++) {
// diffFileList[file] = mmapData<uint16_t>(diffIdxFileNames[file]);
// infoFileList[file] = mmapData<TargetKmerInfo>(infoFileNames[file]);
// maxIdxOfEachFiles[file] = diffFileList[file].fileSize / sizeof(uint16_t);
// numOfKmerBeforeMerge += infoFileList[file].fileSize / sizeof(TargetKmerInfo);
// }
//
// ///To make differential index splits
// uint64_t AAofTempSplitOffset = UINT64_MAX;
// size_t sizeOfSplit = numOfKmerBeforeMerge / (SplitNum - 1);
// size_t offsetList[SplitNum + 1];
// int offsetListIdx = 1;
// for(size_t os = 0; os < SplitNum; os++){
// offsetList[os] = os * sizeOfSplit;
// }
// offsetList[SplitNum] = UINT64_MAX;
//
// DiffIdxSplit splitList[SplitNum];
// memset(splitList, 0, sizeof(DiffIdxSplit) * SplitNum);
// int splitListIdx = 1;
//
// /// get the first k-mer to write
// for(size_t file = 0; file < numOfSplitFiles; file++){
// lookingKmers[file] = getNextKmer(0, diffFileList[file], diffFileIdx[file]);
// lookingInfos[file] = infoFileList[file].data[0];
// infoFileIdx[file] ++;
// }
//
// size_t idxOfMin = smallest(lookingKmers, lookingInfos, taxIdListAtRank, numOfSplitFiles);
// uint64_t lastWrittenKmer = 0;
// uint64_t entryKmer = lookingKmers[idxOfMin];
// TargetKmerInfo entryInfo = lookingInfos[idxOfMin];
//
// // write first k-mer
// getDiffIdx(lastWrittenKmer, entryKmer, mergedDiffFile, diffBuffer, diffBufferIdx, totalBufferIdx);
// lastWrittenKmer = entryKmer;
// writeInfo(&entryInfo, mergedInfoFile, infoBuffer, infoBufferIdx, totalInfoIdx);
// writtenKmerCnt++;
// int splitCheck = 0;
// int endFlag = 0;
//
// while(true){
// // update entry k-mer
// entryKmer = lookingKmers[idxOfMin];
// entryInfo = lookingInfos[idxOfMin];
//
// ///update looking k-mers
// lookingKmers[idxOfMin] = getNextKmer(entryKmer, diffFileList[idxOfMin], diffFileIdx[idxOfMin]);
// lookingInfos[idxOfMin] = infoFileList[idxOfMin].data[infoFileIdx[idxOfMin]];
// infoFileIdx[idxOfMin] ++;
// if( diffFileIdx[idxOfMin] > maxIdxOfEachFiles[idxOfMin] ){
// lookingKmers[idxOfMin] = UINT64_MAX;
// numOfincompletedFiles--;
// if(numOfincompletedFiles == 0) break;
// }
// idxOfMin = smallest(lookingKmers, lookingInfos, taxIdListAtRank, numOfSplitFiles);
//
// int hasSeenOtherStrains = 0;
// while(taxIdListAtRank[entryInfo.sequenceID] == taxIdListAtRank[lookingInfos[idxOfMin].sequenceID]){
// if(entryKmer != lookingKmers[idxOfMin]) break;
//
// hasSeenOtherStrains += (taxIdList[entryInfo.sequenceID] != taxIdList[lookingInfos[idxOfMin].sequenceID]);
//
// lookingKmers[idxOfMin] = getNextKmer(entryKmer, diffFileList[idxOfMin], diffFileIdx[idxOfMin]);
// lookingInfos[idxOfMin] = infoFileList[idxOfMin].data[infoFileIdx[idxOfMin]];
// infoFileIdx[idxOfMin] ++;
//
// if(diffFileIdx[idxOfMin] > maxIdxOfEachFiles[idxOfMin] ){
// lookingKmers[idxOfMin] = UINT64_MAX;
// numOfincompletedFiles--;
// if(numOfincompletedFiles == 0){
// endFlag = 1;
// break;
// }
// }
// idxOfMin = smallest(lookingKmers, lookingInfos, taxIdListAtRank, numOfSplitFiles);
// }
//
// entryInfo.redundancy = (hasSeenOtherStrains > 0 || entryInfo.redundancy);
// getDiffIdx(lastWrittenKmer, entryKmer, mergedDiffFile, diffBuffer, diffBufferIdx, totalBufferIdx);
// lastWrittenKmer = entryKmer;
// writeInfo(&entryInfo, mergedInfoFile, infoBuffer, infoBufferIdx, totalInfoIdx);
// writtenKmerCnt++;
//
// if(AminoAcid(lastWrittenKmer) != AAofTempSplitOffset && splitCheck == 1){
// splitList[splitListIdx++] = {lastWrittenKmer, totalBufferIdx, totalInfoIdx};
// splitCheck = 0;
// }
//
// if(writtenKmerCnt == offsetList[offsetListIdx]){
// AAofTempSplitOffset = AminoAcid(lastWrittenKmer);
// splitCheck = 1;
// offsetListIdx++;
// }
//
// if(endFlag == 1) break;
// }
//
// cre->flushInfoBuf(infoBuffer, mergedInfoFile, infoBufferIdx);
// cre->flushKmerBuf(diffBuffer, mergedDiffFile, diffBufferIdx);
// fwrite(splitList, sizeof(DiffIdxSplit), SplitNum, diffIdxSplitFile);
// for(int i = 0; i < SplitNum; i++){
// cout<<splitList[i].ADkmer<< " "<<splitList[i].diffIdxOffset<< " "<<splitList[i].infoIdxOffset<<endl;
// }
// free(diffBuffer);
// free(infoBuffer);
// fclose(mergedDiffFile);
// fclose(mergedInfoFile);
// fclose(diffIdxSplitFile);
//
// for(size_t file = 0; file < numOfSplitFiles; file++){
// munmap(diffFileList[file].data, diffFileList[file].fileSize + 1);
// munmap(infoFileList[file].data, infoFileList[file].fileSize + 1);
// }
// cout<<"Creating target DB is done"<<endl;
// cout<<"Total k-mer count : " << numOfKmerBeforeMerge <<endl;
// cout<<"Written k-mer count : " << writtenKmerCnt << endl;
//
// delete[] diffFileList;
// delete[] infoFileList;
// delete[] lookingInfos;
// delete[] lookingKmers;
// delete[] diffFileIdx;
// delete[] infoFileIdx;
//}


// Merge differential index and k-mer information files, reducing redundancy
void FileMerger::mergeTargetFiles(const LocalParameters & par, int numOfSplits) {
size_t writtenKmerCnt = 0;
Expand Down
3 changes: 1 addition & 2 deletions src/commons/IndexCreator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1076,7 +1076,6 @@ void IndexCreator::editTaxonomyDumpFiles(const vector<pair<string, pair<TaxID, T
}

std::string line;
size_t count = 0;
unordered_map<int, int> mergedMap;
while (std::getline(ss, line)) {
std::vector<std::string> result = splitByDelimiter(line, "\t|\t", 2);
Expand Down Expand Up @@ -1171,4 +1170,4 @@ TaxID IndexCreator::getMaxTaxID() {
ss.close();

return maxTaxID;
}
}
11 changes: 10 additions & 1 deletion src/commons/IndexCreator.h
Original file line number Diff line number Diff line change
Expand Up @@ -185,23 +185,32 @@ class IndexCreator{
unordered_map<string, TaxID> & foundAcc2taxid);

static void getSeqSegmentsWithHead(vector<SequenceBlock> & seqSegments, const char * seqFileName);

IndexCreator(const LocalParameters & par);

// Default constructor: only resets `taxonomy` to nullptr; no other member is
// assigned here.
IndexCreator() {taxonomy = nullptr;}

~IndexCreator();

int getNumOfFlush();
void startIndexCreatingParallel(const LocalParameters & par);

void createIndex(const LocalParameters & par);

void updateIndex(const LocalParameters & par);

void getDiffIdx(const uint64_t & lastKmer, const uint64_t & entryToWrite, FILE* handleKmerTable,
uint16_t *kmerBuf, size_t & localBufIdx);

void getDiffIdx(const uint64_t & lastKmer, const uint64_t & entryToWrite, FILE* handleKmerTable,
uint16_t *kmerBuf, size_t & localBufIdx, size_t & totalBufferIdx);

void writeInfo(TargetKmerInfo * entryToWrite, FILE * infoFile, TargetKmerInfo * infoBuffer, size_t & infoBufferIdx);

static void flushKmerBuf(uint16_t *buffer, FILE *handleKmerTable, size_t & localBufIdx);

static void flushInfoBuf(TargetKmerInfo * buffer, FILE * infoFile, size_t & localBufIdx );

void makeAAoffsets(const LocalParameters & par);

};
#endif //ADKMER4_INDEXCREATOR_H
1 change: 1 addition & 0 deletions src/commons/Kmer.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ struct DiffIdxSplit{
// Builds a split entry from explicit values for all three fields.
// Parameters intentionally share the field names; inside the initializer
// list the left-hand name is the member and the braced name is the argument.
DiffIdxSplit(uint64_t ADkmer, size_t diffIdxOffset, size_t infoIdxOffset)
    : ADkmer{ADkmer},
      diffIdxOffset{diffIdxOffset},
      infoIdxOffset{infoIdxOffset} {}
DiffIdxSplit(const DiffIdxSplit & copy) {ADkmer = copy.ADkmer; diffIdxOffset = copy.diffIdxOffset; infoIdxOffset=copy.infoIdxOffset;}
DiffIdxSplit() {};
// Copy assignment is explicitly defaulted (compiler-generated memberwise copy).
DiffIdxSplit& operator=(const DiffIdxSplit&) = default;
uint64_t ADkmer;
size_t diffIdxOffset;
size_t infoIdxOffset;
Expand Down
Loading

0 comments on commit cc0493c

Please sign in to comment.