Skip to content

Commit

Permalink
Merge pull request #83 from jaebeom-kim/windows
Browse files Browse the repository at this point in the history
Optimizing for Windows OS support.
  • Loading branch information
jaebeom-kim authored Aug 28, 2024
2 parents ed609df + ac24573 commit d04cb73
Show file tree
Hide file tree
Showing 15 changed files with 977 additions and 1,063 deletions.
53 changes: 49 additions & 4 deletions src/commons/Classifier.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ Classifier::Classifier(LocalParameters & par) {
} else {
kmerMatcher = new KmerMatcher(par, taxonomy);
}
taxonomer = new Taxonomer(par, taxonomy);
// taxonomer = new Taxonomer(par, taxonomy);
reporter = new Reporter(par, taxonomy);
}

Expand All @@ -32,7 +32,7 @@ Classifier::~Classifier() {
delete queryIndexer;
delete kmerExtractor;
delete kmerMatcher;
delete taxonomer;
// delete taxonomer;
delete reporter;
}

Expand Down Expand Up @@ -113,7 +113,7 @@ void Classifier::startClassify(const LocalParameters &par) {
kmerMatcher->sortMatches(&matchBuffer);

// Classify queries based on the matches.
taxonomer->assignTaxonomy(matchBuffer.buffer, matchBuffer.startIndexOfReserve, queryList, par);
assignTaxonomy(matchBuffer.buffer, matchBuffer.startIndexOfReserve, queryList, par);

// Write classification results
reporter->writeReadClassification(queryList);
Expand Down Expand Up @@ -148,8 +148,53 @@ void Classifier::startClassify(const LocalParameters &par) {
reporter->closeReadClassificationFile();

// Write report files
reporter->writeReportFile(totalSeqCnt, taxonomer->getTaxCounts());
reporter->writeReportFile(totalSeqCnt, taxCounts);

// Memory deallocation
free(matchBuffer.buffer);
}

/**
 * Groups the matches by query, classifies each query in parallel, and
 * accumulates the per-taxon classification counts into `taxCounts`.
 *
 * @param matchList     Array of matches. Runs of consecutive entries sharing
 *                      the same `qInfo.sequenceID` are treated as one block
 *                      (presumably the caller has sorted the matches so that
 *                      each query forms a single run -- confirm against
 *                      KmerMatcher::sortMatches).
 * @param numOfMatches  Number of valid entries in `matchList`.
 * @param queryList     Queries to classify; passed to
 *                      Taxonomer::chooseBestTaxon, and each entry's
 *                      `classification` is counted afterwards.
 * @param par           Run parameters forwarded to Taxonomer.
 */
void Classifier::assignTaxonomy(const Match *matchList,
                                size_t numOfMatches,
                                std::vector<Query> &queryList,
                                const LocalParameters &par) {
    time_t beforeAnalyze = time(nullptr);
    cout << "Analyzing matches ..." << endl;

    // Divide matches into blocks for multi threading.
    // One slot per query is reserved; the vector releases the storage on
    // every exit path, so no manual delete[] is needed.
    size_t seqNum = queryList.size();
    std::vector<MatchBlock> matchBlocks(seqNum);
    size_t matchIdx = 0;
    size_t blockIdx = 0;
    while (matchIdx < numOfMatches) {
        uint32_t currentQuery = matchList[matchIdx].qInfo.sequenceID;
        matchBlocks[blockIdx].id = currentQuery;
        matchBlocks[blockIdx].start = matchIdx;
        // The bounds check must come first: && short-circuits, so
        // matchList[matchIdx] is never read once matchIdx reaches
        // numOfMatches.
        while (matchIdx < numOfMatches
               && matchList[matchIdx].qInfo.sequenceID == currentQuery) {
            ++matchIdx;
        }
        // `end` is inclusive: index of the last match belonging to this query.
        matchBlocks[blockIdx].end = matchIdx - 1;
        blockIdx++;
    }

    // Process each block
#pragma omp parallel default(none), shared(cout, matchBlocks, matchList, seqNum, queryList, blockIdx, par)
    {
        // Each thread constructs and uses its own Taxonomer instance.
        Taxonomer taxonomer(par, taxonomy);
#pragma omp for schedule(dynamic, 1)
        for (size_t i = 0; i < blockIdx; ++i) {
            taxonomer.chooseBestTaxon(matchBlocks[i].id,
                                      matchBlocks[i].start,
                                      matchBlocks[i].end,
                                      matchList,
                                      queryList,
                                      par);
        }
    }

    // Tally classifications serially, after the parallel region has finished.
    // Every query is counted, including those for which no block was built.
    for (size_t i = 0; i < seqNum; i++) {
        ++taxCounts[queryList[i].classification];
    }

    cout << "Time spent for analyzing: " << double(time(nullptr) - beforeAnalyze) << endl;
}
11 changes: 10 additions & 1 deletion src/commons/Classifier.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,17 +45,26 @@ class Classifier {
QueryIndexer * queryIndexer;
KmerExtractor * kmerExtractor;
KmerMatcher * kmerMatcher;
Taxonomer * taxonomer;
// Taxonomer * taxonomer;
Reporter * reporter;
NcbiTaxonomy * taxonomy;

unordered_map<TaxID, unsigned int> taxCounts;

public:
void startClassify(const LocalParameters &par);

void assignTaxonomy(const Match *matchList,
size_t numOfMatches,
std::vector<Query> & queryList,
const LocalParameters &par);

explicit Classifier(LocalParameters & par);

virtual ~Classifier();

unordered_map<TaxID, unsigned int> & getTaxCounts() { return taxCounts; }

};


Expand Down
19 changes: 12 additions & 7 deletions src/commons/IndexCreator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -866,6 +866,7 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer,
kseq_buffer_t buffer;
kseq_t *seq;
vector<uint64_t> intergenicKmers;
vector<int> aaSeq;
#pragma omp for schedule(dynamic, 1)
for (size_t i = 0; i < fnaSplits.size(); i++) {
if (!checker[i] && !hasOverflow) {
Expand Down Expand Up @@ -917,7 +918,7 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer,
prodigal->getPredictedGenes(seq->seq.s);
seqIterator.generateIntergenicKmerList(prodigal->genes, prodigal->nodes,
prodigal->getNumberOfPredictedGenes(),
intergenicKmers,seq->seq.s);
intergenicKmers, seq->seq.s);

// Get min k-mer hash list for determining strandness
seqIterator.getMinHashList(standardList, seq->seq.s);
Expand All @@ -941,7 +942,7 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer,
// Get extended ORFs
prodigal->getPredictedGenes(seq->seq.s);
prodigal->removeCompletelyOverlappingGenes();
seqIterator.getExtendedORFs(prodigal->finalGenes, prodigal->nodes, extendedORFs,
prodigal->getExtendedORFs(prodigal->finalGenes, prodigal->nodes, extendedORFs,
prodigal->fng, strlen(seq->seq.s),
orfNum, intergenicKmers, seq->seq.s);
// Get masked sequence
Expand All @@ -955,14 +956,16 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer,

// Get k-mers from extended ORFs
for (size_t orfCnt = 0; orfCnt < orfNum; orfCnt++) {
seqIterator.translateBlock(maskedSeq, extendedORFs[orfCnt]);
aaSeq.clear();
seqIterator.translateBlock(maskedSeq, extendedORFs[orfCnt], aaSeq);
tempCheck = seqIterator.fillBufferWithKmerFromBlock(
extendedORFs[orfCnt],
maskedSeq,
kmerBuffer,
posToWrite,
int(processedSeqCnt[fnaSplits[i].file_idx] + fnaSplits[i].offset + s_cnt),
fnaSplits[i].speciesID);
fnaSplits[i].speciesID,
aaSeq);
if (tempCheck == -1) {
cout << "ERROR: Buffer overflow " << seq->name.s << seq->seq.l << endl;
}
Expand All @@ -976,7 +979,7 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer,
// Get extended ORFs
prodigal->getPredictedGenes(reverseCompliment);
prodigal->removeCompletelyOverlappingGenes();
seqIterator.getExtendedORFs(prodigal->finalGenes, prodigal->nodes, extendedORFs,
prodigal->getExtendedORFs(prodigal->finalGenes, prodigal->nodes, extendedORFs,
prodigal->fng, strlen(reverseCompliment),
orfNum, intergenicKmers, reverseCompliment);

Expand All @@ -990,14 +993,16 @@ size_t IndexCreator::fillTargetKmerBuffer(TargetKmerBuffer &kmerBuffer,
}

for (size_t orfCnt = 0; orfCnt < orfNum; orfCnt++) {
seqIterator.translateBlock(maskedSeq, extendedORFs[orfCnt]);
aaSeq.clear();
seqIterator.translateBlock(maskedSeq, extendedORFs[orfCnt], aaSeq);
tempCheck = seqIterator.fillBufferWithKmerFromBlock(
extendedORFs[orfCnt],
maskedSeq,
kmerBuffer,
posToWrite,
int(processedSeqCnt[fnaSplits[i].file_idx] + fnaSplits[i].offset + s_cnt),
fnaSplits[i].speciesID);
fnaSplits[i].speciesID,
aaSeq);
if (tempCheck == -1) {
cout << "ERROR: Buffer overflow " << seq->name.s << seq->seq.l << endl;
}
Expand Down
Loading

0 comments on commit d04cb73

Please sign in to comment.