diff --git a/src/commons/Classifier.cpp b/src/commons/Classifier.cpp index 09d8bb6c..66a4cec7 100644 --- a/src/commons/Classifier.cpp +++ b/src/commons/Classifier.cpp @@ -5,8 +5,7 @@ #include Classifier::Classifier(LocalParameters & par) : maskMode(par.maskMode), maskProb(par.maskProb) { -// probMatrix(*(new NucleotideMatrix(par.scoringMatrixFile.values.nucleotide().c_str(), 1.0, 0.0))) { - + // Load parameters if (par.seqMode == 2){ queryPath_1 = par.filenames[0]; queryPath_2 = par.filenames[1]; @@ -126,54 +125,32 @@ static inline bool compareForLinearSearch(const QueryKmer &a, const QueryKmer &b void Classifier::startClassify(const LocalParameters &par) { - size_t totalNumOfQueryKmer = 0; - // Check if the query file is in FASTA format - bool isFasta = false; - ifstream check(queryPath_1.c_str()); - if (check.is_open()) { - string line; - getline(check, line); - if (line[0] == '>') { - isFasta = true; - } - } - check.close(); - // Calculate maximum number of k-mers for each iteration. size_t matchPerKmer = par.matchPerKmer; size_t c = sizeof(QueryKmer) + matchPerKmer * sizeof(Match); size_t ram_threads = ((size_t) par.ramUsage * (size_t) 1024 * 1024 * 1024) - - ((size_t) 134217728 * (size_t) par.threads); + - ((size_t) 134217728 * (size_t) par.threads); + // Load query file cout << "Indexing query file ..."; - size_t totalReadLength = 0; - MmapedData queryFile{}; - MmapedData queryFile2{}; - vector sequences; - vector sequences2; - vector> queryReadSplit; - vector splitKmerCnt; + vector sequences_read1; + vector sequences_read2; size_t numOfSeq = 0; size_t start = 0; size_t kmerCnt = 0; size_t currentKmerCnt = 0; size_t seqCnt = 0; + vector splitKmerCnt; + vector> queryReadSplit; + size_t totalReadLength = 0; if (par.seqMode == 1 || par.seqMode == 3) { - queryFile = mmapData(queryPath_1.c_str(), 2); // mmap readonly - madvise(queryFile.data, queryFile.fileSize, MADV_SEQUENTIAL); - - // Get start and end positions of each read - if (isFasta) { - splitFASTA(sequences, queryPath_1); - } else { - splitFASTQ(sequences, queryPath_1); - } + splitQueryFile(sequences_read1, queryPath_1); // Make query read splits - numOfSeq = sequences.size(); + numOfSeq = sequences_read1.size(); for (size_t i = 0; i < numOfSeq; i++) { - currentKmerCnt = getQueryKmerNumber(sequences[i].seqLength); + currentKmerCnt = getQueryKmerNumber(sequences_read1[i].seqLength); kmerCnt += currentKmerCnt; seqCnt++; if (c * kmerCnt + ((size_t) 200 * seqCnt) > ram_threads) { @@ -183,34 +160,26 @@ void Classifier::startClassify(const LocalParameters &par) { start = i; seqCnt = 1; } - totalReadLength += sequences[i].seqLength; + totalReadLength += sequences_read1[i].seqLength; } queryReadSplit.emplace_back(start, numOfSeq); splitKmerCnt.push_back(kmerCnt); - } else if (par.seqMode == 2) { - - // Get start and end positions of each read - if (isFasta) { - splitFASTA(sequences, queryPath_1); - splitFASTA(sequences2, queryPath_2); - } else { - splitFASTQ(sequences, queryPath_1); - splitFASTQ(sequences2, queryPath_2); - } + } else { + splitQueryFile(sequences_read1, queryPath_1); + splitQueryFile(sequences_read2, queryPath_2); - if (sequences.size() != sequences2.size()) { + // Check if the number of reads in the two files are equal + if (sequences_read1.size() != sequences_read2.size()) { Debug(Debug::ERROR) << "The number of reads in the two files are not equal." << "\n"; EXIT(EXIT_FAILURE); } - numOfSeq = sequences.size(); - // Make query read splits + numOfSeq = sequences_read1.size(); for (size_t i = 0; i < numOfSeq; i++) { - totalReadLength += sequences[i].seqLength; - totalReadLength += sequences2[i].seqLength; - currentKmerCnt = getQueryKmerNumber(sequences[i].seqLength) + - getQueryKmerNumber(sequences2[i].seqLength); + totalReadLength += sequences_read1[i].seqLength + sequences_read2[i].seqLength; + currentKmerCnt = getQueryKmerNumber(sequences_read1[i].seqLength) + + getQueryKmerNumber(sequences_read2[i].seqLength); kmerCnt += currentKmerCnt; seqCnt ++; if (c * kmerCnt + ((size_t) 200 * seqCnt) > ram_threads) { @@ -224,6 +193,7 @@ void Classifier::startClassify(const LocalParameters &par) { queryReadSplit.emplace_back(start, numOfSeq); splitKmerCnt.push_back(kmerCnt); } + cout << "Done" << endl; cout << "Total number of sequences: " << numOfSeq << endl; cout << "Total read length: " << totalReadLength << "nt" << endl; @@ -243,6 +213,10 @@ void Classifier::startClassify(const LocalParameters &par) { #endif // Extract k-mers from query sequences and compare them to target k-mer DB double vm, rss; + KSeqWrapper* kseq1 = KSeqFactory(par.filenames[0].c_str()); + KSeqWrapper* kseq2 = nullptr; + if (par.seqMode == 2) { kseq2 = KSeqFactory(par.filenames[1].c_str()); } + for (size_t splitIdx = 0; splitIdx < queryReadSplit.size(); splitIdx++) { // Allocate memory for query list queryList.clear(); @@ -266,21 +240,21 @@ void Classifier::startClassify(const LocalParameters &par) { // Extract query k-mer time_t beforeKmerExtraction = time(nullptr); cout << "Extracting query metamers ... " << endl; - if (! par.maskMode) { // No masking - if (par.seqMode == 1 || par.seqMode == 3) { // Single-end short-read sequence or long-read sequence - fillQueryKmerBufferParallel(kmerBuffer, queryFile, sequences, queryList, queryReadSplit[splitIdx], par); - } else if (par.seqMode == 2) { - fillQueryKmerBufferParallel(kmerBuffer, sequences, sequences2, queryList, queryReadSplit[splitIdx], - par); - } - } else { // Masking low-complexity regions - if (par.seqMode == 1 || par.seqMode == 3) { // Single-end short-read sequence or long-read sequence - fillQueryKmerBufferParallel(kmerBuffer, queryFile, sequences, queryList, queryReadSplit[splitIdx], par); - } else if (par.seqMode == 2) { - fillQueryKmerBufferParallel(kmerBuffer, sequences, sequences2, queryList, queryReadSplit[splitIdx], - par); - } + if (par.seqMode == 1 || par.seqMode == 3) { // Single-end short-read sequence or long-read sequence + fillQueryKmerBufferParallel(kseq1, + kmerBuffer, + queryList, + queryReadSplit[splitIdx], + par); + } else if (par.seqMode == 2) { + fillQueryKmerBufferParallel_paired(kseq1, + kseq2, + kmerBuffer, + queryList, + queryReadSplit[splitIdx], + par); } + numOfTatalQueryKmerCnt += kmerBuffer.startIndexOfReserve; cout << "Time spent for metamer extraction: " << double(time(nullptr) - beforeKmerExtraction) << endl; @@ -318,7 +292,7 @@ void Classifier::startClassify(const LocalParameters &par) { // << matchBuffer.buffer[i].position << " " << (int) matchBuffer.buffer[i].hamming << " " << taxIdList[matchBuffer.buffer[i].targetId] << endl; // } - // Classify queries based on the matches + //#ifdef OPENMP // if (par.printLog == 1) { // omp_set_num_threads(1); @@ -327,6 +301,7 @@ void Classifier::startClassify(const LocalParameters &par) { // } //#endif + // Classify queries based on the matches time_t beforeAnalyze = time(nullptr); cout << "Analyzing matches ..." << endl; fromMatchToClassification(matchBuffer.buffer, matchBuffer.startIndexOfReserve, queryList, par); @@ -348,61 +323,74 @@ void Classifier::startClassify(const LocalParameters &par) { // Memory deallocation free(matchBuffer.buffer); - - munmap(queryFile.data, queryFile.fileSize + 1); - if (par.seqMode == 2) { - munmap(queryFile2.data, queryFile2.fileSize + 1); - } } -void Classifier::fillQueryKmerBufferParallel(QueryKmerBuffer &kmerBuffer, - MmapedData &seqFile, - const vector &seqs, +void Classifier::fillQueryKmerBufferParallel(KSeqWrapper* kseq1, + QueryKmerBuffer &kmerBuffer, vector & queryList, const pair & currentSplit, const LocalParameters &par) { + size_t queryNum = currentSplit.second - currentSplit.first; + size_t processedQueryNum = 0; -#pragma omp parallel default(none), shared(par, kmerBuffer, seqFile, seqs, cout, queryList, currentSplit) - { - SeqIterator seqIterator(par); - size_t posToWrite; + // Array to store reads of thread number + vector reads1(par.threads); + + while (processedQueryNum < queryNum) { + size_t currentQueryNum = min(queryNum - processedQueryNum, (size_t) par.threads); + size_t count = 0; + while (count < currentQueryNum) { + // Read query + kseq1->ReadEntry(); + const KSeqWrapper::KSeqEntry & e1 = kseq1->entry; + + // Get k-mer count + int kmerCnt = getQueryKmerNumber((int) e1.sequence.l); + + // Query Info + queryList[processedQueryNum].queryLength = getMaxCoveredLength((int) e1.sequence.l); + queryList[processedQueryNum].name = string(e1.name.s); + queryList[processedQueryNum].kmerCnt = (int) (kmerCnt); + + // Store reads + reads1[count] = string(kseq1->entry.sequence.s); + + processedQueryNum ++; + count ++; + } +#pragma omp parallel default(none), shared(par, kmerBuffer, cout, processedQueryNum, queryList, currentQueryNum, currentSplit, count, reads1) + { + SeqIterator seqIterator(par); + size_t posToWrite; #pragma omp for schedule(dynamic, 1) - for (size_t i = currentSplit.first; i < currentSplit.second; i++) { - size_t queryIdx = i - currentSplit.first; - kseq_buffer_t buffer(const_cast(&seqFile.data[seqs[i].start]), seqs[i].length); - kseq_t *seq = kseq_init(&buffer); - kseq_read(seq); - int kmerCnt = getQueryKmerNumber ((int) seq->seq.l); - // Ignore short read - if (kmerCnt < 1) { - kseq_destroy(seq); - continue; - } + for (size_t i = 0; i < currentQueryNum; i ++) { + size_t queryIdx = processedQueryNum - currentQueryNum + i; + // Get k-mer count + auto kmerCnt = getQueryKmerNumber(reads1[i].length()); - // Get masked sequence - char *maskedSeq = nullptr; - if (maskMode) { - maskedSeq = new char[seq->seq.l + 1]; - SeqIterator::maskLowComplexityRegions(seq->seq.s, maskedSeq, *probMatrix, maskProb, subMat); - } else { - maskedSeq = seq->seq.s; - } + // Ignore short read + if (kmerCnt < 1) { continue; } - posToWrite = kmerBuffer.reserveMemory(kmerCnt); + // Get masked sequence + char *maskedSeq1 = nullptr; + if (maskMode) { + maskedSeq1 = new char[reads1[i].length() + 1]; + SeqIterator::maskLowComplexityRegions(reads1[i].c_str(),maskedSeq1, *probMatrix, maskProb, subMat); + } else { + maskedSeq1 = const_cast(reads1[i].c_str()); + } - seqIterator.sixFrameTranslation(maskedSeq, (int) seq->seq.l); - seqIterator.fillQueryKmerBuffer(maskedSeq, (int) seq->seq.l, kmerBuffer, posToWrite, - (int) queryIdx); - if (par.maskMode) { - delete[] maskedSeq; - } + posToWrite = kmerBuffer.reserveMemory(kmerCnt); - queryList[queryIdx].queryLength = getMaxCoveredLength((int) seq->seq.l); - queryList[queryIdx].queryId = (int) queryIdx; - queryList[queryIdx].name = string(seq->name.s); - queryList[queryIdx].kmerCnt = (int) kmerCnt; + // Process Read 1 + seqIterator.sixFrameTranslation(maskedSeq1, (int) reads1[i].length()); + seqIterator.fillQueryKmerBuffer(maskedSeq1, (int) reads1[i].length(), kmerBuffer, posToWrite, + (uint32_t) queryIdx); - kseq_destroy(seq); + if (maskMode) { + delete[] maskedSeq1; + } + } } } } @@ -424,106 +412,96 @@ T Classifier::getQueryKmerNumber(T queryLength) { return (getMaxCoveredLength(queryLength) / 3 - kmerLength - spaceNum_int + 1) * 6; } -void Classifier::fillQueryKmerBufferParallel(QueryKmerBuffer &kmerBuffer, - const vector &seqs, - const vector &seqs2, - vector & queryList, - const pair & currentSplit, - const LocalParameters &par) { - vector> queryReadSplit; -// size_t numOfSeq = currentSplit.second - currentSplit.first; - size_t numOfSeqPerSplit = size_t(1000); - for (size_t i = currentSplit.first; i < currentSplit.second; i += numOfSeqPerSplit) { - queryReadSplit.emplace_back(i, std::min(i + numOfSeqPerSplit - 1, currentSplit.second- 1)); - } - -#pragma omp parallel default(none), shared(par, kmerBuffer, seqs, seqs2, cout, queryList, currentSplit, queryReadSplit, numOfSeqPerSplit) - { - FILE * query1 = fopen(par.filenames[0].c_str(), "r"); - FILE * query2 = fopen(par.filenames[1].c_str(), "r"); - char * readBuffer1 = (char *) malloc(3000 * numOfSeqPerSplit); - char * readBuffer2 = (char *) malloc(3000 * numOfSeqPerSplit); - size_t readBufferIdx1 = 0; - size_t readBufferIdx2 = 0; - SeqIterator seqIterator(par); - SeqIterator seqIterator2(par); - size_t posToWrite; - +void Classifier::fillQueryKmerBufferParallel_paired(KSeqWrapper* kseq1, + KSeqWrapper* kseq2, + QueryKmerBuffer &kmerBuffer, + vector & queryList, + const pair & currentSplit, + const LocalParameters &par) { + size_t queryNum = currentSplit.second - currentSplit.first; + size_t processedQueryNum = 0; + + // Array to store reads of thread number + vector reads1(par.threads); + vector reads2(par.threads); + + while (processedQueryNum < queryNum) { + size_t currentQueryNum = min(queryNum - processedQueryNum, (size_t) par.threads); + size_t count = 0; + + // Fill reads in sequential + while (count < currentQueryNum) { + // Read query + kseq1->ReadEntry(); + kseq2->ReadEntry(); + const KSeqWrapper::KSeqEntry & e1 = kseq1->entry; + const KSeqWrapper::KSeqEntry & e2 = kseq2->entry; + + // Get k-mer count + int kmerCnt = getQueryKmerNumber((int) e1.sequence.l); + int kmerCnt2 = getQueryKmerNumber((int) e2.sequence.l); + + // Query Info + queryList[processedQueryNum].queryLength = getMaxCoveredLength((int) e1.sequence.l); + queryList[processedQueryNum].queryLength2 = getMaxCoveredLength((int) e2.sequence.l); + queryList[processedQueryNum].name = string(e1.name.s); + queryList[processedQueryNum].kmerCnt = (int) (kmerCnt + kmerCnt2); + + // Store reads + reads1[count] = string(kseq1->entry.sequence.s); + reads2[count] = string(kseq2->entry.sequence.s); + + processedQueryNum ++; + count ++; + } + + // Process reads in parallel +#pragma omp parallel default(none), shared(par, kmerBuffer, cout, processedQueryNum, queryList, currentQueryNum, currentSplit, count, reads1, reads2) + { + SeqIterator seqIterator(par); + SeqIterator seqIterator2(par); + size_t posToWrite; #pragma omp for schedule(dynamic, 1) - for (size_t j = 0; j < queryReadSplit.size(); j ++) { - // Load query reads of current split - fseek(query1, (long) seqs[queryReadSplit[j].first].start, SEEK_SET); - loadBuffer(query1, readBuffer1, readBufferIdx1, seqs[queryReadSplit[j].second].end - seqs[queryReadSplit[j].first].start + 1); - fseek(query2, (long) seqs2[queryReadSplit[j].first].start, SEEK_SET); - loadBuffer(query2, readBuffer2, readBufferIdx2, seqs2[queryReadSplit[j].second].end - seqs2[queryReadSplit[j].first].start + 1); - for (size_t i = queryReadSplit[j].first; i <= queryReadSplit[j].second; i++) { - size_t queryIdx = i - currentSplit.first; - // Load Read 1 - kseq_buffer_t buffer1(readBuffer1 + seqs[i].start - seqs[queryReadSplit[j].first].start, - seqs[i].length); - kseq_t *seq1 = kseq_init(&buffer1); - kseq_read(seq1); - int kmerCnt = getQueryKmerNumber((int) seq1->seq.l); - - // Load Read 2 - kseq_buffer_t buffer2(readBuffer2 + seqs2[i].start - seqs2[queryReadSplit[j].first].start, - seqs2[i].length); - kseq_t *seq2 = kseq_init(&buffer2); - kseq_read(seq2); - int kmerCnt2 = getQueryKmerNumber((int) seq2->seq.l); + for (size_t i = 0; i < currentQueryNum; i ++) { + size_t queryIdx = processedQueryNum - currentQueryNum + i; + // Get k-mer count + auto kmerCnt = getQueryKmerNumber(reads1[i].length()); + auto kmerCnt2 = getQueryKmerNumber(reads2[i].length()); // Ignore short read - if (kmerCnt2 < 1 || kmerCnt < 1) { - kseq_destroy(seq1); - kseq_destroy(seq2); - continue; - } + if (kmerCnt2 < 1 || kmerCnt < 1) { continue; } // Get masked sequence char *maskedSeq1 = nullptr; char *maskedSeq2 = nullptr; if (maskMode) { - maskedSeq1 = new char[seq1->seq.l + 1]; - maskedSeq2 = new char[seq2->seq.l + 1]; - SeqIterator::maskLowComplexityRegions(seq1->seq.s, maskedSeq1, *probMatrix, maskProb, subMat); - SeqIterator::maskLowComplexityRegions(seq2->seq.s, maskedSeq2, *probMatrix, maskProb, subMat); + maskedSeq1 = new char[reads1[i].length() + 1]; + maskedSeq2 = new char[reads2[i].length() + 1]; + SeqIterator::maskLowComplexityRegions(reads1[i].c_str(),maskedSeq1, *probMatrix, maskProb, subMat); + SeqIterator::maskLowComplexityRegions(reads2[i].c_str(),maskedSeq2, *probMatrix, maskProb, subMat); } else { - maskedSeq1 = seq1->seq.s; - maskedSeq2 = seq2->seq.s; + maskedSeq1 = const_cast(reads1[i].c_str()); + maskedSeq2 = const_cast(reads2[i].c_str()); } posToWrite = kmerBuffer.reserveMemory(kmerCnt + kmerCnt2); // Process Read 1 - seqIterator.sixFrameTranslation(maskedSeq1, (int) seq1->seq.l); - seqIterator.fillQueryKmerBuffer(maskedSeq1, (int) seq1->seq.l, kmerBuffer, posToWrite, + seqIterator.sixFrameTranslation(maskedSeq1, (int) reads1[i].length()); + seqIterator.fillQueryKmerBuffer(maskedSeq1, (int) reads1[i].length(), kmerBuffer, posToWrite, (uint32_t) queryIdx); - queryList[queryIdx].queryLength = getMaxCoveredLength((int) seq1->seq.l); // Process Read 2 - seqIterator2.sixFrameTranslation(maskedSeq2, (int) seq2->seq.l); - seqIterator2.fillQueryKmerBuffer(maskedSeq2, (int) seq2->seq.l, kmerBuffer, posToWrite, + seqIterator2.sixFrameTranslation(maskedSeq2, (int) reads2[i].length()); + seqIterator2.fillQueryKmerBuffer(maskedSeq2, (int) reads2[i].length(), kmerBuffer, posToWrite, (uint32_t) queryIdx, queryList[queryIdx].queryLength); if (maskMode) { delete[] maskedSeq1; delete[] maskedSeq2; } - - // Query Info - queryList[queryIdx].queryLength2 = getMaxCoveredLength((int) seq2->seq.l); - queryList[queryIdx].queryId = (int) queryIdx; - queryList[queryIdx].name = string(seq1->name.s); - queryList[queryIdx].kmerCnt = (int) (kmerCnt + kmerCnt2); - - kseq_destroy(seq1); - kseq_destroy(seq2); } } - free(readBuffer1); - free(readBuffer2); - fclose(query1); - fclose(query2); } } @@ -2136,72 +2114,19 @@ unsigned int Classifier::cladeCountVal(const std::unordered_map & seqSegments, const string & queryPath) { - ifstream fastq; - fastq.open(queryPath); - if (!fastq.is_open()) { - cerr << "Error: Cannot open file " << queryPath << endl; - exit(1); - } - - string line; - size_t lineCnt = 0; - size_t start = 0; - size_t end; - while (getline(fastq, line)) { - if (lineCnt % 4 == 0){ - start = (size_t) fastq.tellg() - line.length() - 1; - } - if (lineCnt % 4 == 1){ - end = (size_t) fastq.tellg() - 1; - seqSegments.emplace_back(start, end, end - start + 1, line.length()); - } - lineCnt++; - } -} - -void Classifier::splitFASTA(vector & seqSegments, const string & queryPath) { - ifstream fasta; - fasta.open(queryPath); - if (!fasta.is_open()) { - cerr << "Error: Cannot open file " << queryPath << endl; - exit(1); - } - - string line; - size_t start = 0; - size_t end; - getline(fasta, line); - size_t seqLength = 0; - while (getline(fasta, line)) { - if (line[0] == '>') { - seqSegments.emplace_back(start, end, end - start + 1, seqLength); - start = end + 1; - seqLength = 0; - } else { - end = (size_t) fasta.tellg() - 1; - seqLength += line.length(); - } +void Classifier::splitQueryFile(vector & sequences, const std::string &queryPath) { + KSeqWrapper* kseq = nullptr; + kseq = KSeqFactory(queryPath.c_str()); + while (kseq->ReadEntry()) { + const KSeqWrapper::KSeqEntry & e = kseq->entry; + sequences.emplace_back(e.headerOffset - 1, + e.sequenceOffset + e.sequence.l, + e.sequenceOffset + e.sequence.l - e.headerOffset + 2, + e.sequence.l); } - seqSegments.emplace_back(start, end, end - start + 1, seqLength); } - bool Classifier::isConsecutive(const Match * match1, const Match * match2) { -// uint16_t hamming1 = match1->rightEndHamming; -// uint16_t hamming2 = match2->rightEndHamming; -// // move bits to right by 2 -// hamming1 >>= 2; -// // set most significant two bits to 0 -// hamming2 &= 0x3FFF; return (match1->rightEndHamming >> 2) == (match2->rightEndHamming & 0x3FFF); } diff --git a/src/commons/Classifier.h b/src/commons/Classifier.h index 0b3ca71e..06379189 100644 --- a/src/commons/Classifier.h +++ b/src/commons/Classifier.h @@ -24,6 +24,7 @@ #include #include "Match.h" #include + #define BufferSize 16'777'216 //16 * 1024 * 1024 // 16 M using namespace std; @@ -131,23 +132,21 @@ class Classifier { {3, 2, 3, 3, 4, 4, 1, 0}}; // Index reads in query file - static void splitFASTQ(vector & seqSegments, const string & queryPath); - static void splitFASTA(vector & seqSegments, const string & queryPath); + static void splitQueryFile(vector & seqSegments, const string & queryPath); // Extract query k-mer - void fillQueryKmerBufferParallel(QueryKmerBuffer &kmerBuffer, - MmapedData &seqFile, - const vector &seqs, + void fillQueryKmerBufferParallel(KSeqWrapper* kseq1, + QueryKmerBuffer &kmerBuffer, vector & queryList, const pair & currentSplit, const LocalParameters &par); - void fillQueryKmerBufferParallel(QueryKmerBuffer &kmerBuffer, - const vector &seqs, - const vector &seqs2, - vector & queryList, - const pair & currentSplit, - const LocalParameters &par); + void fillQueryKmerBufferParallel_paired(KSeqWrapper* kseq1, + KSeqWrapper* kseq2, + QueryKmerBuffer &kmerBuffer, + vector &queryList, + const pair ¤tSplit, + const LocalParameters &par); static int getMaxCoveredLength(int queryLength); diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp index f588d3d0..9db9f473 100644 --- a/src/commons/IndexCreator.cpp +++ b/src/commons/IndexCreator.cpp @@ -53,9 +53,8 @@ IndexCreator::IndexCreator(const LocalParameters &par, string dbDir, string fnaL } IndexCreator::~IndexCreator() { - if (taxonomy != nullptr){ - delete taxonomy; - } + delete taxonomy; + delete subMat; } void IndexCreator::createIndex(const LocalParameters &par) { @@ -115,7 +114,6 @@ void IndexCreator::createIndex(const LocalParameters &par) { delete[] uniqKmerIdx; } delete[] splitChecker; - } void IndexCreator::updateIndex(const LocalParameters &par) { @@ -483,6 +481,7 @@ void IndexCreator::reduceRedundancy(TargetKmerBuffer & kmerBuffer, size_t * uniq for(size_t i = 0; i < splits.size(); i++){ delete[] idxOfEachSplit[i]; } + delete[] idxOfEachSplit; delete[] cntOfEachSplit; } diff --git a/src/commons/SeqIterator.cpp b/src/commons/SeqIterator.cpp index c1f7c013..262d1258 100644 --- a/src/commons/SeqIterator.cpp +++ b/src/commons/SeqIterator.cpp @@ -777,7 +777,7 @@ void SeqIterator::generateIntergenicKmerList(_gene *genes, _node *nodes, int num free(kmer); } -void SeqIterator::maskLowComplexityRegions(char *seq, char *maskedSeq, ProbabilityMatrix & probMat, +void SeqIterator::maskLowComplexityRegions(const char *seq, char *maskedSeq, ProbabilityMatrix & probMat, float maskProb, const BaseMatrix * subMat) { unsigned int seqLen = 0; while (seq[seqLen] != '\0') { diff --git a/src/commons/SeqIterator.h b/src/commons/SeqIterator.h index 40393095..b3be8638 100644 --- a/src/commons/SeqIterator.h +++ b/src/commons/SeqIterator.h @@ -96,7 +96,7 @@ class SeqIterator { int fillBufferWithKmerFromBlock(const PredictedBlock &block, const char *seq, TargetKmerBuffer &kmerBuffer, size_t &posToWrite, int seqID, int taxIdAtRank); - static void maskLowComplexityRegions(char * seq, char * maskedSeq, ProbabilityMatrix & probMat, + static void maskLowComplexityRegions(const char * seq, char * maskedSeq, ProbabilityMatrix & probMat, float maskProb, const BaseMatrix * subMat); void printKmerInDNAsequence(uint64_t kmer); diff --git a/src/workflow/classify.cpp b/src/workflow/classify.cpp index 27050e86..69786781 100644 --- a/src/workflow/classify.cpp +++ b/src/workflow/classify.cpp @@ -3,6 +3,7 @@ #include "Parameters.h" #include "LocalParameters.h" #include "NcbiTaxonomy.h" +#include "FileUtil.h" void setClassifyDefaults(LocalParameters & par){ par.virusTaxId = 10239;// Taxonomy ID of virus taxon in NCBI @@ -34,6 +35,15 @@ int classify(int argc, const char **argv, const Command& command) setClassifyDefaults(par); par.parseParameters(argc, argv, command, true, Parameters::PARSE_ALLOW_EMPTY, 0); + if (par.seqMode == 2) { + if (!FileUtil::directoryExists(par.filenames[3].c_str())) { + FileUtil::makeDir(par.filenames[3].c_str()); + } + } else { + if (!FileUtil::directoryExists(par.filenames[2].c_str())) { + FileUtil::makeDir(par.filenames[2].c_str()); + } + } #ifdef OPENMP omp_set_num_threads(par.threads); diff --git a/util/Metabuli-regression b/util/Metabuli-regression index 33db1464..1b9dd44a 160000 --- a/util/Metabuli-regression +++ b/util/Metabuli-regression @@ -1 +1 @@ -Subproject commit 33db1464458848765034fe4439ea358a667663fe +Subproject commit 1b9dd44ada5a0e4ee512194b0115a546d944283a