Skip to content

Commit

Permalink
Merge pull request #51 from jaebeom-kim/master
Browse files Browse the repository at this point in the history
Improved speed by removing unused copies.
  • Loading branch information
jaebeom-kim authored Dec 8, 2023
2 parents c077844 + 79e743d commit 0b60d81
Show file tree
Hide file tree
Showing 8 changed files with 98 additions and 220 deletions.
18 changes: 12 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,22 +49,26 @@ The built binary can be found in `./build/src`.
## Pre-built databases
You can download [pre-built databases](https://metabuli.steineggerlab.workers.dev/) using `databases` workflow.
```
Usage:
metabuli databases DB_NAME OUTDIR tmp
# RefSeq Complete/Chromosome (115.6 GiB)
# - Complete Genome or Chromosome level assemblies of virus and prokaryotes in RefSeq (2023-04-04) and human genome (GRCh38.p14)
metabuli databases RefSeq DBDIR tmp
metabuli databases RefSeq OUTDIR tmp
# RefSeq Releases 217 (480.5 GiB)
# - Viral and prokaryotic genomes of RefSeq release 217 and human genome (GRCh38.p14)
metabuli databases RefSeq217 DBDIR tmp
metabuli databases RefSeq217 OUTDIR tmp
# GTDB 207 (81.2 GiB)
# - Complete Genome or Chromosome level assemblies in GTDB207 (CheckM Completeness > 90, CheckM Contamination < 5) with GTDB taxonomy.
metabuli databases GTDB207 DBDIR tmp
metabuli databases GTDB207 OUTDIR tmp
# RefSeq Virus (1.5 GiB)
# - Viral RefSeq genomes and five SARS-CoV-2 variants (alpha, beta, delta, gamma, and omicron)
metabuli databases RefSeq_virus DBDIR tmp
metabuli databases RefSeq_virus OUT_DIR tmp
```
Downloaded files are stored in `OUTDIR/DB_NAME` directory, which can be provided for `classify` module as `DBDIR`.


## Classification
Expand Down Expand Up @@ -254,7 +258,9 @@ Classifying RNA-seq reads from a COVID-19 patient to identify the culprit varian
The whole process must take less than 10 mins using a personal machine.

#### 1. Download RefSeq Virus DB (1.5 GiB)
`metabuli databases RefSeq_virus refseq_virus tmp`
```
metabuli databases RefSeq_virus OUTDIR tmp
```

#### 2. Download an RNA-seq result (SRR14484345)
Option 1. Download using SRA Toolkit
Expand All @@ -270,7 +276,7 @@ cat SRR14484345.fastq | paste - - - - - - - - | tee >(cut -f 1-4 | tr "\t" "\n"

#### 3. Classify the reads using metabuli
```
metabuli classify SRR14484345_1.fq SRR14484345_2.fq refseq_virus RESULT_DIR JOB_ID --max-ram RAM_SIZE
metabuli classify SRR14484345_1.fq SRR14484345_2.fq OUTDIR/refseq_virus RESULT_DIR JOB_ID --max-ram RAM_SIZE
```
#### 4. Check RESULT_DIR/JOB_ID_report.tsv
Find a section like the example below
Expand Down
2 changes: 1 addition & 1 deletion src/commons/Classifier.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ void Classifier::startClassify(const LocalParameters &par) {
kmerMatcher->sortMatches(&matchBuffer);

// Classify queries based on the matches.
// omp_set_num_threads(1);
// omp_set_num_threads(1);
taxonomer->assignTaxonomy(matchBuffer.buffer, matchBuffer.startIndexOfReserve, queryList, par);
processedSeqCnt += queryReadSplit[splitIdx].end - queryReadSplit[splitIdx].start;
cout << "The number of processed sequences: " << processedSeqCnt << " (" << (double) processedSeqCnt / (double) numOfSeq << ")" << endl;
Expand Down
12 changes: 2 additions & 10 deletions src/commons/KmerMatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -331,11 +331,6 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI
}
for (int k = 0; k < currMatchNum; k++) {
idx = selectedMatches[k];
// Check if candidateKmerInfos[idx].sequenceID is valid
// if (taxId2genusId.find(candidateKmerInfos[idx].sequenceID) == taxId2genusId.end() ||
// taxId2speciesId.find(candidateKmerInfos[idx].sequenceID) == taxId2speciesId.end()) {
// cout << "Error: " << candidateKmerInfos[idx].sequenceID << " is not found in the taxonomy database." << endl;
// }
matches[matchCnt] = {queryKmerList[j].info,
candidateKmerInfos[idx].sequenceID,
taxId2speciesId[candidateKmerInfos[idx].sequenceID],
Expand Down Expand Up @@ -474,10 +469,6 @@ querySplits, queryKmerList, matchBuffer, cout, targetDiffIdxFileName, numOfDiffI
free(splitCheckList);
queryKmerNum = 0;

#ifdef OPENMP
omp_set_num_threads(threads);
#endif

totalMatchCnt += matchBuffer->startIndexOfReserve;
return 1;
}
Expand Down Expand Up @@ -520,8 +511,9 @@ void KmerMatcher::compareDna(uint64_t query,
}

// Select target k-mers that passed hamming criteria
uint8_t maxHamming = min(minHammingSum * 2, 7);
for (size_t h = 0; h < size; h++) {
if (hammingSums[h] <= min(minHammingSum * 2, 7)) {
if (hammingSums[h] <= maxHamming) {
selectedMatches.push_back(h);
selectedHammingSum.push_back(hammingSums[h]);
if (frame < 3) { // Frame of query k-mer
Expand Down
6 changes: 3 additions & 3 deletions src/commons/LocalUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,11 @@ void LocalUtil::splitQueryFile(std::vector<SequenceBlock> & sequences, const std

int LocalUtil::getMaxCoveredLength(int queryLength) {
if (queryLength % 3 == 2) {
return queryLength - 2; // 2
return queryLength - 2;
} else if (queryLength % 3 == 1) {
return queryLength - 4; // 4
return queryLength - 4;
} else {
return queryLength - 3; // 3
return queryLength - 3;
}
}

Expand Down
7 changes: 4 additions & 3 deletions src/commons/QueryIndexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ QueryIndexer::QueryIndexer(const LocalParameters & par) {
maxRam = par.ramUsage;
threads = par.threads;
bytesPerKmer = sizeof(QueryKmer) + matchPerKmer * sizeof(Match);
// std::cout << "bytesPerKmer: " << bytesPerKmer << "\n";
readNum_1 = 0;
readNum_2 = 0;
spaceNum = par.spaceMask.length() - kmerLength;
Expand All @@ -25,6 +26,7 @@ QueryIndexer::QueryIndexer(const LocalParameters & par) {
void QueryIndexer::setAvailableRam() {
availableRam = ((size_t) maxRam * (size_t) 1024 * 1024 * 1024)
- ((size_t) 128 * 1024 * 1024 * (size_t) threads);
// std::cout << "availableRam: " << availableRam << "\n";
}

void QueryIndexer::indexQueryFile() {
Expand All @@ -40,7 +42,6 @@ void QueryIndexer::indexQueryFile() {
totalReadLength += kseq->entry.sequence.l;
size_t currentKmerCnt = LocalUtil::getQueryKmerNumber<size_t>(kseq->entry.sequence.l, spaceNum);
kmerCnt += currentKmerCnt;
// std::cout << "currentKmerCnt: " << kmerCnt << "\n";

if (bytesPerKmer * kmerCnt + ((size_t) 200 * seqCnt) > availableRam) {
querySplits.emplace_back(start, readNum_1 - 1, kmerCnt - currentKmerCnt);
Expand Down Expand Up @@ -72,7 +73,6 @@ void QueryIndexer::indexQueryFile() {
seqCnt_1++;
totalReadLength += kseq_1->entry.sequence.l;
currentKmerCnt = LocalUtil::getQueryKmerNumber<size_t>(kseq_1->entry.sequence.l, spaceNum);
kmerCnt += currentKmerCnt;
} else {
end = true;
}
Expand All @@ -82,7 +82,7 @@ void QueryIndexer::indexQueryFile() {
seqCnt_2++;
totalReadLength += kseq_2->entry.sequence.l;
currentKmerCnt += LocalUtil::getQueryKmerNumber<size_t>(kseq_2->entry.sequence.l, spaceNum);
kmerCnt += currentKmerCnt;

} else {
end = true;
}
Expand All @@ -92,6 +92,7 @@ void QueryIndexer::indexQueryFile() {
EXIT(EXIT_FAILURE);
}

kmerCnt += currentKmerCnt;
if (bytesPerKmer * kmerCnt + ((size_t) 200 * seqCnt_1) > availableRam) {
querySplits.emplace_back(start, readNum_1 - 1, kmerCnt - currentKmerCnt);
kmerCnt = currentKmerCnt;
Expand Down
Loading

0 comments on commit 0b60d81

Please sign in to comment.