Skip to content

Commit

Permalink
Merge pull request #89 from jaebeom-kim/master
Browse files Browse the repository at this point in the history
add extract module to extract reads classified into a certain taxon
  • Loading branch information
jaebeom-kim authored Sep 21, 2024
2 parents 5cf8577 + a04ab82 commit 5d950ad
Show file tree
Hide file tree
Showing 9 changed files with 226 additions and 7 deletions.
1 change: 1 addition & 0 deletions src/LocalCommandDeclarations.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,6 @@ extern int databaseReport(int argc, const char **argv, const Command& command);
extern int mapping2taxon(int argc, const char **argv, const Command& command);
extern int expand_diffidx(int argc, const char **argv, const Command& command);
extern int makeAAoffset(int argc, const char **argv, const Command& command);
extern int extract(int argc, const char **argv, const Command& command);

#endif //ADCLASSIFIER2_LOCALCOMMANDDECLARATIONS_H
5 changes: 4 additions & 1 deletion src/commons/KmerExtractor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,10 @@ void KmerExtractor::loadChunkOfReads(KSeqWrapper *kseq,
kseq->ReadEntry();
queryList[processedQueryNum].queryLength2 = LocalUtil::getMaxCoveredLength((int) kseq->entry.sequence.l);

if (emptyReads[i]) { continue; }
if (emptyReads[i]) {
count ++;
continue;
}

// Check if the read is too short
int kmerCnt = LocalUtil::getQueryKmerNumber<int>((int) kseq->entry.sequence.l, spaceNum);
Expand Down
12 changes: 12 additions & 0 deletions src/commons/LocalParameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,13 @@ LocalParameters::LocalParameters() :
typeid(float),
(void *) &tieRatio,
"^0(\\.[0-9]+)?|1(\\.0+)?$"),
TARGET_TAX_ID(TARGET_TAX_ID_ID,
"--tax-id",
"Tax. ID of clade to be extracted",
"Tax. ID of clade to be extracted",
typeid(int),
(void *) &targetTaxId,
"^[0-9]+$"),
LIBRARY_PATH(LIBRARY_PATH_ID,
"--library-path",
"Path to library where the FASTA files are stored",
Expand Down Expand Up @@ -385,6 +392,11 @@ LocalParameters::LocalParameters() :
classify.push_back(&TIE_RATIO);
// classify.push_back(&MIN_SS_MATCH);

// extract
extract.push_back(&TAXONOMY_PATH);
extract.push_back(&SEQ_MODE);
extract.push_back(&TARGET_TAX_ID);

// filter
filter.push_back(&PARAM_THREADS);
filter.push_back(&SEQ_MODE);
Expand Down
7 changes: 7 additions & 0 deletions src/commons/LocalParameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ class LocalParameters : public Parameters {
}

std::vector<MMseqsParameter*> classify;
std::vector<MMseqsParameter*> extract;
std::vector<MMseqsParameter*> filter;
std::vector<MMseqsParameter*> exclusiontest_hiv;
std::vector<MMseqsParameter*> seqHeader2TaxId;
Expand Down Expand Up @@ -59,6 +60,9 @@ class LocalParameters : public Parameters {
PARAMETER(MIN_SS_MATCH)
PARAMETER(TIE_RATIO)

// extract
PARAMETER(TARGET_TAX_ID)

// DB build parameters
PARAMETER(LIBRARY_PATH)
PARAMETER(TAXONOMY_PATH)
Expand Down Expand Up @@ -109,6 +113,9 @@ class LocalParameters : public Parameters {
int minSSMatch;
float tieRatio;

// Extract
int targetTaxId;

// Database creation
std::string tinfoPath;
std::string libraryPath;
Expand Down
92 changes: 88 additions & 4 deletions src/commons/Reporter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include "taxonomyreport.cpp"

Reporter::Reporter(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxonomy(taxonomy) {
if (par.targetTaxId != 0) {return;}
if (par.contamList == "") { // classify module
if (par.seqMode == 2) {
outDir = par.filenames[3];
Expand All @@ -13,10 +14,7 @@ Reporter::Reporter(const LocalParameters &par, NcbiTaxonomy *taxonomy) : taxonom
// Output file names
reportFileName = outDir + + "/" + jobId + "_report.tsv";
readClassificationFileName = outDir + "/" + jobId + "_classifications.tsv";
}



}
}

void Reporter::openReadClassificationFile() {
Expand Down Expand Up @@ -105,4 +103,90 @@ unsigned int Reporter::cladeCountVal(const std::unordered_map<TaxID, TaxonCounts
} else {
return it->second.cladeCount;
}
}

void Reporter::getReadsClassifiedToClade(TaxID cladeId,
const string &readClassificationFileName,
vector<size_t> &readIdxs) {
FILE *results = fopen(readClassificationFileName.c_str(), "r");
if (!results) {
perror("Failed to open read-by-read classification file");
return;
}

char line[4096];
size_t idx = 0;
// int classification;

while (fgets(line, sizeof(line), results)) {
int taxId;
if (sscanf(line, "%*s %*s %d", &taxId) == 1) {
if (taxonomy->IsAncestor(cladeId, taxId)) {
readIdxs.push_back(idx);
}
}
idx++;
}

fclose(results);
}

void Reporter::printSpecifiedReads(const vector<size_t> & readIdxs,
const string & readFileName,
const string & outFileName) {
// Check FASTA or FASTQ
KSeqWrapper* tempKseq = KSeqFactory(readFileName.c_str());
tempKseq->ReadEntry();
bool isFasta = tempKseq->entry.qual.l == 0;
delete tempKseq;

KSeqWrapper* kseq = KSeqFactory(readFileName.c_str());

FILE *outFile = fopen(outFileName.c_str(), "w");
if (!outFile) {
perror("Failed to open file");
return;
}

size_t readCnt = 0;
size_t idx = 0;

if (isFasta) {
while (kseq->ReadEntry()) {
if (readCnt == readIdxs[idx]) {
fprintf(outFile, ">%s\n%s\n", kseq->entry.name.s, kseq->entry.sequence.s);
idx++;
if (idx == readIdxs.size()) {
break;
}
}
readCnt++;
}
} else {
while (kseq->ReadEntry()) {
if (readCnt == readIdxs[idx]) {
fprintf(outFile, "@%s", kseq->entry.name.s);
if (kseq->entry.comment.l > 0) {
fprintf(outFile, " %s\n", kseq->entry.comment.s);
} else {
fprintf(outFile, "\n");
}
fprintf(outFile, "%s\n", kseq->entry.sequence.s);
fprintf(outFile, "+%s", kseq->entry.name.s);
if (kseq->entry.comment.l > 0) {
fprintf(outFile, " %s\n", kseq->entry.comment.s);
} else {
fprintf(outFile, "\n");
}
fprintf(outFile, "%s\n", kseq->entry.qual.s);

idx++;
if (idx == readIdxs.size()) {
break;
}
}
readCnt++;
}
}
delete kseq;
}
11 changes: 10 additions & 1 deletion src/commons/Reporter.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
#include <unordered_map>
#include "NcbiTaxonomy.h"
#include "LocalParameters.h"
#include "KSeqWrapper.h"

using namespace std;


class Reporter {
private:
string outDir;
Expand All @@ -35,6 +35,15 @@ class Reporter {

unsigned int cladeCountVal(const std::unordered_map<TaxID, TaxonCounts> &map, TaxID key);

// Extract reads classified to a specific clade
void getReadsClassifiedToClade(TaxID cladeId,
const string &readClassificationFileName,
vector<size_t> &readIdxs);

void printSpecifiedReads(const vector<size_t> & readIdxs,
const string & readFileName,
const string & outFileName);

// Setter
void setReportFileName(const string &reportFileName) {
Reporter::reportFileName = reportFileName;
Expand Down
11 changes: 10 additions & 1 deletion src/metabuli.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,19 @@ std::vector<Command> commands = {
"Jaebeom Kim <[email protected]>",
"<i:query file(s)> <i:database directory> <o:output directory> <job ID> ",
CITATION_SPACEPHARER,
{{"FASTA", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfile},
{{"FASTA/Q", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfile},
{"DB dir", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory},
{"out dir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory},
{"job ID", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}},
{"extract", extract, &localPar.extract, COMMAND_MAIN,
"It extracts reads classified into a certain taxon. It should be used after classification.",
nullptr,
"Jaebeom Kim <[email protected]>",
"<i:query file(s)> <i:read-by-read result> <i:database directory>",
CITATION_SPACEPHARER,
{{"FASTA/Q", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfile},
{"read-by-read result", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
{"DB dir", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}}},
{"grade", grade, &localPar.grade, COMMAND_EXPERT,
"Grade the classification result (only for benchmarking)",
nullptr,
Expand Down
1 change: 1 addition & 0 deletions src/workflow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ set(workflow_source_files
workflow/add_to_library.cpp
workflow/build.cpp
workflow/filter.cpp
workflow/extract.cpp
PARENT_SCOPE)
93 changes: 93 additions & 0 deletions src/workflow/extract.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
#include "LocalParameters.h"
#include "FileUtil.h"
#include "common.h"
#include "Reporter.h"

void setExtractDefaults(LocalParameters & par){
par.taxonomyPath = "" ;
par.seqMode = 2;
par.targetTaxId = 0;
}

int extract(int argc, const char **argv, const Command& command)
{
LocalParameters & par = LocalParameters::getLocalInstance();
setExtractDefaults(par);
par.parseParameters(argc, argv, command, true, Parameters::PARSE_ALLOW_EMPTY, 0);

if (par.seqMode == 2) {
// Check if the second argument is a directory
if (FileUtil::directoryExists(par.filenames[2].c_str())) {
cerr << "For '--seq-mode 2', please provide two query files." << endl;
exit(1);
}
} else {
// Check if the second argument is file
if (!FileUtil::directoryExists(par.filenames[2].c_str())) {
cerr << "For '--seq-mode 1' and '--seq-mode 3', please provide one query file." << endl;
exit(1);
}
}

if (par.targetTaxId == 0) {
cerr << "Please provide a target taxon ID with --tax-id parameter." << endl;
exit(1);
}

string classificationFileName = par.filenames[1 + (par.seqMode == 2)];
string dbDir = par.filenames[2 + (par.seqMode == 2)];
TaxID targetTaxID = par.targetTaxId;

cout << "Loading taxonomy ... " << endl;
NcbiTaxonomy *taxonomy = loadTaxonomy(dbDir, par.taxonomyPath);
Reporter reporter(par, taxonomy);

vector<size_t> readIdxs;

cout << "Extracting reads classified to taxon " << targetTaxID << " ... " << flush;
reporter.getReadsClassifiedToClade(targetTaxID, classificationFileName, readIdxs);
cout << "done." << endl;

string queryFileName = par.filenames[0];
size_t lastDotPos = queryFileName.find_last_of('.');
string baseName = "";
string extension = "";

if (queryFileName.substr(lastDotPos) == ".gz") {
lastDotPos = queryFileName.substr(0, lastDotPos).find_last_of('.');
baseName = queryFileName.substr(0, lastDotPos);
extension = queryFileName.substr(lastDotPos);
// Remove the last ".gz" from the extension
extension = extension.substr(0, extension.length() - 3);
} else {
baseName = queryFileName.substr(0, lastDotPos);
extension = queryFileName.substr(lastDotPos);
}
string outFileName = baseName + "_" + to_string(targetTaxID) + extension;

cout << "Writing extracted reads to " << outFileName << " ... " << flush;
reporter.printSpecifiedReads(readIdxs, queryFileName, outFileName);
cout << "done." << endl;

if (par.seqMode == 2) {
queryFileName = par.filenames[1];
lastDotPos = queryFileName.find_last_of('.');

if (queryFileName.substr(lastDotPos) == ".gz") {
lastDotPos = queryFileName.substr(0, lastDotPos).find_last_of('.');
baseName = queryFileName.substr(0, lastDotPos);
extension = queryFileName.substr(lastDotPos);
extension = extension.substr(0, extension.length() - 3);
} else {
baseName = queryFileName.substr(0, lastDotPos);
extension = queryFileName.substr(lastDotPos);
}
outFileName = baseName + "_" + to_string(targetTaxID) + extension;
cout << "Writing extracted reads to " << outFileName << " ... " << flush;
reporter.printSpecifiedReads(readIdxs, queryFileName, outFileName);
cout << "done." << endl;
}

delete taxonomy;
return 0;
}

0 comments on commit 5d950ad

Please sign in to comment.