Skip to content

Commit

Permalink
Comments, cleaning unused options, Style changes
Browse files Browse the repository at this point in the history
Fixes style and consistency for textops.h (#10, #11, #12).

Some cleanup of code leftover from previous edits, which are no longer
required.
  • Loading branch information
[email protected] committed Jan 13, 2021
1 parent e4c0213 commit 1fd4a44
Show file tree
Hide file tree
Showing 7 changed files with 57 additions and 39 deletions.
6 changes: 5 additions & 1 deletion src/bergamot/batcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,11 @@ void Batcher::addSentenceWithPriority(RequestSentence &sentence) {
}

void Batcher::cleave_batch(RequestSentences &sentences) {
/* Temporary stub, needs improvement this section */
// For now simply iterates on buckets and converts batches greedily. This
// has to be enhanced with optimizing over priority. The baseline
// implementation should at least be as fast as marian's maxi-batch with full
// corpus size as maxi-batch size.

int segments_added = 0;
int current_input_tokens = 0;
int padded_batch_size = 0;
Expand Down
2 changes: 0 additions & 2 deletions src/bergamot/batcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@

namespace marian {
namespace bergamot {
struct RequestSentence;

class Batcher {
unsigned int max_input_tokens_;
unsigned int max_input_sentence_tokens_;
Expand Down
19 changes: 2 additions & 17 deletions src/bergamot/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,27 +15,12 @@
int main(int argc, char *argv[]) {
marian::ConfigParser cp(marian::cli::mode::translation);

cp.addOption<int>("--port,-p", "Server Options", "server port", 18080);
cp.addOption<int>("--queue-timeout", "Server Options",
"max wait time (in ms) for new data before an underfull "
"batch is launched",
10);
cp.addOption<size_t>(
"--max-workers", "Bergamot Options",
"Maximum number of worker threads to deploy when using CPU.",
std::thread::hardware_concurrency());
cp.addOption<std::string>("--server-root", "Server Options",
"server's document root directory",
"${HOME}/marian/lib/ui/");
cp.addOption<std::string>(
"--ssplit-prefix-file", "Server Options",
"--ssplit-prefix-file", "Bergamot Options",
"File with nonbreaking prefixes for sentence splitting.");

cp.addOption<std::string>("--ssplit-mode", "Server Options",
"[paragraph, sentence, wrapped_text]");
cp.addOption<std::string>("--source-language", "Server Options",
"source language of translation service");
cp.addOption<std::string>("--target-language", "Server Options",
"target language of translation service");

cp.addOption<int>(
"--max-input-sentence-tokens", "Bergamot Options",
Expand Down
40 changes: 28 additions & 12 deletions src/bergamot/request.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,15 @@
// translating by the workers (BatchTranslator). Request is to be extended with
// notions of Priority (sequence, user-given).
//
// RequestSentence: is a mapping of (index, Request*). This provides the
// RequestSentence: is a tuple of (index, Request*). This provides the
// batching mechanism access to the segment within the request. The backref to
// Request allows event triggering the barrier upon completion of the last
// sentence by a worker.
//
// PCItem: is a vector of RequestSentences and the corresponding Segments,
// which is what the ProducerConsumer queue holds. Can probably get rid of
// Segment here and use RequestSentence directly to construct batches.
// Separation is worker(BatchTranslator) need not be aware of the notion of
// Request, but only a Batch of segments.
// PCItem: is a vector of RequestSentences and a batchNumber, which is what the
// PCQueue holds. The batches are constructed from segments returned by a
// RequestSentence. Can be enhanced with paddingSize, countTokens eventually for
// logging.

#ifndef SRC_BERGAMOT_REQUEST_H_
#define SRC_BERGAMOT_REQUEST_H_
Expand All @@ -38,21 +37,20 @@ class Request {
private:
unsigned int Id_;
string_view reference_;
std::atomic<int> counter_;
std::vector<Ptr<Vocab const>> *vocabs_;

UPtr<Segments> segments_;
UPtr<SourceAlignments> sourceAlignments_;
std::promise<TranslationResult> response_;
std::vector<Ptr<History>> histories_;
std::atomic<int> counter_;
std::vector<Ptr<Vocab const>> *vocabs_;

std::promise<TranslationResult> response_;

public:
Request(unsigned int, std::vector<Ptr<Vocab const>> &, string_view,
UPtr<Segments>, UPtr<SourceAlignments>,
std::promise<TranslationResult>);

void processHistory(int index, Ptr<History>);
void completeRequest();

// Obtain the count of tokens in a segment. Used to insert sentence from
// multiple requests into the corresponding size bucket.
int segmentTokens(int) const;
Expand All @@ -63,7 +61,15 @@ class Request {
// Obtains a segment to create a batch of segments among several requests.
Segment getSegment(int) const;

// For notions of priority among requests (used to enable <set> in Batcher).
bool operator<(const Request &) const;

// Processes a history obtained after translating in a heterogenous batch
// compiled from requests.
void processHistory(int index, Ptr<History>);

// On completion of last segment, sets value of the promise.
void completeRequest();
};

class RequestSentence {
Expand All @@ -73,6 +79,8 @@ class RequestSentence {

public:
RequestSentence(int, Request &);

// Returns token in Segment corresponding to index.
int numTokens();
Segment getUnderlyingSegment() const;
void completeSentence(Ptr<History>);
Expand All @@ -85,16 +93,24 @@ struct PCItem {
int batchNumber;
UPtr<RequestSentences> sentences;

// PCItem should be default constructible. Default constructed element is
// poison.
PCItem() : batchNumber(-1), sentences(nullptr) {}

// PCItem constructor to construct a legit PCItem.
PCItem(int batchNumber, UPtr<RequestSentences> sentences)
: batchNumber(batchNumber), sentences(std::move(sentences)) {}

// Overloads std::swap for PCQueue such that unique_ptr move is managed with
// swap. Using non-swapped versions lead to multiple copies and is not
// interoperable with unique_ptr
friend void swap(PCItem &a, PCItem &b) {
using std::swap;
swap(a.batchNumber, b.batchNumber);
swap(a.sentences, b.sentences);
}

// Convenience function to determine poison.
bool isPoison() { return (batchNumber == -1); }
};

Expand Down
12 changes: 11 additions & 1 deletion src/bergamot/service.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,31 +33,41 @@ Service::Service(Ptr<Options> options)
std::future<TranslationResult> Service::queue(const string_view &input) {
// @TODO(jerin): Place a queue to keep track of requests here.

// Create segments and sourceAlignments, load via TextProcessor
UPtr<Segments> segments = UNew<Segments>();
UPtr<SourceAlignments> sourceAlignments = UNew<SourceAlignments>();

text_processor_.query_to_segments(input, *segments.get(),
*sourceAlignments.get());

// Create promise and obtain future.
std::promise<TranslationResult> translationResultPromise;
auto future = translationResultPromise.get_future();

// Move ownership and construct request.
UPtr<Request> request = UNew<Request>(
requestId_++, vocabs_, input, std::move(segments),
std::move(sourceAlignments), std::move(translationResultPromise));

// Adding sentences from a request to batcher.
for (int i = 0; i < request->numSegments(); i++) {
RequestSentence requestSentence(i, *request.get());
batcher_.addSentenceWithPriority(requestSentence);
}

// Moving request into service Ownership to keep UPtr from deleting once out
// of scope. Alternative: Use shared_ptr? Overhead doesn't seem to be much.
requests_.push(std::move(request));

// Construct batches of RequestSentences and add to PCQueue for workers to
// consume.
UPtr<RequestSentences> batchSentences;

int numSentences;
do {
batchSentences = UNew<std::vector<RequestSentence>>();
batcher_.cleave_batch(*batchSentences.get());

// Using batchSentences-> directly with move-semantics lead to segfault.
numSentences = batchSentences->size();

if (numSentences > 0) {
Expand Down
6 changes: 3 additions & 3 deletions src/bergamot/textops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ void TextProcessor::query_to_segments(const string_view &query,
offset += max_input_sentence_tokens_) {
auto start = tokenized_sentence.begin() + offset;
Segment segment(start, start + max_input_sentence_tokens_);
segment.push_back(tokenizer_.vocabs_[0]->getEosId());
segment.push_back(tokenizer_.sourceEosId());
segments.push_back(segment);

auto astart = snt_alignment.begin() + offset;
Expand All @@ -101,7 +101,7 @@ void TextProcessor::query_to_segments(const string_view &query,
if (offset < max_input_sentence_tokens_) {
auto start = tokenized_sentence.begin() + offset;
Segment segment(start, tokenized_sentence.end());
segment.push_back(tokenizer_.vocabs_[0]->getEosId());
segment.push_back(tokenizer_.sourceEosId());
segments.push_back(segment);

auto astart = snt_alignment.begin() + offset;
Expand All @@ -110,7 +110,7 @@ void TextProcessor::query_to_segments(const string_view &query,
}

} else {
tokenized_sentence.push_back(tokenizer_.vocabs_[0]->getEosId());
tokenized_sentence.push_back(tokenizer_.sourceEosId());
segments.push_back(tokenized_sentence);
sourceAlignments.push_back(snt_alignment);
}
Expand Down
11 changes: 8 additions & 3 deletions src/bergamot/textops.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,19 +30,24 @@ class SentenceSplitter {
};

class Tokenizer {
public:
private:
std::vector<Ptr<Vocab const>> vocabs_;
bool inference_;
bool addEOS_;

public:
explicit Tokenizer(Ptr<Options>);
Segment tokenize(string_view const &, SourceAlignment &);
Word sourceEosId() { return vocabs_.front()->getEosId(); };
};

class TextProcessor {
public:
private:
Tokenizer tokenizer_;
unsigned int max_input_sentence_tokens_;
SentenceSplitter sentence_splitter_;
unsigned int max_input_sentence_tokens_;

public:
explicit TextProcessor(Ptr<Options>);
void query_to_segments(const string_view &query, Segments &,
SourceAlignments &);
Expand Down

0 comments on commit 1fd4a44

Please sign in to comment.