Update to MXNet 1.3.0 (#534)
Update to MXNet 1.3.0 to make use of the following new features:
- use of MXNet unravel_index & topk
- use of MXNet logical operators
- topk is now a HybridBlock.
- Inference HybridBlocks use static_alloc and static_shape
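A minimal sketch (not part of the commit) illustrating the MXNet 1.3 APIs listed above; the array values and sizes are illustrative only:

```python
import mxnet as mx

scores = mx.nd.array([[0.1, 0.9, 0.4],
                      [0.7, 0.2, 0.5]])

# mx.nd.topk: lowest-k values and their flat indices (is_ascend=True).
values, indices = mx.nd.topk(scores.reshape((1, -1)), axis=1, k=2,
                             ret_typ='both', is_ascend=True)

# mx.nd.unravel_index: map flat indices back to (row, column) coordinates,
# replacing round-trips through numpy.unravel_index.
flat = mx.nd.cast(indices, 'int32').reshape((-1,))
rows_cols = mx.nd.unravel_index(flat, scores.shape)

# Logical operators: elementwise boolean logic without arithmetic/clip tricks.
mask = mx.nd.broadcast_logical_or(scores > 0.8, scores < 0.2)

# Hybridization with static allocation/shapes trades flexibility for speed;
# input shapes should stay constant across calls.
net = mx.gluon.nn.Dense(4)
net.initialize()
net.hybridize(static_alloc=True, static_shape=True)
out = net(mx.nd.ones((2, 8)))
```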
fhieber authored Sep 20, 2018
1 parent 3cca0c3 commit 5144d25
Showing 13 changed files with 163 additions and 67 deletions.
5 changes: 4 additions & 1 deletion CHANGELOG.md
@@ -10,8 +10,11 @@ Note that Sockeye has checks in place to not translate with an old model that wa

Each version section may have subsections for: _Added_, _Changed_, _Removed_, _Deprecated_, and _Fixed_.

## [1.18.55]
## [1.18.56]
### Changed
- Update to MXNet 1.3.0.post0

## [1.18.55]
- Renamed `contrib` to less-generic `sockeye_contrib`

## [1.18.54]
6 changes: 3 additions & 3 deletions README.md
@@ -47,7 +47,7 @@ Recent developments and changes are tracked in our [changelog](https://github.co

Sockeye requires:
- **Python3**
- [MXNet 1.2.1](https://github.com/apache/incubator-mxnet/tree/1.2.1)
- [MXNet 1.3.0](https://github.com/apache/incubator-mxnet/tree/1.3.0)
- numpy

## Installation
@@ -85,7 +85,7 @@ Depending on your version of CUDA, you can do this by running the following:
> pip install sockeye --no-deps -r requirements.gpu-cu${CUDA_VERSION}.txt
> rm requirements.gpu-cu${CUDA_VERSION}.txt
```
where `${CUDA_VERSION}` can be `75` (7.5), `80` (8.0), `90` (9.0), or `91` (9.1).
where `${CUDA_VERSION}` can be `75` (7.5), `80` (8.0), `90` (9.0), `91` (9.1), or `92` (9.2).

### Or: From Source

@@ -108,7 +108,7 @@ running the following:
> pip install -r requirements/requirements.gpu-cu${CUDA_VERSION}.txt
> pip install .
```
where `${CUDA_VERSION}` can be `75` (7.5), `80` (8.0), `90` (9.0), or `91` (9.1).
where `${CUDA_VERSION}` can be `75` (7.5), `80` (8.0), `90` (9.0), `91` (9.1), or `92` (9.2).

### Optional dependencies
In order to write training statistics to a Tensorboard event file for visualization, you can optionally install mxboard
2 changes: 1 addition & 1 deletion requirements/requirements.gpu-cu75.txt
@@ -1,4 +1,4 @@
pyyaml==3.12
mxnet-cu75mkl==1.2.1
mxnet-cu75mkl==1.3.0.post0
numpy>=1.14
typing
2 changes: 1 addition & 1 deletion requirements/requirements.gpu-cu80.txt
@@ -1,4 +1,4 @@
pyyaml==3.12
mxnet-cu80mkl==1.2.1
mxnet-cu80mkl==1.3.0.post0
numpy>=1.14
typing
2 changes: 1 addition & 1 deletion requirements/requirements.gpu-cu90.txt
@@ -1,4 +1,4 @@
pyyaml==3.12
mxnet-cu90mkl==1.2.1
mxnet-cu90mkl==1.3.0.post0
numpy>=1.14
typing
2 changes: 1 addition & 1 deletion requirements/requirements.gpu-cu91.txt
@@ -1,4 +1,4 @@
pyyaml==3.12
mxnet-cu91mkl==1.2.1
mxnet-cu91mkl==1.3.0.post0
numpy>=1.14
typing
4 changes: 4 additions & 0 deletions requirements/requirements.gpu-cu92.txt
@@ -0,0 +1,4 @@
pyyaml==3.12
mxnet-cu92mkl==1.3.0.post0
numpy>=1.14
typing
2 changes: 1 addition & 1 deletion requirements/requirements.txt
@@ -1,4 +1,4 @@
pyyaml==3.12
mxnet-mkl==1.2.1
mxnet-mkl==1.3.0.post0
numpy>=1.14
typing
2 changes: 1 addition & 1 deletion sockeye/__init__.py
@@ -11,4 +11,4 @@
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.

__version__ = '1.18.55'
__version__ = '1.18.56'
4 changes: 2 additions & 2 deletions sockeye/encoder.py
@@ -1243,8 +1243,8 @@ def encode(self,
transform = mx.sym.Dropout(data=transform, p=self.dropout)
# Connection
seg_embedding = gate * transform + (1 - gate) * seg_embedding
# (batch_size, seq_len/stride, outut_dim) aka
# (batch_size, encoded_seq_len, num_segment_emded)
# (batch_size, seq_len/stride, output_dim) aka
# (batch_size, encoded_seq_len, num_segment_embed)
seg_embedding = mx.sym.Reshape(data=seg_embedding,
shape=(-1, encoded_seq_len, self.output_dim))

141 changes: 112 additions & 29 deletions sockeye/inference.py
@@ -1052,32 +1052,46 @@ def __init__(self,

self._update_scores = UpdateScores()
self._update_scores.initialize(ctx=self.context)
self._update_scores.hybridize()
self._update_scores.hybridize(static_alloc=True, static_shape=True)

# topk function used in beam search
if self.skip_topk:
self._top = partial(utils.top1,
offset=self.offset)
# Vocabulary selection leads to different vocabulary sizes across requests. Hence, we cannot use a
# statically-shaped HybridBlock for the topk operation in this case and resort to the imperative
# topk function instead.
if self.restrict_lexicon:
if self.skip_topk:
self._top = partial(utils.top1, offset=self.offset) # type: Callable
else:
self._top = partial(utils.topk,
k=self.beam_size,
offset=self.offset,
use_mxnet_topk=True) # type: Callable
else:
self._top = partial(utils.topk,
k=self.beam_size,
batch_size=self.batch_size,
offset=self.offset,
use_mxnet_topk=self.context != mx.cpu()) # MXNet implementation is faster on GPUs
if self.skip_topk:
self._top = Top1(k=self.beam_size,
batch_size=self.batch_size) # type: mx.gluon.HybridBlock
self._top.initialize(ctx=self.context)
self._top.hybridize(static_alloc=True, static_shape=True)
else:
self._top = TopK(k=self.beam_size,
batch_size=self.batch_size,
vocab_size=len(self.vocab_target)) # type: mx.gluon.HybridBlock
self._top.initialize(ctx=self.context)
self._top.hybridize(static_alloc=True, static_shape=True)

self._sort_by_index = SortByIndex()
self._sort_by_index.initialize(ctx=self.context)
self._sort_by_index.hybridize()
self._sort_by_index.hybridize(static_alloc=True, static_shape=True)

self._update_finished = NormalizeAndUpdateFinished(pad_id=C.PAD_ID,
eos_id=self.vocab_target[C.EOS_SYMBOL],
length_penalty_alpha=self.length_penalty.alpha,
length_penalty_beta=self.length_penalty.beta)
self._update_finished.initialize(ctx=self.context)
self._update_finished.hybridize()
self._update_finished.hybridize(static_alloc=True, static_shape=True)

self._prune_hyps = PruneHypotheses(threshold=self.beam_prune, beam_size=self.beam_size)
self._prune_hyps.initialize(ctx=self.context)
self._prune_hyps.hybridize()
self._prune_hyps.hybridize(static_alloc=True, static_shape=True)

self.global_avoid_trie = None
if avoid_list is not None:
@@ -1663,11 +1677,8 @@ def _beam_search(self,
# (9) Sort the hypotheses within each sentence (normalization for finished hyps may have unsorted them).
folded_accumulated_scores = scores_accumulated.reshape((self.batch_size,
self.beam_size * scores_accumulated.shape[-1]))
indices = mx.nd.argsort(folded_accumulated_scores, axis=1)
best_hyp_indices = mx.nd.array(np.unravel_index(indices.astype(np.int32).asnumpy().ravel(),
scores_accumulated.shape),
dtype='int32',
ctx=self.offset.context)[0] + self.offset
indices = mx.nd.cast(mx.nd.argsort(folded_accumulated_scores, axis=1), dtype='int32').reshape((-1,))
best_hyp_indices, _ = mx.nd.unravel_index(indices, scores_accumulated.shape) + self.offset
best_hyp_indices_list.append(best_hyp_indices)
lengths = lengths.take(best_hyp_indices)
scores_accumulated = scores_accumulated.take(best_hyp_indices)
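For readers unfamiliar with the new operator, a standalone sketch (not from the commit) of the argsort-plus-`unravel_index` pattern above, with toy sizes:

```python
import mxnet as mx

batch_size, beam_size = 2, 3
# One accumulated score per hypothesis: shape (batch * beam, 1).
scores_accumulated = mx.nd.random.uniform(shape=(batch_size * beam_size, 1))
folded = scores_accumulated.reshape((batch_size, -1))

# Sort within each sentence, then map per-row positions back to
# (hypothesis, word) coordinates without leaving NDArray land.
indices = mx.nd.cast(mx.nd.argsort(folded, axis=1), dtype='int32').reshape((-1,))
best_hyp_indices, _ = mx.nd.unravel_index(indices, scores_accumulated.shape)

# Per-sentence offsets into the flattened (batch * beam) hypothesis axis.
offset = mx.nd.repeat(mx.nd.arange(0, batch_size * beam_size, beam_size,
                                   dtype='int32'), beam_size)
best_hyp_indices = best_hyp_indices + offset
```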
@@ -1844,6 +1855,82 @@ def hybrid_forward(self, F, indices, *args):
return [F.take(arg, indices) for arg in args]


class TopK(mx.gluon.HybridBlock):
"""
A HybridBlock for a statically-shaped batch-wise topk operation.
"""

def __init__(self, k: int, batch_size: int, vocab_size: int) -> None:
"""
:param k: The number of smallest scores to return.
:param batch_size: Number of sentences being decoded at once.
:param vocab_size: Vocabulary size.
"""
super().__init__()
self.k = k
self.batch_size = batch_size
self.vocab_size = vocab_size
with self.name_scope():
offset = mx.nd.repeat(mx.nd.arange(0, batch_size * k, k, dtype='int32'), k)
self.offset = self.params.get_constant(name='offset', value=offset)

def hybrid_forward(self, F, scores, offset):
"""
Get the lowest k elements per sentence from a `scores` matrix.
:param scores: Vocabulary scores for the next beam step. (batch_size * beam_size, target_vocabulary_size)
:param offset: Array to add to the hypothesis indices for offsetting in batch decoding.
:return: The row indices, column indices and values of the k smallest items in matrix.
"""
folded_scores = F.reshape(scores, shape=(self.batch_size, self.k * self.vocab_size))
values, indices = F.topk(folded_scores, axis=1, k=self.k, ret_typ='both', is_ascend=True)
indices = F.reshape(F.cast(indices, 'int32'), shape=(-1,))
unraveled = F.unravel_index(indices, shape=(self.batch_size * self.k, self.vocab_size))
best_hyp_indices, best_word_indices = F.split(unraveled, axis=0, num_outputs=2, squeeze_axis=True)
best_hyp_indices = best_hyp_indices + offset
values = F.reshape(values, shape=(-1, 1))
return best_hyp_indices, best_word_indices, values
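
A hypothetical usage sketch for the block above (sizes invented; mirrors how the Translator wires it up):

```python
import mxnet as mx

batch_size, beam_size, vocab_size = 2, 5, 100

top_k = TopK(k=beam_size, batch_size=batch_size, vocab_size=vocab_size)
top_k.initialize()
top_k.hybridize(static_alloc=True, static_shape=True)

# One row of vocabulary scores per active hypothesis.
scores = mx.nd.random.uniform(shape=(batch_size * beam_size, vocab_size))
best_hyp_indices, best_word_indices, values = top_k(scores)
# best_hyp_indices, best_word_indices: shape (batch_size * beam_size,), int32
# values: shape (batch_size * beam_size, 1), the k smallest scores per sentence
```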


class Top1(mx.gluon.HybridBlock):
"""
A HybridBlock for a statically-shaped batch-wise first-best operation.
Get the single lowest element per sentence from a `scores` matrix. Expects that
beam size is 1, for greedy decoding.
NOTE(mathmu): The current implementation of argmin in MXNet is much slower than topk with k=1.
"""
def __init__(self, k: int, batch_size: int) -> None:
"""
:param k: The number of smallest scores to return.
:param batch_size: Number of sentences being decoded at once.
"""
super().__init__()
with self.name_scope():
offset = mx.nd.repeat(mx.nd.arange(0, batch_size * k, k, dtype='int32'), k)
self.offset = self.params.get_constant(name='offset', value=offset)

def hybrid_forward(self, F, scores, offset):
"""
Get the single lowest element per sentence from a `scores` matrix. Expects that
beam size is 1, for greedy decoding.
:param scores: Vocabulary scores for the next beam step. (batch_size * beam_size, target_vocabulary_size)
:param offset: Array to add to the hypothesis indices for offsetting in batch decoding.
:return: The row indices, column indices and values of the smallest items in matrix.
"""
best_word_indices = F.cast(F.argmin(scores, axis=1), dtype='int32')
values = F.pick(scores, best_word_indices, axis=1)
values = F.reshape(values, shape=(-1, 1))

# for top1, the best hyp indices are equal to the plain offset
best_hyp_indices = offset

return best_hyp_indices, best_word_indices, values


class NormalizeAndUpdateFinished(mx.gluon.HybridBlock):
"""
A HybridBlock for normalizing newly finished hypotheses scores with LengthPenalty.
@@ -1860,8 +1947,8 @@ def __init__(self, pad_id: int,
self.length_penalty = LengthPenalty(alpha=length_penalty_alpha, beta=length_penalty_beta)

def hybrid_forward(self, F, best_word_indices, max_output_lengths, finished, scores_accumulated, lengths):
all_finished = ((best_word_indices == self.pad_id) + (best_word_indices == self.eos_id))
newly_finished = all_finished - finished
all_finished = F.broadcast_logical_or(best_word_indices == self.pad_id, best_word_indices == self.eos_id)
newly_finished = F.broadcast_logical_xor(all_finished, finished)
scores_accumulated = F.where(newly_finished,
scores_accumulated / self.length_penalty(lengths),
scores_accumulated)
@@ -1874,11 +1961,9 @@ def hybrid_forward(self, F, best_word_indices, max_output_lengths, finished, sco
# - extended with <pad>, or
# - extended with <eos>, or
# - at their maximum length.
finished = F.clip(
(best_word_indices == self.pad_id) +
(best_word_indices == self.eos_id) +
(F.cast(F.reshape(lengths, shape=(-1,)), 'int32') >= max_output_lengths),
a_min=0, a_max=1)
finished = F.broadcast_logical_or(F.broadcast_logical_or(best_word_indices == self.pad_id,
best_word_indices == self.eos_id),
(F.cast(F.reshape(lengths, shape=(-1,)), 'int32') >= max_output_lengths))

return finished, scores_accumulated, lengths
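
A standalone sketch (not from the commit) of the rewrite above: the pre-1.3 arithmetic-and-clip idiom and the new logical operators produce the same masks. The xor replacing `all_finished - finished` is equivalent here because finished hypotheses stay finished, so `finished` is always a subset of `all_finished`:

```python
import mxnet as mx

a = mx.nd.array([1, 0, 1, 0])
b = mx.nd.array([1, 1, 0, 0])

# Pre-1.3 idiom: addition plus clip stood in for boolean OR.
or_old = mx.nd.clip(a + b, a_min=0, a_max=1)                     # [1, 1, 1, 0]

# MXNet 1.3 operators say what they mean.
or_new = mx.nd.broadcast_logical_or(a, b)                        # [1, 1, 1, 0]
xor_new = mx.nd.broadcast_logical_xor(a, b)                      # [0, 1, 1, 0]
and_not = mx.nd.broadcast_logical_and(a, mx.nd.logical_not(b))   # [0, 0, 1, 0]
```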

@@ -1901,10 +1986,8 @@ def hybrid_forward(self, F, scores, finished, inactive, scores_accumulated, inf_
# infinity otherwise.
scores = F.broadcast_add(scores, scores_accumulated)
# pylint: disable=invalid-sequence-index
pad_id_scores = F.where(F.clip(finished - inactive, 0, 1),
scores_accumulated,
inf_array)
pad_id_scores = F.where(F.broadcast_logical_and(finished, F.logical_not(inactive)), scores_accumulated, inf_array)
# pad_dist. Shape: (batch*beam, vocab_size)
pad_dist = F.concat(pad_id_scores, pad_dist)
scores = F.where(finished + inactive, pad_dist, scores)
scores = F.where(F.broadcast_logical_or(finished, inactive), pad_dist, scores)
return scores
36 changes: 12 additions & 24 deletions sockeye/utils.py
@@ -33,8 +33,8 @@
import mxnet as mx
import numpy as np

from sockeye import __version__, constants as C
from sockeye.log import log_sockeye_version, log_mxnet_version
from . import __version__, constants as C
from .log import log_sockeye_version, log_mxnet_version

logger = logging.getLogger(__name__)

@@ -276,29 +276,26 @@ def top1(scores: mx.nd.NDArray,

def topk(scores: mx.nd.NDArray,
k: int,
batch_size: int,
offset: mx.nd.NDArray,
use_mxnet_topk: bool) -> Tuple[mx.nd.NDArray, mx.nd.NDArray, mx.nd.NDArray]:
"""
Get the lowest k elements per sentence from a `scores` matrix.
:param scores: Vocabulary scores for the next beam step. (batch_size * beam_size, target_vocabulary_size)
:param k: The number of smallest scores to return.
:param batch_size: Number of sentences being decoded at once.
:param offset: Array to add to the hypothesis indices for offsetting in batch decoding.
:param use_mxnet_topk: True to use the mxnet implementation or False to use the numpy one.
:return: The row indices, column indices and values of the k smallest items in matrix.
"""
# (batch_size, beam_size * target_vocab_size)
folded_scores = scores.reshape((batch_size, k * scores.shape[-1]))
folded_scores = scores.reshape((-1, k * scores.shape[-1]))
batch_size = folded_scores.shape[0]

if use_mxnet_topk:
# pylint: disable=unbalanced-tuple-unpacking
values, indices = mx.nd.topk(folded_scores, axis=1, k=k, ret_typ='both', is_ascend=True)
best_hyp_indices, best_word_indices = mx.nd.array(np.unravel_index(indices.astype(np.int32).asnumpy().ravel(),
scores.shape),
dtype='int32',
ctx=scores.context)
indices = mx.nd.cast(indices, 'int32').reshape((-1,))
best_hyp_indices, best_word_indices = mx.nd.unravel_index(indices, scores.shape)

else:
folded_scores = folded_scores.asnumpy()
@@ -455,14 +452,12 @@ def average_arrays(arrays: List[mx.nd.NDArray]) -> mx.nd.NDArray:
:param arrays: A list of NDArrays with the same shape that will be averaged.
:return: The average of the NDArrays in the same context as arrays[0].
"""
if not arrays:
raise ValueError("arrays is empty.")
if len(arrays) == 1:
return arrays[0]
check_condition(all(arrays[0].shape == a.shape for a in arrays), "nd array shapes do not match")
new_array = mx.nd.zeros(arrays[0].shape, dtype=arrays[0].dtype, ctx=arrays[0].context)
for a in arrays:
new_array += a.as_in_context(new_array.context)
new_array /= len(arrays)
return new_array
return mx.nd.add_n(*arrays) / len(arrays)
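
The rewritten `average_arrays` sums all arrays with a single fused `add_n` instead of a Python loop of elementwise adds. A quick sketch of the equivalence (toy data):

```python
import mxnet as mx

arrays = [mx.nd.ones((2, 2)) * i for i in range(1, 4)]

# Old approach: one add per array.
loop_avg = mx.nd.zeros((2, 2))
for a in arrays:
    loop_avg += a
loop_avg /= len(arrays)

# New approach: a single fused sum.
fused_avg = mx.nd.add_n(*arrays) / len(arrays)

assert abs((loop_avg - fused_avg).sum().asscalar()) < 1e-6
```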


def get_num_gpus() -> int:
@@ -471,14 +466,7 @@ def get_num_gpus() -> int:
:return: The number of GPUs on the system.
"""
# TODO (domhant): Switch to mx.context.num_gpus() with mxnet version 1.3
for device_id in itertools.count():
try:
mx.nd.zeros((1,), ctx=mx.gpu(device_id))
except mx.MXNetError:
return device_id
# Note: Return statement to make mypy happy, the for loop is infinite, so an exception is the only way out.
return device_id + 1
return mx.context.num_gpus()


def get_gpu_memory_usage(ctx: List[mx.context.Context]) -> Dict[int, Tuple[int, int]]:
@@ -852,8 +840,8 @@ def infer_type(self, in_type):

def create_operator(self, ctx, shapes, dtypes):
return PrintValue(self.print_name,
print_grad=self.print_grad,
use_logger=self.use_logger)
print_grad=str(self.print_grad),
use_logger=str(self.use_logger))
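
Likely context for the `str(...)` casts above: keyword arguments of MXNet custom operators are serialized through the C API, so a `CustomOpProp` receives them as strings and must parse them back explicitly. A generic sketch of that pattern (the `scale_by` operator is invented for illustration, not part of Sockeye):

```python
import mxnet as mx

@mx.operator.register("scale_by")
class ScaleByProp(mx.operator.CustomOpProp):
    def __init__(self, factor='1.0'):
        super().__init__(need_top_grad=True)
        # kwargs arrive as strings, regardless of how the caller passed them.
        self.factor = float(factor)

    def list_arguments(self):
        return ['data']

    def infer_shape(self, in_shape):
        return in_shape, [in_shape[0]], []

    def create_operator(self, ctx, shapes, dtypes):
        return ScaleBy(self.factor)

class ScaleBy(mx.operator.CustomOp):
    def __init__(self, factor):
        super().__init__()
        self.factor = factor

    def forward(self, is_train, req, in_data, out_data, aux):
        self.assign(out_data[0], req[0], in_data[0] * self.factor)

    def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
        self.assign(in_grad[0], req[0], out_grad[0] * self.factor)

x = mx.nd.ones((2,))
y = mx.nd.Custom(x, factor=2.5, op_type='scale_by')  # factor is stringified en route
```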


def grouper(iterable: Iterable, size: int) -> Iterable: