From 5a734f9926af0d215246f4bc47738281e1006aba Mon Sep 17 00:00:00 2001
From: Samaneh Saadat
Date: Thu, 13 Jun 2024 16:40:13 -0700
Subject: [PATCH] Add get_vocabulary, id_to_token and token_to_id methods to
 ByteTokenizer and UnicodeCodepointTokenizer. (#1664)

---
 keras_nlp/src/tokenizers/byte_tokenizer.py        | 24 +++++++++++++++++++
 .../src/tokenizers/byte_tokenizer_test.py         | 14 +++++++++++
 .../tokenizers/unicode_codepoint_tokenizer.py     | 24 +++++++++++++++++++
 .../unicode_codepoint_tokenizer_test.py           | 14 +++++++++++
 4 files changed, 76 insertions(+)

diff --git a/keras_nlp/src/tokenizers/byte_tokenizer.py b/keras_nlp/src/tokenizers/byte_tokenizer.py
index 128aac277..deffb7edc 100644
--- a/keras_nlp/src/tokenizers/byte_tokenizer.py
+++ b/keras_nlp/src/tokenizers/byte_tokenizer.py
@@ -209,6 +209,12 @@ def vocabulary_size(self):
         """Get the integer size of the tokenizer vocabulary."""
         return 256
 
+    def get_vocabulary(self):
+        vocab = {}
+        for i in range(self.vocabulary_size()):
+            vocab[chr(i)] = i
+        return vocab
+
     def tokenize(self, inputs):
         if not isinstance(inputs, (tf.Tensor, tf.RaggedTensor)):
             inputs = tf.convert_to_tensor(inputs)
@@ -264,6 +270,24 @@ def detokenize(self, inputs):
             outputs = tf.squeeze(outputs, 0)
         return outputs
 
+    def id_to_token(self, id):
+        """Convert an integer id to a string token."""
+        if id >= self.vocabulary_size() or id < 0:
+            raise ValueError(
+                f"`id` must be in range [0, {self.vocabulary_size() - 1}]. "
+                f"Received: {id}"
+            )
+        return chr(id)
+
+    def token_to_id(self, token):
+        """Convert a string token to an integer id."""
+        id = ord(token)
+        if id >= self.vocabulary_size():
+            raise ValueError(
+                f"Token {token} is not supported by `ByteTokenizer`."
+            )
+        return id
+
     def get_config(self):
         config = super().get_config()
         config.update(
diff --git a/keras_nlp/src/tokenizers/byte_tokenizer_test.py b/keras_nlp/src/tokenizers/byte_tokenizer_test.py
index 0ae09688d..efa323d44 100644
--- a/keras_nlp/src/tokenizers/byte_tokenizer_test.py
+++ b/keras_nlp/src/tokenizers/byte_tokenizer_test.py
@@ -222,3 +222,17 @@ def test_config(self):
             tokenizer(input_data),
             cloned_tokenizer(input_data),
         )
+
+    def test_token_to_id(self):
+        input_tokens = ["f", "u", "n"]
+        expected_ids = [102, 117, 110]
+        tokenizer = ByteTokenizer()
+        ids = [tokenizer.token_to_id(t) for t in input_tokens]
+        self.assertAllEqual(ids, expected_ids)
+
+    def test_id_to_token(self):
+        input_ids = [102, 117, 110]
+        expected_tokens = ["f", "u", "n"]
+        tokenizer = ByteTokenizer()
+        tokens = [tokenizer.id_to_token(i) for i in input_ids]
+        self.assertAllEqual(tokens, expected_tokens)
diff --git a/keras_nlp/src/tokenizers/unicode_codepoint_tokenizer.py b/keras_nlp/src/tokenizers/unicode_codepoint_tokenizer.py
index 8403f6fb5..8afaf2786 100644
--- a/keras_nlp/src/tokenizers/unicode_codepoint_tokenizer.py
+++ b/keras_nlp/src/tokenizers/unicode_codepoint_tokenizer.py
@@ -280,6 +280,12 @@ def vocabulary_size(self):
         size was provided"""
         return self._vocabulary_size
 
+    def get_vocabulary(self):
+        vocab = {}
+        for i in range(self.vocabulary_size()):
+            vocab[chr(i)] = i
+        return vocab
+
     def tokenize(self, inputs):
         if not isinstance(inputs, (tf.Tensor, tf.RaggedTensor)):
             inputs = tf.convert_to_tensor(inputs)
@@ -331,3 +337,21 @@ def detokenize(self, inputs):
         if unbatched:
             outputs = tf.squeeze(outputs, 0)
         return outputs
+
+    def id_to_token(self, id):
+        """Convert an integer id to a string token."""
+        if id >= self.vocabulary_size() or id < 0:
+            raise ValueError(
+                f"`id` must be in range [0, {self.vocabulary_size() - 1}]. "
+                f"Received: {id}"
+            )
+        return chr(id)
+
+    def token_to_id(self, token):
+        """Convert a string token to an integer id."""
+        id = ord(token)
+        if id >= self.vocabulary_size():
+            raise ValueError(
+                f"Token {token} is not supported by `UnicodeCodepointTokenizer`."
+            )
+        return id
diff --git a/keras_nlp/src/tokenizers/unicode_codepoint_tokenizer_test.py b/keras_nlp/src/tokenizers/unicode_codepoint_tokenizer_test.py
index 4f324da15..a759db9ec 100644
--- a/keras_nlp/src/tokenizers/unicode_codepoint_tokenizer_test.py
+++ b/keras_nlp/src/tokenizers/unicode_codepoint_tokenizer_test.py
@@ -280,3 +280,17 @@ def test_config(self):
             tokenizer(input_data),
             cloned_tokenizer(input_data),
         )
+
+    def test_token_to_id(self):
+        input_tokens = ["ب", "و", "خ"]
+        expected_ids = [1576, 1608, 1582]
+        tokenizer = UnicodeCodepointTokenizer(vocabulary_size=2000)
+        ids = [tokenizer.token_to_id(t) for t in input_tokens]
+        self.assertAllEqual(ids, expected_ids)
+
+    def test_id_to_token(self):
+        input_ids = [1576, 1608, 1582]
+        expected_tokens = ["ب", "و", "خ"]
+        tokenizer = UnicodeCodepointTokenizer(vocabulary_size=2000)
+        tokens = [tokenizer.id_to_token(i) for i in input_ids]
+        self.assertAllEqual(tokens, expected_tokens)