Skip to content

Commit

Permalink
fixing an issue with tokenizer vocab_size mismatch
Browse files Browse the repository at this point in the history
  • Loading branch information
PicoCreator committed Aug 21, 2023
1 parent 5191ada commit 5b8320b
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 3 deletions.
2 changes: 1 addition & 1 deletion RWKV-v4neo/src/dataflow/trie_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def find_longest(self, key:bytes, idx:int=0):

class TRIE_TOKENIZER():
def __init__(self, file_name):
self.vocab_size = 65525
self.vocab_size = 65529
self.idx2token = {}
sorted = [] # must be already sorted
with open(file_name, "r", encoding="utf-8") as f:
Expand Down
11 changes: 9 additions & 2 deletions RWKV-v5/src/dataflow/trie_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def find_longest(self, key:bytes, idx:int=0):

class TRIE_TOKENIZER():
def __init__(self, file_name):
self.vocab_size = 65525
self.vocab_size = 65529
self.idx2token = {}
sorted = [] # must be already sorted
with open(file_name, "r", encoding="utf-8") as f:
Expand Down Expand Up @@ -130,7 +130,14 @@ def get_world_tokenizer():

# Provide a global function for the world tokenizer
def world_tokenizer_encode(src):
    """Encode `src` with the shared world tokenizer and validate the result.

    Args:
        src: Input to pass to the tokenizer's `encode` (presumably a str —
            TODO confirm against get_world_tokenizer()'s contract).

    Returns:
        The token-id sequence produced by the tokenizer.

    Raises:
        Exception: If any token id falls outside the valid vocabulary
            range [0, 65528] (vocab_size is 65529 per this commit).
    """
    res = get_world_tokenizer().encode(src)

    # Reject any out-of-range token id: valid ids are 0 .. vocab_size-1,
    # i.e. 0 .. 65528 for vocab_size == 65529. The original check used
    # `i > 65529`, which wrongly accepted the invalid id 65529, and its
    # comment still referenced the pre-bump bound 65525.
    for i in res:
        if i < 0 or i >= 65529:
            raise Exception(f"world_tokenizer_encode: Invalid token {i} from: {src}")

    return res

########################################################################################################
# Tensor specific tokenizer
Expand Down

0 comments on commit 5b8320b

Please sign in to comment.