Skip to content

Commit

Permalink
Add tokenizer test
Browse files Browse the repository at this point in the history
  • Loading branch information
jonatanklosko committed Jul 28, 2023
1 parent b2007e6 commit d9a5a08
Showing 1 changed file with 16 additions and 0 deletions.
16 changes: 16 additions & 0 deletions test/bumblebee/text/bert_tokenizer_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -70,5 +70,21 @@ defmodule Bumblebee.Text.BertTokenizerTest do
assert_equal(inputs["start_offsets"], Nx.tensor([[0, 0, 5, 14, 19, 25, 0]]))
assert_equal(inputs["end_offsets"], Nx.tensor([[0, 4, 13, 18, 25, 26, 0]]))
end

test "encoding with multiple lengths" do
  assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "bert-base-cased"})

  # The same list of candidate lengths is passed for both inputs; the test
  # expects the short text to come out with sequence length 8 and the longer
  # text with sequence length 16.
  lengths = [8, 16]

  short_text = "This is short."
  short_inputs = Bumblebee.apply_tokenizer(tokenizer, short_text, length: lengths)

  assert Nx.shape(short_inputs["input_ids"]) == {1, 8}

  long_text = "This is definitely much longer than the above."
  long_inputs = Bumblebee.apply_tokenizer(tokenizer, long_text, length: lengths)

  assert Nx.shape(long_inputs["input_ids"]) == {1, 16}
end
end
end

0 comments on commit d9a5a08

Please sign in to comment.