-
Notifications
You must be signed in to change notification settings - Fork 0
/
tokenizer_test.go
84 lines (81 loc) · 2.12 KB
/
tokenizer_test.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
package utc_test
import (
"testing"
"github.com/go-aie/paddle"
"github.com/go-aie/utc"
"github.com/google/go-cmp/cmp"
)
// TestPromptTokenizer_Encode is a table-driven test that feeds a list of
// utc.Input values to the prompt tokenizer's Encode method and compares the
// returned utc.Encoding against a hand-written expectation.
//
// The tokenizer is built from the path "./utc-large/vocab.txt"; the test
// aborts if utc.NewPromptTokenizer returns an error for it.
func TestPromptTokenizer_Encode(t *testing.T) {
	tests := []struct {
		inInputs     []utc.Input
		wantEncoding utc.Encoding
	}{
		{
			inInputs: []utc.Input{
				{
					Text:       "[O-MASK]肯定[O-MASK]否定",
					Positions:  0,
					TokenTypes: 1,
					DoTruncate: false,
				},
				{
					Text:       "[SEP]",
					Positions:  0,
					TokenTypes: 0,
					DoTruncate: false,
				},
				{
					Text:       "好的",
					Positions:  -1,
					TokenTypes: 0,
					DoTruncate: true,
				},
				{
					Text:       "[SEP]",
					Positions:  -1,
					TokenTypes: 1,
					DoTruncate: false,
				},
				{
					Text:       "",
					Positions:  -1,
					TokenTypes: 1,
					DoTruncate: true,
				},
			},
			wantEncoding: utc.Encoding{
				// Every ID slice below has 12 entries, matching the 12x12 mask.
				InputIDs:     []int{1, 17964, 1566, 91, 17964, 955, 91, 2, 170, 5, 2, 2},
				SoftTokenIDs: []int{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
				PositionIDs:  []int{0, 0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 0},
				TokenTypeIDs: []int{0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0},
				// NOTE(review): the -10000 cells sit exactly where rows 1-3 meet
				// columns 4-6 and vice versa, i.e. between the two [O-MASK]
				// option spans; presumably they stop the options from attending
				// to each other — confirm against the tokenizer implementation.
				AttentionMask: paddle.NewMatrix(12, 12, []float32{
					0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
					0, 0, 0, 0, -10000, -10000, -10000, 0, 0, 0, 0, 0,
					0, 0, 0, 0, -10000, -10000, -10000, 0, 0, 0, 0, 0,
					0, 0, 0, 0, -10000, -10000, -10000, 0, 0, 0, 0, 0,
					0, -10000, -10000, -10000, 0, 0, 0, 0, 0, 0, 0, 0,
					0, -10000, -10000, -10000, 0, 0, 0, 0, 0, 0, 0, 0,
					0, -10000, -10000, -10000, 0, 0, 0, 0, 0, 0, 0, 0,
					0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
					0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
					0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
					0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
					0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				}),
				OMaskPositions: []int{1, 4},
				ClsPositions:   7,
			},
		},
	}

	// The constructor arguments are the same for every case, so build the
	// tokenizer once instead of once per iteration.
	tk, err := utc.NewPromptTokenizer("./utc-large/vocab.txt", true, 512)
	if err != nil {
		t.Fatalf("err: %v\n", err)
	}

	for _, tt := range tests {
		gotEncoding := tk.Encode(tt.inInputs)

		// cmp.Diff(x, y) prefixes values found only in x with "-" and values
		// found only in y with "+", and returns "" exactly when the two are
		// equal. Passing want first makes the "(-want +got)" label accurate and
		// removes the need for a separate cmp.Equal call.
		if diff := cmp.Diff(tt.wantEncoding, gotEncoding); diff != "" {
			t.Errorf("Encode() mismatch (-want +got):\n%s", diff)
		}
	}
}