-
Notifications
You must be signed in to change notification settings - Fork 0
/
model.go
58 lines (47 loc) · 1.28 KB
/
model.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
package tokenizer
import (
"github.com/sugarme/tokenizer"
"github.com/sugarme/tokenizer/model/wordlevel"
)
// RuneLevelVocab is the vocabulary source a RuneLevel model is built from and
// queried against.
type RuneLevelVocab interface {
	// TokenToID resolves token to its ID. RuneLevel.Tokenize calls it once per
	// rune and stops at the first non-nil error.
	TokenToID(token string) (int, error)

	// UnkToken returns the unknown-token string that NewRuneLevel passes to the
	// underlying word-level model.
	UnkToken() string

	// Vocab returns the token-to-ID map that NewRuneLevel passes to the
	// underlying word-level model.
	Vocab() map[string]int
}
// RuneLevel is a model tokenizer that splits each word into runes and maps
// each rune to an ID through its vocabulary.
//
// It embeds *wordlevel.WordLevel, so that type's exported methods are promoted
// onto RuneLevel unless RuneLevel declares a method of the same name.
type RuneLevel struct {
// Embedded word-level model; NewRuneLevel builds it from the vocabulary's
// Vocab() and UnkToken() values.
*wordlevel.WordLevel
// vocab is the lookup Tokenize uses to resolve each rune to an ID.
vocab RuneLevelVocab
}
// NewRuneLevel builds a RuneLevel model backed by vocab.
//
// The embedded word-level model is created from vocab.Vocab() and
// vocab.UnkToken(); vocab itself is kept for the per-rune lookups done by
// Tokenize. vocab must be non-nil because its methods are called right away.
func NewRuneLevel(vocab RuneLevelVocab) *RuneLevel {
	base := NewWordLevel(vocab.Vocab(), vocab.UnkToken())

	model := new(RuneLevel)
	model.WordLevel = base
	model.vocab = vocab
	return model
}
// Tokenize splits token into one sub-token per rune and resolves each rune to
// an ID through the vocabulary's TokenToID.
//
// Every returned Token has the rune (as a string) as its Value and Offsets
// holding the [start, end) byte positions of that rune within token. An empty
// token yields a nil slice and a nil error. If any lookup fails, Tokenize
// returns nil together with that lookup's error, unchanged.
//
// Offsets are taken from the real byte indices of the input. For valid UTF-8
// this matches summing the encoded rune lengths; for invalid UTF-8 each bad
// byte is reported as U+FFFD but still spans exactly the one byte it occupies,
// so offsets never run past len(token).
func (rl *RuneLevel) Tokenize(token string) ([]tokenizer.Token, error) {
	var tokens []tokenizer.Token

	// Ranging over the string (rather than over []rune(token)) yields each
	// rune's true starting byte index and avoids allocating a rune slice.
	for start, r := range token {
		// A rune ends where the next one begins, so close the previous
		// token's span now that the next start is known.
		if n := len(tokens); n > 0 {
			tokens[n-1].Offsets[1] = start
		}

		s := string(r)
		id, err := rl.vocab.TokenToID(s)
		if err != nil {
			return nil, err
		}

		// The end offset is provisional: it is correct for the final rune and
		// is overwritten on the next iteration for every other rune.
		tokens = append(tokens, tokenizer.Token{
			Id:      id,
			Value:   s,
			Offsets: []int{start, len(token)},
		})
	}

	return tokens, nil
}
// NewWordLevel builds a wordlevel.WordLevel model by handing vocab and
// unkToken to a fresh WordLevelBuilder and returning the result of Build.
func NewWordLevel(vocab map[string]int, unkToken string) *wordlevel.WordLevel {
	b := wordlevel.NewWordLevelBuilder()
	b.Vocab(vocab)
	b.UnkToken(unkToken)

	model := b.Build()
	return model
}