Skip to content

Commit

Permalink
Sort this.added_tokens before creating regex (.toSorted is not available in Node.js < 20)
Browse files Browse the repository at this point in the history
  • Loading branch information
xenova committed Sep 9, 2024
1 parent c40a151 commit 30315b2
Showing 1 changed file with 2 additions and 3 deletions.
5 changes: 2 additions & 3 deletions src/tokenizers.js
Original file line number Diff line number Diff line change
Expand Up @@ -2538,6 +2538,8 @@ export class PreTrainedTokenizer extends Callable {
this.all_special_ids.push(token.id);
}
}
// Sort by length (desc) to avoid early partial matches
this.added_tokens.sort((a, b) => b.content.length - a.content.length)

// Update additional_special_tokens
this.additional_special_tokens = tokenizerConfig.additional_special_tokens ?? [];
Expand All @@ -2555,11 +2557,8 @@ export class PreTrainedTokenizer extends Callable {
this.decoder.end_of_word_suffix = this.model.end_of_word_suffix;
}


this.added_tokens_regex = this.added_tokens.length > 0 ? new RegExp(
this.added_tokens
// Sort by length (desc) to avoid early partial matches
.toSorted((a, b) => b.content.length - a.content.length)
.map(x => `${x.lstrip ? '\\s*' : ''}(${escapeRegExp(x.content)})${x.rstrip ? '\\s*' : ''}`)
.join('|')
) : null;
Expand Down

0 comments on commit 30315b2

Please sign in to comment.