Skip to content

Commit

Permalink
[pt] Fix word tokeniser (#10476)
Browse files Browse the repository at this point in the history
* [pt] Fix word tokeniser and add a new test to check it's okay
  • Loading branch information
p-goulart authored Apr 9, 2024
1 parent 417c567 commit ecbaf3f
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ public List<String> tokenize(final String text) {
String token = tokeniserMatcher.group();
// 0xFE00-0xFE0F are non-spacing marks
if (!tokenList.isEmpty() && token.length() == 1 && token.codePointAt(0)>=0xFE00 && token.codePointAt(0)<=0xFE0F) {
tokenList.set(tokenList.size() - 1, tokenList.get(tokenList.size() - 1) + tokenList);
tokenList.set(tokenList.size() - 1, tokenList.get(tokenList.size() - 1) + token);
continue;
}
token = token.replace(DECIMAL_COMMA_SUBST, ',');
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import org.junit.Test;

import java.sql.Struct;
import java.util.List;

import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
Expand Down Expand Up @@ -272,4 +273,9 @@ public void testTokeniseRarePunctuation() {
// The section sign (§) must be split from the ordinal number that follows it.
public void testTokeniseParagraphSymbol() {
testTokenise("§1º", "§", "1º");
}

@Test
// Complex ZWJ emoji sequence (woman elf, medium skin tone). The tokeniser
// yields the base emoji, the skin-tone modifier and the zero-width joiner as
// separate tokens, while the trailing U+FE0F variation selector is merged
// into the preceding "♀" token (see the 0xFE00-0xFE0F handling in tokenize()).
public void testTokeniseComplexEmoji() {
testTokenise("🧝🏽‍♀️", "🧝", "🏽", "‍", "♀️");
}
}

0 comments on commit ecbaf3f

Please sign in to comment.