Skip to content

Commit

Permalink
[pt] Fix word tokeniser (#10476)
Browse files Browse the repository at this point in the history
* [pt] Fix word tokeniser and add a new test to check it's okay
  • Loading branch information
p-goulart authored Apr 9, 2024
1 parent 417c567 commit ecbaf3f
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ public List<String> tokenize(final String text) {
String token = tokeniserMatcher.group();
// 0xFE00-0xFE0F are non-spacing marks
if (!tokenList.isEmpty() && token.length() == 1 && token.codePointAt(0)>=0xFE00 && token.codePointAt(0)<=0xFE0F) {
tokenList.set(tokenList.size() - 1, tokenList.get(tokenList.size() - 1) + tokenList);
tokenList.set(tokenList.size() - 1, tokenList.get(tokenList.size() - 1) + token);
continue;
}
token = token.replace(DECIMAL_COMMA_SUBST, ',');
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import org.junit.Test;

import java.sql.Struct;
import java.util.List;

import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
Expand Down Expand Up @@ -272,4 +273,9 @@ public void testTokeniseRarePunctuation() {
// The section sign (§) must be split from the ordinal number that follows it.
public void testTokeniseParagraphSymbol() {
testTokenise("§1º", "§", "1º");
}

@Test
// Complex ZWJ emoji sequence (woman elf, medium skin tone). The tokeniser
// yields the base emoji, the skin-tone modifier and the zero-width joiner as
// separate tokens, while the trailing U+FE0F variation selector is merged
// into the preceding "♀" token (see the 0xFE00-0xFE0F handling in tokenize()).
public void testTokeniseComplexEmoji() {
testTokenise("🧝🏽‍♀️", "🧝", "🏽", "‍", "♀️");
}
}

0 comments on commit ecbaf3f

Please sign in to comment.