Skip to content

Commit

Permalink
ICU-22404 Strip default ignorable code points in the skeleton for confusable detection
Browse files Browse the repository at this point in the history
  • Loading branch information
eggrobin committed Aug 10, 2023
1 parent 86193b1 commit a6fc915
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 2 deletions.
4 changes: 3 additions & 1 deletion icu4c/source/i18n/uspoof.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -721,7 +721,9 @@ uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
for (inputIndex=0; inputIndex < normalizedLen; ) {
UChar32 c = nfdId.char32At(inputIndex);
inputIndex += U16_LENGTH(c);
This->fSpoofData->confusableLookup(c, skelStr);
if (!u_hasBinaryProperty(c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) {
This->fSpoofData->confusableLookup(c, skelStr);
}
}

gNfdNormalizer->normalize(skelStr, dest, *status);
Expand Down
19 changes: 19 additions & 0 deletions icu4c/source/test/intltest/itspoof.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,18 @@ void IntlTestSpoof::testSpoofAPI() {
TEST_ASSERT(UnicodeString("lllOO") == dest);
TEST_ASSERT(&dest == &retStr);
TEST_TEARDOWN;

TEST_SETUP
// Example from UTS #55, Section 5.1.3 https://www.unicode.org/reports/tr55/#General-Security-Profile
// of a minimal pair with a ZWNJ in Persian.
const UnicodeString behrooz(u"بهروز");
const UnicodeString update(u"به‌روز");
// These strings differ only by a ZWNJ.
TEST_ASSERT(UnicodeString(update).findAndReplace(u"\u200C", u"") == behrooz);
int32_t checkResults = uspoof_areConfusableUnicodeString(sc, behrooz, update, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, checkResults);
TEST_TEARDOWN;
}


Expand Down Expand Up @@ -384,6 +396,13 @@ void IntlTestSpoof::testConfData() {
continue;
}

if (u_hasBinaryProperty(from.char32At(0), UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) {
// The source character is a default ignorable code point.
// Skip this case; the second step in obtaining a skeleton is to remove DIs,
// so the mapping in this line of confusables.txt will never be applied.
continue;
}

UnicodeString rawExpected = parseHex(parseLine.group(2, status));
UnicodeString expected;
Normalizer::decompose(rawExpected, false /*NFD*/, 0, expected, status);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import java.util.regex.Pattern;

import com.ibm.icu.impl.ICUBinary;
import com.ibm.icu.impl.UCharacterProperty;
import com.ibm.icu.impl.ICUBinary.Authenticate;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
Expand Down Expand Up @@ -1509,7 +1510,9 @@ public String getSkeleton(CharSequence str) {
for (int inputIndex = 0; inputIndex < normalizedLen;) {
int c = Character.codePointAt(nfdId, inputIndex);
inputIndex += Character.charCount(c);
this.fSpoofData.confusableLookup(c, skelSB);
if (!UCharacter.hasBinaryProperty(c, UProperty.DEFAULT_IGNORABLE_CODE_POINT)) {
this.fSpoofData.confusableLookup(c, skelSB);
}
}
String skelStr = skelSB.toString();
skelStr = nfdNormalizer.normalize(skelStr);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,10 @@
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.dev.test.TestUtil;
import com.ibm.icu.dev.test.TestUtil.JavaVendor;
import com.ibm.icu.impl.UCharacterProperty;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.SpoofChecker;
Expand Down Expand Up @@ -66,6 +69,7 @@ public class SpoofCheckerTest extends TestFmwk {

String han_Hiragana = "\u3086\u308A \u77F3\u7530"; // Hiragana, space, Han

static final UnicodeSet DEFAULT_IGNORABLE_CODE_POINT = new UnicodeSet("\\p{di}");

/*
* Test basic constructor.
Expand Down Expand Up @@ -376,6 +380,15 @@ public void TestSpoofAPI() {
s = "I1l0O";
String dest = sc.getSkeleton(SpoofChecker.ANY_CASE, s);
assertEquals("", dest, "lllOO");

// Example from UTS #55, Section 5.1.3 https://www.unicode.org/reports/tr55/#General-Security-Profile,
// of a minimal pair with a ZWNJ in Persian.
final String behrooz = "بهروز";
final String update = "به‌روز";
// These strings differ only by a ZWNJ.
assertEquals("", update.replace("\u200C", ""), behrooz);
checkResult = sc.areConfusable(behrooz, update);
assertEquals("", SpoofChecker.SINGLE_SCRIPT_CONFUSABLE, checkResult);
}

@Test
Expand Down Expand Up @@ -728,6 +741,13 @@ public void testConfData() {
continue;
}

if (DEFAULT_IGNORABLE_CODE_POINT.containsSome(from)) {
// The source character is a default ignorable code point.
// Skip this case; the second step in obtaining a skeleton is to remove DIs,
// so the mapping in this line of confusables.txt will never be applied.
continue;
}

String rawExpected = parseHex(parseLine.group(2));
String expected = normalizer.normalize(rawExpected);

Expand Down

0 comments on commit a6fc915

Please sign in to comment.