From a6fc915e05cd00252fada5c38fc29f26968c6002 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 9 Aug 2023 16:03:23 +0200 Subject: [PATCH] ICU-22404 Strip default ignorable code points in the skeleton for confusable detection --- icu4c/source/i18n/uspoof.cpp | 4 +++- icu4c/source/test/intltest/itspoof.cpp | 19 ++++++++++++++++++ .../src/com/ibm/icu/text/SpoofChecker.java | 5 ++++- .../icu/dev/test/text/SpoofCheckerTest.java | 20 +++++++++++++++++++ 4 files changed, 46 insertions(+), 2 deletions(-) diff --git a/icu4c/source/i18n/uspoof.cpp b/icu4c/source/i18n/uspoof.cpp index 1f9288e01329..b14a496d4100 100644 --- a/icu4c/source/i18n/uspoof.cpp +++ b/icu4c/source/i18n/uspoof.cpp @@ -721,7 +721,9 @@ uspoof_getSkeletonUnicodeString(const USpoofChecker *sc, for (inputIndex=0; inputIndex < normalizedLen; ) { UChar32 c = nfdId.char32At(inputIndex); inputIndex += U16_LENGTH(c); - This->fSpoofData->confusableLookup(c, skelStr); + if (!u_hasBinaryProperty(c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) { + This->fSpoofData->confusableLookup(c, skelStr); + } } gNfdNormalizer->normalize(skelStr, dest, *status); diff --git a/icu4c/source/test/intltest/itspoof.cpp b/icu4c/source/test/intltest/itspoof.cpp index bb6fddb1631a..1e2e983c61fe 100644 --- a/icu4c/source/test/intltest/itspoof.cpp +++ b/icu4c/source/test/intltest/itspoof.cpp @@ -140,6 +140,18 @@ void IntlTestSpoof::testSpoofAPI() { TEST_ASSERT(UnicodeString("lllOO") == dest); TEST_ASSERT(&dest == &retStr); TEST_TEARDOWN; + + TEST_SETUP + // Example from UTS #55, Section 5.1.3 https://www.unicode.org/reports/tr55/#General-Security-Profile + // of a minimal pair with a ZWNJ in Persian. + const UnicodeString behrooz(u"بهروز"); + const UnicodeString update(u"به‌روز"); + // These strings differ only by a ZWNJ. + TEST_ASSERT(UnicodeString(update).findAndReplace(u"\u200C", u"") == behrooz); + int32_t checkResults = uspoof_areConfusableUnicodeString(sc, behrooz, update, &status); + TEST_ASSERT_SUCCESS(status); + TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, checkResults); + TEST_TEARDOWN; } @@ -384,6 +396,13 @@ void IntlTestSpoof::testConfData() { continue; } + if (u_hasBinaryProperty(from.char32At(0), UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) { + // The source character is a default ignorable code point. + // Skip this case; the second step in obtaining a skeleton is to remove DIs, + // so the mapping in this line of confusables.txt will never be applied. + continue; + } + UnicodeString rawExpected = parseHex(parseLine.group(2, status)); UnicodeString expected; Normalizer::decompose(rawExpected, false /*NFD*/, 0, expected, status); diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/SpoofChecker.java b/icu4j/main/classes/core/src/com/ibm/icu/text/SpoofChecker.java index 28043de39c3a..f22fd49c0a26 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/SpoofChecker.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/SpoofChecker.java @@ -32,6 +32,7 @@ import java.util.regex.Pattern; import com.ibm.icu.impl.ICUBinary; +import com.ibm.icu.impl.UCharacterProperty; import com.ibm.icu.impl.ICUBinary.Authenticate; import com.ibm.icu.impl.Utility; import com.ibm.icu.lang.UCharacter; @@ -1509,7 +1510,9 @@ public String getSkeleton(CharSequence str) { for (int inputIndex = 0; inputIndex < normalizedLen;) { int c = Character.codePointAt(nfdId, inputIndex); inputIndex += Character.charCount(c); - this.fSpoofData.confusableLookup(c, skelSB); + if (!UCharacter.hasBinaryProperty(c, UProperty.DEFAULT_IGNORABLE_CODE_POINT)) { + this.fSpoofData.confusableLookup(c, skelSB); + } } String skelStr = skelSB.toString(); skelStr = nfdNormalizer.normalize(skelStr); diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/text/SpoofCheckerTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/text/SpoofCheckerTest.java index d76e01d9bde9..7a32a036f631 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/text/SpoofCheckerTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/text/SpoofCheckerTest.java @@ -31,7 +31,10 @@ import com.ibm.icu.dev.test.TestFmwk; import com.ibm.icu.dev.test.TestUtil; import com.ibm.icu.dev.test.TestUtil.JavaVendor; +import com.ibm.icu.impl.UCharacterProperty; import com.ibm.icu.impl.Utility; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UProperty; import com.ibm.icu.lang.UScript; import com.ibm.icu.text.Normalizer2; import com.ibm.icu.text.SpoofChecker; @@ -66,6 +69,7 @@ public class SpoofCheckerTest extends TestFmwk { String han_Hiragana = "\u3086\u308A \u77F3\u7530"; // Hiragana, space, Han + static final UnicodeSet DEFAULT_IGNORABLE_CODE_POINT = new UnicodeSet("\\p{di}"); /* * Test basic constructor. @@ -376,6 +380,15 @@ public void TestSpoofAPI() { s = "I1l0O"; String dest = sc.getSkeleton(SpoofChecker.ANY_CASE, s); assertEquals("", dest, "lllOO"); + + // Example from UTS #55, Section 5.1.3 https://www.unicode.org/reports/tr55/#General-Security-Profile, + // of a minimal pair with a ZWNJ in Persian. + final String behrooz = "بهروز"; + final String update = "به‌روز"; + // These strings differ only by a ZWNJ. + assertEquals("", update.replace("\u200C", ""), behrooz); + checkResult = sc.areConfusable(behrooz, update); + assertEquals("", SpoofChecker.SINGLE_SCRIPT_CONFUSABLE, checkResult); } @Test @@ -728,6 +741,13 @@ public void testConfData() { continue; } + if (DEFAULT_IGNORABLE_CODE_POINT.containsSome(from)) { + // The source character is a default ignorable code point. + // Skip this case; the second step in obtaining a skeleton is to remove DIs, + // so the mapping in this line of confusables.txt will never be applied. + continue; + } + String rawExpected = parseHex(parseLine.group(2)); String expected = normalizer.normalize(rawExpected);