From a6fc915e05cd00252fada5c38fc29f26968c6002 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Wed, 9 Aug 2023 16:03:23 +0200
Subject: [PATCH] ICU-22404 Strip default ignorable code points in the skeleton
 for confusable detection

---
 icu4c/source/i18n/uspoof.cpp                  |  4 +++-
 icu4c/source/test/intltest/itspoof.cpp        | 19 ++++++++++++++++++
 .../src/com/ibm/icu/text/SpoofChecker.java    |  5 ++++-
 .../icu/dev/test/text/SpoofCheckerTest.java   | 20 +++++++++++++++++++
 4 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/icu4c/source/i18n/uspoof.cpp b/icu4c/source/i18n/uspoof.cpp
index 1f9288e01329..b14a496d4100 100644
--- a/icu4c/source/i18n/uspoof.cpp
+++ b/icu4c/source/i18n/uspoof.cpp
@@ -721,7 +721,9 @@ uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
     for (inputIndex=0; inputIndex < normalizedLen; ) {
         UChar32 c = nfdId.char32At(inputIndex);
         inputIndex += U16_LENGTH(c);
-        This->fSpoofData->confusableLookup(c, skelStr);
+        if (!u_hasBinaryProperty(c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) {
+            This->fSpoofData->confusableLookup(c, skelStr);
+        }
     }
 
     gNfdNormalizer->normalize(skelStr, dest, *status);
diff --git a/icu4c/source/test/intltest/itspoof.cpp b/icu4c/source/test/intltest/itspoof.cpp
index bb6fddb1631a..1e2e983c61fe 100644
--- a/icu4c/source/test/intltest/itspoof.cpp
+++ b/icu4c/source/test/intltest/itspoof.cpp
@@ -140,6 +140,18 @@ void IntlTestSpoof::testSpoofAPI() {
         TEST_ASSERT(UnicodeString("lllOO") == dest);
         TEST_ASSERT(&dest == &retStr);
     TEST_TEARDOWN;
+
+    TEST_SETUP
+        // Example from UTS #55, Section 5.1.3 https://www.unicode.org/reports/tr55/#General-Security-Profile
+        // of a minimal pair with a ZWNJ in Persian.
+        const UnicodeString behrooz(u"بهروز");
+        const UnicodeString update(u"به‌روز");
+        // These strings differ only by a ZWNJ.
+        TEST_ASSERT(UnicodeString(update).findAndReplace(u"\u200C", u"") == behrooz);
+        int32_t checkResults = uspoof_areConfusableUnicodeString(sc, behrooz, update, &status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, checkResults);
+    TEST_TEARDOWN;
 }
 
 
@@ -384,6 +396,13 @@ void IntlTestSpoof::testConfData() {
             continue;
         }
 
+        if (u_hasBinaryProperty(from.char32At(0), UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) {
+            // The source character is a default ignorable code point (DI).
+            // Skip this case; the second step in obtaining a skeleton is to remove DIs,
+            // so the mapping in this line of confusables.txt will never be applied.
+            continue;
+        }
+
         UnicodeString rawExpected = parseHex(parseLine.group(2, status));
         UnicodeString expected;
         Normalizer::decompose(rawExpected, false /*NFD*/, 0, expected, status);
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/SpoofChecker.java b/icu4j/main/classes/core/src/com/ibm/icu/text/SpoofChecker.java
index 28043de39c3a..f22fd49c0a26 100644
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/SpoofChecker.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/SpoofChecker.java
@@ -32,6 +32,7 @@
 import java.util.regex.Pattern;
 
 import com.ibm.icu.impl.ICUBinary;
+import com.ibm.icu.impl.UCharacterProperty;
 import com.ibm.icu.impl.ICUBinary.Authenticate;
 import com.ibm.icu.impl.Utility;
 import com.ibm.icu.lang.UCharacter;
@@ -1509,7 +1510,9 @@ public String getSkeleton(CharSequence str) {
         for (int inputIndex = 0; inputIndex < normalizedLen;) {
             int c = Character.codePointAt(nfdId, inputIndex);
             inputIndex += Character.charCount(c);
-            this.fSpoofData.confusableLookup(c, skelSB);
+            if (!UCharacter.hasBinaryProperty(c, UProperty.DEFAULT_IGNORABLE_CODE_POINT)) {
+                this.fSpoofData.confusableLookup(c, skelSB);
+            }
         }
         String skelStr = skelSB.toString();
         skelStr = nfdNormalizer.normalize(skelStr);
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/text/SpoofCheckerTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/text/SpoofCheckerTest.java
index d76e01d9bde9..7a32a036f631 100644
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/text/SpoofCheckerTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/text/SpoofCheckerTest.java
@@ -31,7 +31,10 @@
 import com.ibm.icu.dev.test.TestFmwk;
 import com.ibm.icu.dev.test.TestUtil;
 import com.ibm.icu.dev.test.TestUtil.JavaVendor;
+import com.ibm.icu.impl.UCharacterProperty;
 import com.ibm.icu.impl.Utility;
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UProperty;
 import com.ibm.icu.lang.UScript;
 import com.ibm.icu.text.Normalizer2;
 import com.ibm.icu.text.SpoofChecker;
@@ -66,6 +69,7 @@ public class SpoofCheckerTest extends TestFmwk {
 
     String han_Hiragana = "\u3086\u308A \u77F3\u7530";  // Hiragana, space, Han
 
+    static final UnicodeSet DEFAULT_IGNORABLE_CODE_POINT = new UnicodeSet("\\p{di}");
 
     /*
      * Test basic constructor.
@@ -376,6 +380,15 @@ public void TestSpoofAPI() {
         s = "I1l0O";
         String dest = sc.getSkeleton(SpoofChecker.ANY_CASE, s);
         assertEquals("", dest, "lllOO");
+
+        // Example from UTS #55, Section 5.1.3 https://www.unicode.org/reports/tr55/#General-Security-Profile,
+        // of a minimal pair with a ZWNJ in Persian.
+        final String behrooz = "بهروز";
+        final String update = "به‌روز";
+        // These strings differ only by a ZWNJ.
+        assertEquals("", update.replace("\u200C", ""), behrooz);
+        checkResult = sc.areConfusable(behrooz, update);
+        assertEquals("", SpoofChecker.SINGLE_SCRIPT_CONFUSABLE, checkResult);
     }
 
     @Test
@@ -728,6 +741,13 @@ public void testConfData() {
                     continue;
                 }
 
+                if (DEFAULT_IGNORABLE_CODE_POINT.containsSome(from)) {
+                    // The source character is a default ignorable code point (DI).
+                    // Skip this case; the second step in obtaining a skeleton is to remove DIs,
+                    // so the mapping in this line of confusables.txt will never be applied.
+                    continue;
+                }
+
                 String rawExpected = parseHex(parseLine.group(2));
                 String expected = normalizer.normalize(rawExpected);