xkbcommon · wismill · Sep 23, 2024 · Sep 12, 2024 · Sep 11, 2024
diff --git a/changes/tools/+how-to-type-format.feature.md b/changes/tools/+how-to-type-format.feature.md
@@ -0,0 +1,11 @@
+`xkbcli how-to-type`: added new input formats and their corresponding documentation.
+
+*Unicode code points* can be passed in the following formats:
+- Literal character (requires UTF-8 character encoding of the terminal);
+- Decimal number;
+- Hexadecimal number: either `0xNNNN` or `U+NNNN`.
+
+*Keysyms* can be passed in the following formats:
+- Decimal number;
+- Hexadecimal number: `0xNNNN`;
+- Name.
diff --git a/meson.build b/meson.build
@@ -518,6 +518,8 @@ if build_tools
     # Tool: how-to-type
     executable('xkbcli-how-to-type',
                'tools/how-to-type.c',
+               'src/utf8-decoding.c',
+               'src/utf8-decoding.h',
                dependencies: tools_dep,
                install: true,
                install_dir: dir_libexec)
@@ -753,7 +755,13 @@ test(
 )
 test(
     'utf8',
-    executable('test-utf8', 'test/utf8.c', dependencies: test_dep),
+    executable(
+        'test-utf8',
+        'test/utf8.c',
+        'src/utf8-decoding.c',
+        'src/utf8-decoding.h',
+        dependencies: test_dep
+    ),
     env: test_env,
 )
 test(

diff --git a/src/utf8-decoding.c b/src/utf8-decoding.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright © 2024 Pierre Le Marre <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "config.h"
+
+#include "utf8-decoding.h"
+
+/* Array mapping the leading byte to the length of a UTF-8 sequence.
+ * A value of zero indicates that the byte can not begin a UTF-8 sequence:
+ * - 0x80-0xBF are continuation bytes;
+ * - 0xC0-0xC1 could only begin an overlong encoding of U+0000-U+007F;
+ * - 0xF5-0xFF could only encode values above U+10FFFF or are unused. */
+static const uint8_t utf8_sequence_length_by_leading_byte[256] = {
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00-0x0F */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10-0x1F */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20-0x2F */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30-0x3F */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40-0x4F */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x50-0x5F */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60-0x6F */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x70-0x7F */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80-0x8F */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90-0x9F */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0-0xAF */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xB0-0xBF */
+    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xC0-0xCF */
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xD0-0xDF */
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xE0-0xEF */
+    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xF0-0xFF */
+};
+
+/* Return the length in bytes of the UTF-8 sequence announced by the leading
+ * byte s[0], or 0 if that byte can not begin a UTF-8 sequence. */
+uint8_t
+utf8_sequence_length(const char *s)
+{
+    const unsigned char leading_byte = (unsigned char) *s;
+
+    return utf8_sequence_length_by_leading_byte[leading_byte];
+}
+
+/* Reads the next UTF-8 sequence in a string.
+ *
+ * s:        buffer to decode; it does not need to be NUL-terminated.
+ * max_size: number of bytes available in s.
+ * size_out: receives the number of bytes consumed, or 0 on failure.
+ *
+ * Returns the decoded code point, or INVALID_UTF8_CODE_POINT if the buffer is
+ * empty or if the sequence is truncated, malformed, overlong, encodes a
+ * surrogate or exceeds U+10FFFF. */
+uint32_t
+utf8_next_code_point(const char *s, size_t max_size, size_t *size_out)
+{
+    /* Smallest code point that requires a sequence of the given length;
+     * anything lower is an overlong encoding. */
+    static const uint32_t min_code_point[] = { 0, 0, 0x80, 0x800, 0x10000 };
+
+    *size_out = 0;
+
+    /* Check the size first: the leading byte of an empty buffer must not
+     * be read. */
+    if (!max_size)
+        return INVALID_UTF8_CODE_POINT;
+
+    const uint8_t len = utf8_sequence_length(s);
+    if (len == 0 || len > max_size)
+        return INVALID_UTF8_CODE_POINT;
+
+    /* Work on unsigned bytes to avoid sign extension of `char` */
+    const unsigned char *bytes = (const unsigned char *) s;
+    uint32_t cp = 0;
+
+    /* Handle leading byte */
+    switch (len) {
+    case 1:
+        *size_out = 1;
+        return bytes[0];
+    case 2:
+        cp = bytes[0] & 0x1f;
+        break;
+    case 3:
+        cp = bytes[0] & 0x0f;
+        break;
+    case 4:
+        cp = bytes[0] & 0x07;
+        break;
+    default:
+        return INVALID_UTF8_CODE_POINT;
+    }
+
+    /* Process remaining bytes of the UTF-8 sequence */
+    for (size_t k = 1; k < len; k++) {
+        if ((bytes[k] & 0xc0) != 0x80)
+            return INVALID_UTF8_CODE_POINT;
+        cp = (cp << 6) | (bytes[k] & 0x3f);
+    }
+
+    /* Reject overlong encodings, e.g. E0 80 80 for U+0000 */
+    if (cp < min_code_point[len])
+        return INVALID_UTF8_CODE_POINT;
+
+    /* Reject surrogates and values beyond the Unicode range,
+     * e.g. F4 90 80 80 for 0x110000 */
+    if ((cp >= 0xd800 && cp <= 0xdfff) || cp > 0x10ffff)
+        return INVALID_UTF8_CODE_POINT;
+
+    *size_out = len;
+    return cp;
+}
diff --git a/src/utf8-decoding.h b/src/utf8-decoding.h
@@ -0,0 +1,20 @@
+
+#ifndef UTF8_DECODING_H
+#define UTF8_DECODING_H
+
+#include "config.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+/* Check if a char is the start of a UTF-8 sequence, i.e. that it is not a
+ * continuation byte of the form 10xxxxxx */
+#define is_utf8_start(c) (((c) & 0xc0) != 0x80)
+/* Value returned by utf8_next_code_point() when decoding fails */
+#define INVALID_UTF8_CODE_POINT UINT32_MAX
+
+/* Length in bytes of the UTF-8 sequence starting at s, deduced from its
+ * leading byte s[0] only; 0 if that byte can not begin a UTF-8 sequence. */
+uint8_t
+utf8_sequence_length(const char *s);
+
+/* Decode the UTF-8 sequence at the start of s.
+ * max_size is the number of bytes available in s.
+ * On success, returns the code point and stores the number of bytes consumed
+ * in *size_out; on failure, returns INVALID_UTF8_CODE_POINT and stores 0. */
+uint32_t
+utf8_next_code_point(const char *s, size_t max_size, size_t *size_out);
+
+#endif
diff --git a/test/utf8.c b/test/utf8.c
@@ -32,6 +32,7 @@
 #include "src/keysym.h"
 #include "test.h"
 #include "utf8.h"
+#include "utf8-decoding.h"
 #include "utils.h"
 
 #define VALID(lit) assert(is_valid_utf8(lit, sizeof(lit)-1))
@@ -179,13 +180,34 @@ test_utf32_to_utf8(void)
     check_utf32_to_utf8(0xffffffff, 0, "");
 }
 
+/* Check roundtrip UTF-32 → UTF-8 → UTF-32 for every code point from U+0000
+ * up to and including U+10FFFF */
+static void
+test_utf8_to_utf32(void)
+{
+    char buffer[XKB_KEYSYM_UTF8_MAX_SIZE];
+    /* Inclusive upper bound: U+10FFFF is the last valid code point */
+    for (uint32_t cp = 0; cp <= 0x10ffff; cp++) {
+        /* Subtract 1 so that a return value of 0 (as asserted for the
+         * surrogates below) maps to -1 */
+        int length = utf32_to_utf8(cp, buffer) - 1;
+        /* Check surrogates */
+        if (cp >= 0xd800 && cp <= 0xdfff) {
+            assert(length == -1);
+        } else {
+            assert(length > 0);
+            size_t length2 = 0;
+            uint32_t cp2 = utf8_next_code_point(buffer, (size_t)length, &length2);
+            assert(cp2 != INVALID_UTF8_CODE_POINT && cp2 == cp &&
+                   length2 == (size_t)length);
+        }
+    }
+}
+
 int
 main(void)
 {
     test_init();
 
     test_is_valid_utf8();
     test_utf32_to_utf8();
+    test_utf8_to_utf32();
 
     return 0;
 }
diff --git a/tools/how-to-type.c b/tools/how-to-type.c
@@ -30,16 +30,94 @@
 #include <errno.h>
 
 #include "xkbcommon/xkbcommon.h"
+#include "src/utils.h"
 #include "src/keysym.h"
+#include "src/utf8-decoding.h"
 
 #define ARRAY_SIZE(arr) ((sizeof(arr) / sizeof(*(arr))))
 
+/*
+ * Parse a command-line argument as a Unicode code point.
+ *
+ * The following formats are tried in order:
+ * 1. a single UTF-8 encoded character spanning the whole argument (so a lone
+ *    digit such as "1" denotes the character U+0031, not the code point 1);
+ * 2. `U+NNNN`: hexadecimal number;
+ * 3. decimal number;
+ * 4. hexadecimal number (strtol in base 16 accepts an optional `0x` prefix).
+ *
+ * Returns the code point, in the range 0..0x10FFFF, or prints an error to
+ * stderr and returns INVALID_UTF8_CODE_POINT if the argument is empty or can
+ * not be parsed.
+ */
+static uint32_t
+parse_char_or_codepoint(const char *raw)
+{
+    const size_t raw_length = strlen_safe(raw);
+    size_t length = 0;
+
+    if (!raw_length)
+        goto error;
+
+    /* Try to parse the parameter as a UTF-8 encoded single character */
+    const uint32_t codepoint = utf8_next_code_point(raw, raw_length, &length);
+    if (codepoint != INVALID_UTF8_CODE_POINT && length == raw_length)
+        return codepoint;
+
+    /* Parsing failed or did not consume all the string: try numeric formats */
+    int base = 10;
+    /* Detect the standard Unicode code point format: U+NNNN */
+    if (raw_length >= 2 && raw[0] == 'U' && raw[1] == '+') {
+        base = 16;
+        raw += 2;
+    }
+    /* Use strtol with explicit bases (10, then 16) instead of `0` in order
+     * to avoid unexpected parsing as octal. */
+    for (; base <= 16; base += 6) {
+        char *endp;
+        errno = 0;
+        const long val = strtol(raw, &endp, base);
+        /* `endp == raw` means that no digit was converted, e.g. for a bare
+         * "U+": strtol returns 0 in that case, which must not be mistaken
+         * for the code point U+0000. */
+        if (errno == 0 && endp != raw && isempty(endp) &&
+            val >= 0 && val <= 0x10FFFF)
+            return (uint32_t) val;
+    }
+
+error:
+    fprintf(stderr, "ERROR: Failed to convert argument to Unicode code point\n");
+    return INVALID_UTF8_CODE_POINT;
+}
+
+/* Print the one-line synopsis followed by the detailed help to fp.
+ * argv0 is the program name displayed in the synopsis. */
 static void
 usage(const char *argv0, FILE *fp)
 {
-    fprintf(fp, "Usage: %s [--keysym] [--rules <rules>] [--model <model>] "
+    fprintf(fp, "Usage: %s [--help] [--keysym] [--rules <rules>] [--model <model>] "
                 "[--layout <layout>] [--variant <variant>] [--options <options>]"
-                " <unicode codepoint/keysym>\n", argv0);
+                " <character/codepoint/keysym>\n", argv0);
+    /* Detailed help: the defaults displayed are the DEFAULT_XKB_* macros,
+     * with "<none>" substituted when the variant or options macro is null */
+    fprintf(
+        fp,
+        "\n"
+        "Prints the key combinations (keycode + modifiers) in the keymap's layouts which\n"
+        "would produce the given Unicode code point or keysym.\n"
+        "\n"
+        "<character/codepoint/keysym> is either:\n"
+        "- a single character (requires a terminal which uses UTF-8 character encoding);\n"
+        "- a Unicode code point, interpreted as hexadecimal if prefixed with '0x' or 'U+'\n"
+        "  else as decimal;\n"
+        "- a keysym if --keysym is used: either a numeric value (hexadecimal if prefixed\n"
+        "  with '0x' else decimal) or a keysym name.\n"
+        "\n"
+        "Options:\n"
+        " --help\n"
+        "    Print this help and exit\n"
+        " --keysym\n"
+        "    Treat the argument as a keysym, not a Unicode code point\n"
+        "\n"
+        "XKB-specific options:\n"
+        " --rules <rules>\n"
+        "    The XKB ruleset (default: '%s')\n"
+        " --model <model>\n"
+        "    The XKB model (default: '%s')\n"
+        " --layout <layout>\n"
+        "    The XKB layout (default: '%s')\n"
+        " --variant <variant>\n"
+        "    The XKB layout variant (default: '%s')\n"
+        " --options <options>\n"
+        "    The XKB options (default: '%s')\n"
+        "\n",
+        DEFAULT_XKB_RULES, DEFAULT_XKB_MODEL, DEFAULT_XKB_LAYOUT,
+        DEFAULT_XKB_VARIANT ? DEFAULT_XKB_VARIANT : "<none>",
+        DEFAULT_XKB_OPTIONS ? DEFAULT_XKB_OPTIONS : "<none>");
 }
 
 int
@@ -117,41 +195,43 @@ main(int argc, char *argv[])
         }
     }
     if (argc - optind != 1) {
-        usage(argv[0], stderr);
-        exit(EXIT_INVALID_USAGE);
+        // Exactly one positional parameter is expected: report in which
+        // direction the count is wrong
+        if (argc - optind < 1)
+            fprintf(stderr, "ERROR: missing positional parameter\n");
+        else
+            fprintf(stderr, "ERROR: too many positional parameters\n");
+        goto parse_error;
     }
 
     if (keysym_mode) {
+        // Try to parse keysym name or hexadecimal value (0xNNNN)
         keysym = xkb_keysym_from_name(argv[optind], XKB_KEYSYM_NO_FLAGS);
         if (keysym == XKB_KEY_NoSymbol) {
-            fprintf(stderr, "Failed to convert argument to keysym\n");
-            goto err;
+            // Try to parse numeric keysym in base 10, without prefix.
+            // strtol never resets errno on success, so clear it first:
+            // a stale value left by an earlier call would otherwise make a
+            // valid number look like a failure.
+            errno = 0;
+            val = strtol(argv[optind], &endp, 10);
+            if (errno != 0 || !isempty(endp) || val <= 0 || val > XKB_KEYSYM_MAX) {
+                fprintf(stderr, "ERROR: Failed to convert argument to keysym\n");
+                goto parse_error;
+            }
+            keysym = (uint32_t) val;
         }
     } else {
-        errno = 0;
-        val = strtol(argv[optind], &endp, 0);
-        if (errno != 0 || endp == argv[optind] || val < 0 || val > 0x10FFFF) {
-            usage(argv[0], stderr);
-            exit(EXIT_INVALID_USAGE);
-        }
-        codepoint = (uint32_t) val;
+        codepoint = parse_char_or_codepoint(argv[optind]);
+        if (codepoint == INVALID_UTF8_CODE_POINT)
+            goto parse_error;
 
         keysym = xkb_utf32_to_keysym(codepoint);
         if (keysym == XKB_KEY_NoSymbol) {
-            fprintf(stderr, "Failed to convert codepoint to keysym\n");
-            goto err;
+            fprintf(stderr, "ERROR: Failed to convert code point to keysym\n");
+            goto parse_error;
         }
     }
 
     ret = xkb_keysym_get_name(keysym, name, sizeof(name));
     if (ret < 0 || (size_t) ret >= sizeof(name)) {
-        fprintf(stderr, "Failed to get name of keysym\n");
+        fprintf(stderr, "ERROR: Failed to get name of keysym\n");
         goto err;
     }
 
     ctx = xkb_context_new(XKB_CONTEXT_NO_FLAGS);
     if (!ctx) {
-        fprintf(stderr, "Failed to create XKB context\n");
+        fprintf(stderr, "ERROR: Failed to create XKB context\n");
         goto err;
     }
 
@@ -165,7 +245,7 @@ main(int argc, char *argv[])
     keymap = xkb_keymap_new_from_names(ctx, &names,
                                        XKB_KEYMAP_COMPILE_NO_FLAGS);
     if (!keymap) {
-        fprintf(stderr, "Failed to create XKB keymap\n");
+        fprintf(stderr, "ERROR: Failed to create XKB keymap\n");
         goto err;
     }
 
@@ -237,4 +317,7 @@ main(int argc, char *argv[])
     xkb_keymap_unref(keymap);
     xkb_context_unref(ctx);
     return err;
+parse_error:
+    usage(argv[0], stderr);
+    exit(EXIT_INVALID_USAGE);
 }