xkbcli how-to-type: Enhance arguments parsing & doc

Currently the positional parameter of the CLI is either a Unicode code point or a keysym. However, their respective formats are not documented. It turns out that there are multiple issues due to the use of `strtol`: - Code points can be parsed as octal, decimal and hexadecimal, while keysyms can only be parsed as hexadecimal. Some programs output keysyms in their decimal form (e.g. `wev`), so it is worth bringing symmetry with code points. - Octal format is unusual for both and is triggered by leading zeros, which is unintuitive in this context. - `U+NNNN` format is the standard format for Unicode code points but is not supported. - Plain characters are not supported, e.g.: a, é, ß, Æ, γ, 🦆, etc., although this is probably the easiest format for most users. Fixed the issues above: - Allow the code point to be passed exactly in the following formats: - Literal character (requires UTF-8 character encoding of the terminal); - Decimal number; - Hexadecimal number: either `0xNNNN` or `U+NNNN`. - Allow the keysym to be passed exactly in the following formats: - Decimal number; - Hexadecimal number: `0xNNNN`; - Name. - Improve both `--help` message and manual.
xkbcommon · Sep 12, 2024 · 9d4f1bf · 9d4f1bf
1 parent 9af1f9f
commit 9d4f1bf
Show file tree

Hide file tree

Showing 2 changed files with 236 additions and 22 deletions.
diff --git a/tools/how-to-type.c b/tools/how-to-type.c
@@ -30,16 +30,171 @@
 #include <errno.h>
 
 #include "xkbcommon/xkbcommon.h"
+#include "src/utils.h"
 #include "src/keysym.h"
 
 #define ARRAY_SIZE(arr) ((sizeof(arr) / sizeof(*(arr))))
 
+/* Array mapping the leading byte to the length of a UTF-8 sequence.
+ * A value of zero indicates that the byte can not begin a UTF-8 sequence.
+ *
+ * Each row covers 16 consecutive byte values:
+ * - 0x00-0x7F map to 1 (single-byte, ASCII range);
+ * - 0x80-0xBF map to 0 (these are continuation bytes);
+ * - 0xC2-0xDF map to 2, 0xE0-0xEF map to 3 and 0xF0-0xF4 map to 4;
+ * - 0xC0, 0xC1 and 0xF5-0xFF map to 0: the first two could only begin an
+ *   overlong two-byte encoding and the others would encode values beyond
+ *   U+10FFFF. */
+static const uint8_t utf8_sequence_length_by_leading_byte[256] = {
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00-0x0F */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10-0x1F */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20-0x2F */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30-0x3F */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40-0x4F */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x50-0x5F */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60-0x6F */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x70-0x7F */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80-0x8F */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90-0x9F */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0-0xAF */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xB0-0xBF */
+    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xC0-0xCF */
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xD0-0xDF */
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xE0-0xEF */
+    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xF0-0xFF */
+};
+
+/* Return the byte length of the UTF-8 sequence introduced by the first byte
+ * of s, or 0 if that byte cannot begin a sequence. */
+static uint8_t
+utf8_sequence_length(const char *s)
+{
+    /* Index with an unsigned byte so that values >= 0x80 stay in 0..255 */
+    const unsigned char lead = (unsigned char)*s;
+
+    return utf8_sequence_length_by_leading_byte[lead];
+}
+
+/* Check if a char is the start of a UTF-8 sequence, i.e. that its two most
+ * significant bits are not `10` (the bit pattern of a continuation byte) */
+#define is_utf8_start(c) (((c) & 0xc0) != 0x80)
+#define INVALID_UTF8_CODE_POINT UINT32_MAX /* sentinel meaning "no valid code point" */
+
+/* Reads the next UTF-8 sequence in a string.
+ *
+ * s:        start of the sequence to decode
+ * max_size: number of bytes readable from s
+ * size_out: receives the number of bytes consumed, or 0 on failure
+ *
+ * Returns the decoded code point, or INVALID_UTF8_CODE_POINT when the buffer
+ * is empty or too short, the leading byte is invalid, a continuation byte is
+ * malformed, the encoding is overlong, or the value is a surrogate or lies
+ * beyond U+10FFFF. */
+static uint32_t
+utf8_next_code_point(const char *s, size_t max_size, size_t *size_out)
+{
+    /* Smallest code point that legitimately needs a sequence of each length;
+     * anything lower is an overlong encoding. Indexed by sequence length. */
+    static const uint32_t min_code_point[] = { 0, 0, 0x80, 0x800, 0x10000 };
+    /* Unsigned view of the bytes, so that values >= 0x80 are never
+     * sign-extended where plain char is signed. */
+    const unsigned char *bytes = (const unsigned char *)s;
+    uint32_t cp = 0;
+
+    *size_out = 0;
+
+    /* Check the size before touching s: an empty buffer must not be read. */
+    if (!max_size)
+        return INVALID_UTF8_CODE_POINT;
+
+    const uint8_t len = utf8_sequence_length(s);
+    if (len > max_size)
+        return INVALID_UTF8_CODE_POINT;
+
+    /* Handle leading byte: keep only its payload bits */
+    switch (len) {
+    case 1:
+        *size_out = 1;
+        return bytes[0];
+    case 2:
+        cp = bytes[0] & 0x1f;
+        break;
+    case 3:
+        cp = bytes[0] & 0x0f;
+        break;
+    case 4:
+        cp = bytes[0] & 0x07;
+        break;
+    default:
+        /* len == 0: the byte cannot begin a sequence */
+        return INVALID_UTF8_CODE_POINT;
+    }
+
+    /* Process remaining bytes of the UTF-8 sequence; len <= max_size was
+     * checked above, so every bytes[k] is within the buffer. */
+    for (size_t k = 1; k < len; k++) {
+        if ((bytes[k] & 0xc0) != 0x80)
+            return INVALID_UTF8_CODE_POINT;
+        cp = (cp << 6) | (bytes[k] & 0x3f);
+    }
+
+    /* Reject overlong encodings (e.g. E0 80 80) */
+    if (cp < min_code_point[len])
+        return INVALID_UTF8_CODE_POINT;
+
+    /* Reject surrogates and values outside the Unicode range
+     * (e.g. F4 90 80 80) */
+    if ((cp >= 0xd800 && cp <= 0xdfff) || cp > 0x10ffff)
+        return INVALID_UTF8_CODE_POINT;
+
+    *size_out = len;
+    return cp;
+}
+
+/* Parse the positional argument as a Unicode code point.
+ *
+ * The formats are tried in this order:
+ * 1. a string made of exactly one UTF-8 encoded character; note that a
+ *    single digit such as "5" is therefore read as the character U+0035,
+ *    not as the code point 5;
+ * 2. `U+NNNN`, parsed as hexadecimal;
+ * 3. a decimal number;
+ * 4. a hexadecimal number such as `0xNNNN`.
+ *
+ * NOTE(review): because step 4 is a plain base-16 strtol retry, hexadecimal
+ * digits without any prefix (e.g. "ff") are accepted as well -- confirm
+ * whether this is intended, since the help text only mentions prefixed
+ * forms.
+ *
+ * strlen_safe() and isempty() come from src/utils.h; presumably a NULL-safe
+ * strlen and an "empty string" test -- verify against that header.
+ *
+ * Returns the code point (at most 0x10FFFF for numeric forms), or
+ * INVALID_UTF8_CODE_POINT after printing an error on stderr if no format
+ * matches. An empty argument returns INVALID_UTF8_CODE_POINT silently. */
+static uint32_t
+parse_char_or_codepoint(const char *raw) {
+    size_t raw_length = strlen_safe(raw);
+    size_t length = 0;
+
+    if (!raw_length)
+        return INVALID_UTF8_CODE_POINT;
+
+    /* Try to parse the parameter as a UTF-8 encoded single character */
+    uint32_t codepoint = utf8_next_code_point(raw, raw_length, &length);
+
+    /* Done if parsing succeeded and consumed the whole string */
+    if (codepoint != INVALID_UTF8_CODE_POINT &&
+        length != 0 && length == raw_length)
+        return codepoint;
+
+    /* Otherwise try the numeric formats */
+    char *endp;
+    long val = -1;
+    int base = 10;
+    /* Detect U+NNNN format standard Unicode code point format */
+    if (raw_length >= 2 && raw[0] == 'U' && raw[1] == '+') {
+        base = 16;
+        raw += 2;
+    }
+    /* Use strtol with explicit bases instead of `0` in order to avoid
+     * unexpected parsing as octal. The step of 6 goes from base 10 to
+     * base 16, then ends the loop. */
+    for (; base <= 16; base += 6) {
+        errno = 0;
+        val = strtol(raw, &endp, base);
+        /* `endp != raw` is required because strtol may report no error when
+         * nothing was converted (e.g. for a bare "U+"), which would
+         * otherwise be accepted as code point 0. */
+        if (errno == 0 && endp != raw && isempty(endp) &&
+            val >= 0 && val <= 0x10FFFF)
+            break;
+        val = -1;
+    }
+    if (val < 0) {
+        fprintf(stderr, "ERROR: Failed to convert argument to Unicode code point\n");
+        return INVALID_UTF8_CODE_POINT;
+    }
+    return (uint32_t) val;
+}
+
 static void
 usage(const char *argv0, FILE *fp)
 {
-    fprintf(fp, "Usage: %s [--keysym] [--rules <rules>] [--model <model>] "
+    fprintf(fp, "Usage: %s [--help] [--keysym] [--rules <rules>] [--model <model>] "
                 "[--layout <layout>] [--variant <variant>] [--options <options>]"
-                " <unicode codepoint/keysym>\n", argv0);
+                " <character/codepoint/keysym>\n", argv0);
+    fprintf(fp,
+            "\n"
+            "Prints the key combinations (keycode + modifiers) in the keymap's "
+            "layouts which would produce the given Unicode codepoint or keysym.\n"
+            "\n"
+            "<character/codepoint/keysym> is either:\n"
+            "- a single character (requires a terminal which uses UTF-8 character "
+            "encoding);"
+            "- a Unicode code point, interpreted as hexadecimal if prefixed with"
+            "`0x` or `U+` else as decimal;\n"
+            "- a keysym if --keysym is used: either a numeric value (hexadecimal"
+            "if prefixed with 0x else decimal) or a keysym name.\n"
+            "\n"
+            "Options:\n"
+            " --help\n"
+            "    Print this help and exit\n"
+            " --keysym\n"
+            "    Treat the argument as a keysym, not a Unicode codepoint\n"
+            "XKB-specific options:\n"
+            " --rules <rules>\n"
+            "    The XKB ruleset (default: '%s')\n"
+            " --model <model>\n"
+            "    The XKB model (default: '%s')\n"
+            " --layout <layout>\n"
+            "    The XKB layout (default: '%s')\n"
+            " --variant <variant>\n"
+            "    The XKB layout variant (default: '%s')\n"
+            " --options <options>\n"
+            "    The XKB options (default: '%s')\n"
+            "\n",
+            DEFAULT_XKB_RULES, DEFAULT_XKB_MODEL, DEFAULT_XKB_LAYOUT,
+            DEFAULT_XKB_VARIANT ? DEFAULT_XKB_VARIANT : "<none>",
+            DEFAULT_XKB_OPTIONS ? DEFAULT_XKB_OPTIONS : "<none>");
 }
 
 int
@@ -117,41 +272,43 @@ main(int argc, char *argv[])
         }
     }
     if (argc - optind != 1) {
-        usage(argv[0], stderr);
-        exit(EXIT_INVALID_USAGE);
+        fprintf(stderr, "ERROR: missing positional parameter\n");
+        goto parse_error;
     }
 
     if (keysym_mode) {
+        // Try to parse keysym name or hexadecimal value (0xNNNN)
         keysym = xkb_keysym_from_name(argv[optind], XKB_KEYSYM_NO_FLAGS);
         if (keysym == XKB_KEY_NoSymbol) {
-            fprintf(stderr, "Failed to convert argument to keysym\n");
-            goto err;
+            // Try to parse numeric keysym in base 10, without prefix
+            val = strtol(argv[optind], &endp, 10);
+            if (errno != 0 || !isempty(endp) || val <= 0 || val > XKB_KEYSYM_MAX) {
+                fprintf(stderr, "ERROR: Failed to convert argument to keysym\n");
+                goto parse_error;
+            }
+            keysym = (uint32_t) val;
         }
     } else {
-        errno = 0;
-        val = strtol(argv[optind], &endp, 0);
-        if (errno != 0 || endp == argv[optind] || val < 0 || val > 0x10FFFF) {
-            usage(argv[0], stderr);
-            exit(EXIT_INVALID_USAGE);
-        }
-        codepoint = (uint32_t) val;
+        codepoint = parse_char_or_codepoint(argv[optind]);
+        if (codepoint == INVALID_UTF8_CODE_POINT)
+            goto parse_error;
 
         keysym = xkb_utf32_to_keysym(codepoint);
         if (keysym == XKB_KEY_NoSymbol) {
-            fprintf(stderr, "Failed to convert codepoint to keysym\n");
-            goto err;
+            fprintf(stderr, "ERROR: Failed to convert codepoint to keysym\n");
+            goto parse_error;
         }
     }
 
     ret = xkb_keysym_get_name(keysym, name, sizeof(name));
     if (ret < 0 || (size_t) ret >= sizeof(name)) {
-        fprintf(stderr, "Failed to get name of keysym\n");
+        fprintf(stderr, "ERROR: Failed to get name of keysym\n");
         goto err;
     }
 
     ctx = xkb_context_new(XKB_CONTEXT_NO_FLAGS);
     if (!ctx) {
-        fprintf(stderr, "Failed to create XKB context\n");
+        fprintf(stderr, "ERROR: Failed to create XKB context\n");
         goto err;
     }
 
@@ -165,7 +322,7 @@ main(int argc, char *argv[])
     keymap = xkb_keymap_new_from_names(ctx, &names,
                                        XKB_KEYMAP_COMPILE_NO_FLAGS);
     if (!keymap) {
-        fprintf(stderr, "Failed to create XKB keymap\n");
+        fprintf(stderr, "ERROR: Failed to create XKB keymap\n");
         goto err;
     }
 
@@ -237,4 +394,7 @@ main(int argc, char *argv[])
     xkb_keymap_unref(keymap);
     xkb_context_unref(ctx);
     return err;
+parse_error:
+    usage(argv[0], stderr);
+    exit(EXIT_INVALID_USAGE);
 }
diff --git a/tools/xkbcli-how-to-type.1 b/tools/xkbcli-how-to-type.1
@@ -1,21 +1,75 @@
-.Dd June 4, 2024
+.Dd September 11, 2024
 .Dt XKBCLI\-HOW\-TO\-TYPE 1
 .Os
 .
 .Sh NAME
 .Nm "xkbcli\-how\-to\-type"
-.Nd query how to type a given Unicode codepoint
+.Nd query how to type a given Unicode codepoint or keysym
 .
 .Sh SYNOPSIS
 .Nm
 .Op options
-.Ar codepoint/keysym
+.Ar character/codepoint/keysym
 .
 .Sh DESCRIPTION
 .Nm
 prints the key combinations (keycode + modifiers) in the keymap's layouts which
-would produce the given Unicode codepoint.
+would produce the given Unicode codepoint or keysym.
+.
+.Pp
+.Ar character/codepoint/keysym
+is either:
+.
+.Bl -bullet -compact
+.It
+a single character (requires a terminal which uses UTF-8 character encoding);
+.It
+a Unicode code point, interpreted as hexadecimal if prefixed with
+.Li 0x
+or
+.Li U+
+else as decimal;
+.
+.It
+a keysym if
+.Fl \-keysym
+is used: either a \fInumeric\fP value (hexadecimal if prefixed with
+.Li 0x
+else decimal) or a keysym \fIname\fP.
+.El
+.
+.Sh EXAMPLES
+.Bl -tag -width Ds
+.It Nm Fl \-layout Ar us 97
+.It Nm Fl \-layout Ar us 0x61
+.It Nm Fl \-layout Ar us U+0061
+.It Nm Fl \-layout Ar us a
+Print the key combinations that produce the letter "a"
+.Po
+decimal code point:
+.Ar 97 ,
+hexadecimal code point:
+.Ar 61
+.Pc
+in the default
+.Ar us
+layout.
+.It Nm Fl \-layout Ar us Fl \-keysym Ar 97
+.It Nm Fl \-layout Ar us Fl \-keysym Ar 0x61
+.It Nm Fl \-layout Ar us Fl \-keysym Ar a
+Print the key combinations that produce the keysym "a"
+.Po
+decimal code:
+.Ar 97 ,
+hexadecimal code:
+.Ar 61
+.Pc
+in the default
+.Ar us
+layout.
+.El
 .
+.Sh OPTIONS
 .Bl -tag -width Ds
 .It Fl \-keysym
 Treat the argument as a keysym, not a Unicode codepoint