Skip to content

Commit

Permalink
xkbcli how-to-type: Enhance arguments parsing & doc
Browse files Browse the repository at this point in the history
Currently the positional parameter of the CLI is either a Unicode code
point or a keysym. However their respective format is not documented.

It turns out that there are multiple issues due to the use of `strtol`:
- Code points can be parsed as octal, decimal and hexadecimal while
  keysyms can only be parsed as hexadecimal. Some programs output
  keysyms in their decimal form (e.g. `wev`), so it is worth bringing
  symmetry with code points.
- Octal format is unusual for both and is triggered by leading zeros,
  which is unintuitive in this context.
- `U+NNNN` format is the standard format for Unicode code points but is
  not supported.
- Plain characters are not supported, e.g.: a, é, ß, Æ, γ, 🦆, etc.,
  although this is probably the easiest format for most users.

Fixed the issues above:
- Allow the code point to be passed exactly in the following formats:
  - Literal character (requires UTF-8 character encoding of the terminal);
  - Decimal number;
  - Hexadecimal number: either `0xNNNN` or `U+NNNN`.
- Allow the keysym to be passed exactly in the following formats:
  - Decimal number;
  - Hexadecimal number: `0xNNNN`;
  - Name.
- Improve both `--help` message and manual.
  • Loading branch information
wismill committed Sep 12, 2024
1 parent 9af1f9f commit 9d4f1bf
Show file tree
Hide file tree
Showing 2 changed files with 236 additions and 22 deletions.
196 changes: 178 additions & 18 deletions tools/how-to-type.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,171 @@
#include <errno.h>

#include "xkbcommon/xkbcommon.h"
#include "src/utils.h"
#include "src/keysym.h"

#define ARRAY_SIZE(arr) ((sizeof(arr) / sizeof(*(arr))))

/* Array mapping the leading byte to the length of a UTF-8 sequence.
 * A value of zero indicates that the byte can not begin a UTF-8 sequence:
 * continuation bytes (0x80-0xBF), the always-overlong leaders 0xC0 and 0xC1,
 * and leaders that could only encode values above U+10FFFF (0xF5-0xFF). */
static const uint8_t utf8_sequence_length_by_leading_byte[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00-0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10-0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20-0x2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30-0x3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40-0x4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x50-0x5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60-0x6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80-0x8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90-0x9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0-0xAF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xB0-0xBF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xC0-0xCF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xD0-0xDF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xE0-0xEF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xF0-0xFF */
};

/* Length of next utf-8 sequence, as announced by its leading byte.
 * Returns 0 if the first byte of `s` cannot start a sequence.
 * Only s[0] is read. */
static uint8_t
utf8_sequence_length(const char *s)
{
    return utf8_sequence_length_by_leading_byte[(unsigned char)s[0]];
}

/* Check if a char is the start of a UTF-8 sequence
 * (i.e. it is not a 10xxxxxx continuation byte) */
#define is_utf8_start(c) (((c) & 0xc0) != 0x80)
/* Sentinel returned by the decoder; it is not a valid code point */
#define INVALID_UTF8_CODE_POINT UINT32_MAX

/* Reads the next UTF-8 sequence in a string.
 *
 * s:        bytes to decode; at most `max_size` bytes are read.
 * max_size: number of bytes available in `s`.
 * size_out: set to the number of bytes consumed, or to 0 on failure.
 *
 * Returns the decoded code point, or INVALID_UTF8_CODE_POINT if the buffer
 * is empty, the sequence is truncated or malformed, the encoding is overlong,
 * or the value is a surrogate or lies above U+10FFFF. */
static uint32_t
utf8_next_code_point(const char *s, size_t max_size, size_t *size_out)
{
    /* Smallest code point that legitimately needs a sequence of each length;
     * a smaller decoded value means the encoding is overlong. */
    static const uint32_t min_code_point[5] = { 0, 0, 0x80, 0x800, 0x10000 };
    /* Work on unsigned bytes so that no step relies on sign extension */
    const unsigned char *bytes = (const unsigned char *) s;
    uint32_t cp = 0;

    *size_out = 0;

    /* Check the size first so that an empty buffer is never read */
    if (!max_size)
        return INVALID_UTF8_CODE_POINT;

    const uint8_t len = utf8_sequence_length(s);
    if (len > max_size)
        return INVALID_UTF8_CODE_POINT;

    /* Handle leading byte */
    switch (len) {
    case 1:
        *size_out = 1;
        return bytes[0];
    case 2:
        cp = bytes[0] & 0x1f;
        break;
    case 3:
        cp = bytes[0] & 0x0f;
        break;
    case 4:
        cp = bytes[0] & 0x07;
        break;
    default:
        /* Byte cannot start a sequence */
        return INVALID_UTF8_CODE_POINT;
    }

    /* Process remaining bytes of the UTF-8 sequence */
    for (size_t k = 1; k < len; k++) {
        if ((bytes[k] & 0xc0) != 0x80)
            return INVALID_UTF8_CODE_POINT;
        cp = (cp << 6) | (bytes[k] & 0x3f);
    }

    /* Reject overlong encodings and values outside the Unicode range */
    if (cp < min_code_point[len] || cp > 0x10ffff)
        return INVALID_UTF8_CODE_POINT;

    /* Check surrogates */
    if (cp >= 0xd800 && cp <= 0xdfff)
        return INVALID_UTF8_CODE_POINT;

    *size_out = len;
    return cp;
}

/* Value of an ASCII digit in the given base (10 or 16), or -1 if the
 * character is not a digit of that base. */
static int
codepoint_digit_value(char c, int base)
{
    if (c >= '0' && c <= '9')
        return c - '0';
    if (base == 16) {
        if (c >= 'a' && c <= 'f')
            return c - 'a' + 10;
        if (c >= 'A' && c <= 'F')
            return c - 'A' + 10;
    }
    return -1;
}

/* Parse the positional argument as a Unicode code point.
 *
 * Accepted formats, tried in this order:
 *   1. a single UTF-8 encoded character spanning the whole argument
 *      (so "1" is the character U+0031, not the code point 1);
 *   2. a hexadecimal number prefixed with `U+`, `0x` or `0X`;
 *   3. a decimal number (leading zeros do not mean octal).
 *
 * Returns the code point, or INVALID_UTF8_CODE_POINT if `raw` is empty or
 * matches none of the formats; in the latter case an error is printed to
 * stderr. */
static uint32_t
parse_char_or_codepoint(const char *raw) {
    const size_t raw_length = strlen_safe(raw);
    if (!raw_length)
        return INVALID_UTF8_CODE_POINT;

    /* Try to parse the parameter as a UTF-8 encoded single character */
    size_t length = 0;
    const uint32_t character = utf8_next_code_point(raw, raw_length, &length);
    if (character != INVALID_UTF8_CODE_POINT && length == raw_length)
        return character;

    /* Otherwise it must be a number. Hexadecimal requires an explicit
     * prefix, so that unprefixed input such as "ff" is not silently
     * accepted as a number. */
    int base = 10;
    const char *digits = raw;
    if (raw_length >= 2 &&
        ((raw[0] == 'U' && raw[1] == '+') ||
         (raw[0] == '0' && (raw[1] == 'x' || raw[1] == 'X')))) {
        base = 16;
        digits += 2;
    }

    /* Digits are accumulated by hand rather than with strtol, which would
     * also accept leading whitespace, a sign, a second `0x` prefix and an
     * empty digit string (e.g. a bare "U+" would yield 0). */
    uint32_t codepoint = 0;
    const char *p = digits;
    for (; *p != '\0'; p++) {
        const int digit = codepoint_digit_value(*p, base);
        if (digit < 0)
            break;
        codepoint = codepoint * (uint32_t) base + (uint32_t) digit;
        /* Stop before the accumulator can overflow */
        if (codepoint > 0x10FFFF)
            break;
    }

    /* Fail if there were no digits or the loop stopped early */
    if (p == digits || *p != '\0') {
        fprintf(stderr, "ERROR: Failed to convert argument to Unicode code point\n");
        return INVALID_UTF8_CODE_POINT;
    }
    return codepoint;
}

/* Print the command-line help to `fp`.
 *
 * argv0: program name shown in the "Usage:" line.
 * fp:    destination stream.
 *
 * The defaults shown for the XKB options come from the DEFAULT_XKB_* build
 * macros; "<none>" is printed when the variant or options default is null. */
static void
usage(const char *argv0, FILE *fp)
{
    fprintf(fp, "Usage: %s [--help] [--keysym] [--rules <rules>] [--model <model>] "
                "[--layout <layout>] [--variant <variant>] [--options <options>]"
                " <character/codepoint/keysym>\n", argv0);
    /* Adjacent literals are concatenated as-is: each wrapped line must end
     * with a space or a newline, else words get glued together. */
    fprintf(fp,
            "\n"
            "Prints the key combinations (keycode + modifiers) in the keymap's "
            "layouts which would produce the given Unicode codepoint or keysym.\n"
            "\n"
            "<character/codepoint/keysym> is either:\n"
            "- a single character (requires a terminal which uses UTF-8 character "
            "encoding);\n"
            "- a Unicode code point, interpreted as hexadecimal if prefixed with "
            "`0x` or `U+` else as decimal;\n"
            "- a keysym if --keysym is used: either a numeric value (hexadecimal "
            "if prefixed with 0x else decimal) or a keysym name.\n"
            "\n"
            "Options:\n"
            " --help\n"
            "    Print this help and exit\n"
            " --keysym\n"
            "    Treat the argument as a keysym, not a Unicode codepoint\n"
            "XKB-specific options:\n"
            " --rules <rules>\n"
            "    The XKB ruleset (default: '%s')\n"
            " --model <model>\n"
            "    The XKB model (default: '%s')\n"
            " --layout <layout>\n"
            "    The XKB layout (default: '%s')\n"
            " --variant <variant>\n"
            "    The XKB layout variant (default: '%s')\n"
            " --options <options>\n"
            "    The XKB options (default: '%s')\n"
            "\n",
            DEFAULT_XKB_RULES, DEFAULT_XKB_MODEL, DEFAULT_XKB_LAYOUT,
            DEFAULT_XKB_VARIANT ? DEFAULT_XKB_VARIANT : "<none>",
            DEFAULT_XKB_OPTIONS ? DEFAULT_XKB_OPTIONS : "<none>");
}

int
Expand Down Expand Up @@ -117,41 +272,43 @@ main(int argc, char *argv[])
}
}
if (argc - optind != 1) {
usage(argv[0], stderr);
exit(EXIT_INVALID_USAGE);
fprintf(stderr, "ERROR: missing positional parameter\n");
goto parse_error;
}

if (keysym_mode) {
// Try to parse keysym name or hexadecimal value (0xNNNN)
keysym = xkb_keysym_from_name(argv[optind], XKB_KEYSYM_NO_FLAGS);
if (keysym == XKB_KEY_NoSymbol) {
fprintf(stderr, "Failed to convert argument to keysym\n");
goto err;
// Try to parse numeric keysym in base 10, without prefix
val = strtol(argv[optind], &endp, 10);
if (errno != 0 || !isempty(endp) || val <= 0 || val > XKB_KEYSYM_MAX) {
fprintf(stderr, "ERROR: Failed to convert argument to keysym\n");
goto parse_error;
}
keysym = (uint32_t) val;
}
} else {
errno = 0;
val = strtol(argv[optind], &endp, 0);
if (errno != 0 || endp == argv[optind] || val < 0 || val > 0x10FFFF) {
usage(argv[0], stderr);
exit(EXIT_INVALID_USAGE);
}
codepoint = (uint32_t) val;
codepoint = parse_char_or_codepoint(argv[optind]);
if (codepoint == INVALID_UTF8_CODE_POINT)
goto parse_error;

keysym = xkb_utf32_to_keysym(codepoint);
if (keysym == XKB_KEY_NoSymbol) {
fprintf(stderr, "Failed to convert codepoint to keysym\n");
goto err;
fprintf(stderr, "ERROR: Failed to convert codepoint to keysym\n");
goto parse_error;
}
}

ret = xkb_keysym_get_name(keysym, name, sizeof(name));
if (ret < 0 || (size_t) ret >= sizeof(name)) {
fprintf(stderr, "Failed to get name of keysym\n");
fprintf(stderr, "ERROR: Failed to get name of keysym\n");
goto err;
}

ctx = xkb_context_new(XKB_CONTEXT_NO_FLAGS);
if (!ctx) {
fprintf(stderr, "Failed to create XKB context\n");
fprintf(stderr, "ERROR: Failed to create XKB context\n");
goto err;
}

Expand All @@ -165,7 +322,7 @@ main(int argc, char *argv[])
keymap = xkb_keymap_new_from_names(ctx, &names,
XKB_KEYMAP_COMPILE_NO_FLAGS);
if (!keymap) {
fprintf(stderr, "Failed to create XKB keymap\n");
fprintf(stderr, "ERROR: Failed to create XKB keymap\n");
goto err;
}

Expand Down Expand Up @@ -237,4 +394,7 @@ main(int argc, char *argv[])
xkb_keymap_unref(keymap);
xkb_context_unref(ctx);
return err;
parse_error:
usage(argv[0], stderr);
exit(EXIT_INVALID_USAGE);
}
62 changes: 58 additions & 4 deletions tools/xkbcli-how-to-type.1
Original file line number Diff line number Diff line change
@@ -1,21 +1,75 @@
.Dd June 4, 2024
.Dd September 11, 2024
.Dt XKBCLI\-HOW\-TO\-TYPE 1
.Os
.
.Sh NAME
.Nm "xkbcli\-how\-to\-type"
.Nd query how to type a given Unicode codepoint
.Nd query how to type a given Unicode codepoint or keysym
.
.Sh SYNOPSIS
.Nm
.Op options
.Ar codepoint/keysym
.Ar character/codepoint/keysym
.
.Sh DESCRIPTION
.Nm
prints the key combinations (keycode + modifiers) in the keymap's layouts which
would produce the given Unicode codepoint.
would produce the given Unicode codepoint or keysym.
.
.Pp
.Ar character/codepoint/keysym
is either:
.
.Bl -bullet -compact
.It
a single character (requires a terminal which uses UTF-8 character encoding);
.It
a Unicode code point, interpreted as hexadecimal if prefixed with
.Li 0x
or
.Li U+
else as decimal;
.
.It
a keysym if
.Fl \-keysym
is used: either a \fInumeric\fP value (hexadecimal if prefixed with
.Li 0x
else decimal) or a keysym \fIname\fP.
.El
.
.Sh EXAMPLES
.Bl -tag -width Ds
.It Nm Fl \-layout Ar us 97
.It Nm Fl \-layout Ar us 0x61
.It Nm Fl \-layout Ar us U+0061
.It Nm Fl \-layout Ar us a
Print the key combinations that produce the letter "a"
.Po
decimal code point:
.Ar 97 ,
hexadecimal code point:
.Ar 61
.Pc
in the default
.Ar us
layout.
.It Nm Fl \-layout Ar us Fl \-keysym Ar 97
.It Nm Fl \-layout Ar us Fl \-keysym Ar 0x61
.It Nm Fl \-layout Ar us Fl \-keysym Ar a
Print the key combinations that produce the keysym "a"
.Po
decimal code:
.Ar 97 ,
hexadecimal code:
.Ar 61
.Pc
in the default
.Ar us
layout.
.El
.
.Sh OPTIONS
.Bl -tag -width Ds
.It Fl \-keysym
Treat the argument as a keysym, not a Unicode codepoint
Expand Down

0 comments on commit 9d4f1bf

Please sign in to comment.