Do not interpret or emit invalid Unicode encoding forms. Surrogates are invalid in both UTF-32 and UTF-8. See https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G28875 and https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G31703
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138
diff --git a/src/keysym-utf.c b/src/keysym-utf.c
index a9d46d1..0bb9a4f 100644
--- a/src/keysym-utf.c
+++ b/src/keysym-utf.c
@@ -41,6 +41,8 @@
#include "utils.h"
#include "utf8.h"
+#define NO_KEYSYM_UNICODE_CONVERSION 0
+
/* We don't use the uint32_t types here, to save some space. */
struct codepair {
uint16_t keysym;
@@ -847,7 +849,7 @@ bin_search(const struct codepair *table, size_t length, xkb_keysym_t keysym)
}
/* no matching Unicode value found in table */
- return 0;
+ return NO_KEYSYM_UNICODE_CONVERSION;
}
XKB_EXPORT uint32_t
@@ -871,6 +873,13 @@ xkb_keysym_to_utf32(xkb_keysym_t keysym)
return keysym & 0x7f;
/* also check for directly encoded Unicode codepoints */
+
+ /* Exclude surrogates: they are invalid in UTF-32.
+ * See https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G28875
+ * for further details.
+ */
+ if (0x0100d800 <= keysym && keysym <= 0x0100dfff)
+ return NO_KEYSYM_UNICODE_CONVERSION;
/*
* In theory, this is supposed to start from 0x100100, such that the ASCII
* range, which is already covered by 0x00-0xff, can't be encoded in two
@@ -900,7 +909,8 @@ xkb_utf32_to_keysym(uint32_t ucs)
return XKB_KEY_Delete;
/* Unicode non-symbols and code points outside Unicode planes */
- if ((ucs >= 0xfdd0 && ucs <= 0xfdef) ||
+ if ((ucs >= 0xd800 && ucs <= 0xdfff) ||
+ (ucs >= 0xfdd0 && ucs <= 0xfdef) ||
ucs > 0x10ffff || (ucs & 0xfffe) == 0xfffe)
return XKB_KEY_NoSymbol;
@@ -948,7 +958,7 @@ xkb_keysym_to_utf8(xkb_keysym_t keysym, char *buffer, size_t size)
codepoint = xkb_keysym_to_utf32(keysym);
- if (codepoint == 0)
+ if (codepoint == NO_KEYSYM_UNICODE_CONVERSION)
return 0;
return utf32_to_utf8(codepoint, buffer);
diff --git a/src/utf8.c b/src/utf8.c
index 15aa237..d37ba8e 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -32,6 +32,11 @@
#include "utf8.h"
+/* Conformant encoding form conversion from UTF-32 to UTF-8.
+ *
+ * See https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G28875
+ * for further details.
+*/
int
utf32_to_utf8(uint32_t unichar, char *buffer)
{
@@ -47,6 +52,10 @@ utf32_to_utf8(uint32_t unichar, char *buffer)
length = 2;
head = 0xc0;
}
+ /* Reject surrogates: they are not Unicode scalar values, so they have no well-formed UTF-8 encoding */
+ else if (0xd800 <= unichar && unichar <= 0xdfff) {
+ goto ill_formed_code_unit_subsequence;
+ }
else if (unichar <= 0xffff) {
length = 3;
head = 0xe0;
@@ -56,8 +65,7 @@ utf32_to_utf8(uint32_t unichar, char *buffer)
head = 0xf0;
}
else {
- buffer[0] = '\0';
- return 0;
+ goto ill_formed_code_unit_subsequence;
}
for (count = length - 1, shift = 0; count > 0; count--, shift += 6)
@@ -67,6 +75,10 @@ utf32_to_utf8(uint32_t unichar, char *buffer)
buffer[length] = '\0';
return length + 1;
+
+ill_formed_code_unit_subsequence:
+ buffer[0] = '\0';
+ return 0;
}
bool
diff --git a/test/keysym.c b/test/keysym.c
index 38f967d..a4dba0c 100644
--- a/test/keysym.c
+++ b/test/keysym.c
@@ -222,6 +222,8 @@ main(void)
assert(test_utf8(0x10005d0, "א"));
assert(test_utf8(0x110ffff, "\xf4\x8f\xbf\xbf"));
+ assert(test_utf8(0x0100d800, NULL) == 0); // Unicode surrogates
+ assert(test_utf8(0x0100dfff, NULL) == 0); // Unicode surrogates
assert(test_utf8(0x1110000, NULL) == 0);
assert(test_utf32_to_keysym('y', XKB_KEY_y));
@@ -255,6 +257,8 @@ main(void)
assert(test_utf32_to_keysym(0x20ac, XKB_KEY_EuroSign));
// Unicode non-characters
+ assert(test_utf32_to_keysym(0xd800, XKB_KEY_NoSymbol)); // Unicode surrogates
+ assert(test_utf32_to_keysym(0xdfff, XKB_KEY_NoSymbol)); // Unicode surrogates
assert(test_utf32_to_keysym(0xfdd0, XKB_KEY_NoSymbol));
assert(test_utf32_to_keysym(0xfdef, XKB_KEY_NoSymbol));
assert(test_utf32_to_keysym(0xfffe, XKB_KEY_NoSymbol));
diff --git a/test/utf8.c b/test/utf8.c
index 214e356..aa3c0d5 100644
--- a/test/utf8.c
+++ b/test/utf8.c
@@ -170,6 +170,8 @@ test_utf32_to_utf8(void)
check_utf32_to_utf8(0x40, 2, "\x40");
check_utf32_to_utf8(0xA1, 3, "\xc2\xa1");
check_utf32_to_utf8(0x2701, 4, "\xe2\x9c\x81");
+ check_utf32_to_utf8(0xd800, 0, ""); // Unicode surrogates
+ check_utf32_to_utf8(0xdfff, 0, ""); // Unicode surrogates
check_utf32_to_utf8(0x1f004, 5, "\xf0\x9f\x80\x84");
check_utf32_to_utf8(0x110000, 0, "");
check_utf32_to_utf8(0xffffffff, 0, "");