Commit 183761ac24544b355aaf362e62d05fa1c184baf8

Pierre Le Marre 2023-05-13T17:26:24

Do not interpret or emit invalid Unicode encoding forms. Surrogates are invalid in both UTF-32 and UTF-8. See https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G28875 and https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G31703

diff --git a/src/keysym-utf.c b/src/keysym-utf.c
index a9d46d1..0bb9a4f 100644
--- a/src/keysym-utf.c
+++ b/src/keysym-utf.c
@@ -41,6 +41,8 @@
 #include "utils.h"
 #include "utf8.h"
 
+#define NO_KEYSYM_UNICODE_CONVERSION 0
+
 /* We don't use the uint32_t types here, to save some space. */
 struct codepair {
     uint16_t keysym;
@@ -847,7 +849,7 @@ bin_search(const struct codepair *table, size_t length, xkb_keysym_t keysym)
     }
 
     /* no matching Unicode value found in table */
-    return 0;
+    return NO_KEYSYM_UNICODE_CONVERSION;
 }
 
 XKB_EXPORT uint32_t
@@ -871,6 +873,13 @@ xkb_keysym_to_utf32(xkb_keysym_t keysym)
         return keysym & 0x7f;
 
     /* also check for directly encoded Unicode codepoints */
+
+    /* Exclude surrogates: they are invalid in UTF-32.
+     * See https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G28875
+     * for further details.
+    */
+    if (0x0100d800 <= keysym && keysym <= 0x0100dfff)
+        return NO_KEYSYM_UNICODE_CONVERSION;
     /*
      * In theory, this is supposed to start from 0x100100, such that the ASCII
      * range, which is already covered by 0x00-0xff, can't be encoded in two
@@ -900,7 +909,8 @@ xkb_utf32_to_keysym(uint32_t ucs)
         return XKB_KEY_Delete;
 
     /* Unicode non-symbols and code points outside Unicode planes */
-    if ((ucs >= 0xfdd0 && ucs <= 0xfdef) ||
+    if ((ucs >= 0xd800 && ucs <= 0xdfff) ||
+        (ucs >= 0xfdd0 && ucs <= 0xfdef) ||
         ucs > 0x10ffff || (ucs & 0xfffe) == 0xfffe)
         return XKB_KEY_NoSymbol;
 
@@ -948,7 +958,7 @@ xkb_keysym_to_utf8(xkb_keysym_t keysym, char *buffer, size_t size)
 
     codepoint = xkb_keysym_to_utf32(keysym);
 
-    if (codepoint == 0)
+    if (codepoint == NO_KEYSYM_UNICODE_CONVERSION)
         return 0;
 
     return utf32_to_utf8(codepoint, buffer);
diff --git a/src/utf8.c b/src/utf8.c
index 15aa237..d37ba8e 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -32,6 +32,11 @@
 
 #include "utf8.h"
 
+/* Conformant encoding form conversion from UTF-32 to UTF-8.
+ *
+ * See https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G28875
+ * for further details.
+*/
 int
 utf32_to_utf8(uint32_t unichar, char *buffer)
 {
@@ -47,6 +52,10 @@ utf32_to_utf8(uint32_t unichar, char *buffer)
         length = 2;
         head = 0xc0;
     }
+    /* Handle surrogates */
+    else if (0xd800 <= unichar && unichar <= 0xdfff) {
+        goto ill_formed_code_unit_subsequence;
+    }
     else if (unichar <= 0xffff) {
         length = 3;
         head = 0xe0;
@@ -56,8 +65,7 @@ utf32_to_utf8(uint32_t unichar, char *buffer)
         head = 0xf0;
     }
     else {
-        buffer[0] = '\0';
-        return 0;
+        goto ill_formed_code_unit_subsequence;
     }
 
     for (count = length - 1, shift = 0; count > 0; count--, shift += 6)
@@ -67,6 +75,10 @@ utf32_to_utf8(uint32_t unichar, char *buffer)
     buffer[length] = '\0';
 
     return length + 1;
+
+ill_formed_code_unit_subsequence:
+    buffer[0] = '\0';
+    return 0;
 }
 
 bool
diff --git a/test/keysym.c b/test/keysym.c
index 38f967d..a4dba0c 100644
--- a/test/keysym.c
+++ b/test/keysym.c
@@ -222,6 +222,8 @@ main(void)
 
     assert(test_utf8(0x10005d0, "א"));
     assert(test_utf8(0x110ffff, "\xf4\x8f\xbf\xbf"));
+    assert(test_utf8(0x0100d800, NULL) == 0); // Unicode surrogates
+    assert(test_utf8(0x0100dfff, NULL) == 0); // Unicode surrogates
     assert(test_utf8(0x1110000, NULL) == 0);
 
     assert(test_utf32_to_keysym('y', XKB_KEY_y));
@@ -255,6 +257,8 @@ main(void)
     assert(test_utf32_to_keysym(0x20ac, XKB_KEY_EuroSign));
 
     // Unicode non-characters
+    assert(test_utf32_to_keysym(0xd800, XKB_KEY_NoSymbol)); // Unicode surrogates
+    assert(test_utf32_to_keysym(0xdfff, XKB_KEY_NoSymbol)); // Unicode surrogates
     assert(test_utf32_to_keysym(0xfdd0, XKB_KEY_NoSymbol));
     assert(test_utf32_to_keysym(0xfdef, XKB_KEY_NoSymbol));
     assert(test_utf32_to_keysym(0xfffe, XKB_KEY_NoSymbol));
diff --git a/test/utf8.c b/test/utf8.c
index 214e356..aa3c0d5 100644
--- a/test/utf8.c
+++ b/test/utf8.c
@@ -170,6 +170,8 @@ test_utf32_to_utf8(void)
     check_utf32_to_utf8(0x40, 2, "\x40");
     check_utf32_to_utf8(0xA1, 3, "\xc2\xa1");
     check_utf32_to_utf8(0x2701, 4, "\xe2\x9c\x81");
+    check_utf32_to_utf8(0xd800, 0, ""); // Unicode surrogates
+    check_utf32_to_utf8(0xdfff, 0, ""); // Unicode surrogates
     check_utf32_to_utf8(0x1f004, 5, "\xf0\x9f\x80\x84");
     check_utf32_to_utf8(0x110000, 0, "");
     check_utf32_to_utf8(0xffffffff, 0, "");