Commit a2da57aba19b829825d54fa14cd004703ee33e56

Pierre Le Marre 2023-10-30T14:50:00

Compose: early detection of invalid encoding. Also move the “unrecognized token” error message before skipping the rest of the line, so that the reported token position is correct.

diff --git a/src/compose/parser.c b/src/compose/parser.c
index 4112baa..ac11446 100644
--- a/src/compose/parser.c
+++ b/src/compose/parser.c
@@ -57,8 +57,8 @@ OR PERFORMANCE OF THIS SOFTWARE.
 #include <errno.h>
 
 #include "utils.h"
-#include "scanner-utils.h"
 #include "table.h"
+#include "scanner-utils.h"
 #include "paths.h"
 #include "utf8.h"
 #include "parser.h"
@@ -244,10 +244,9 @@ skip_more_whitespace_and_comments:
         return TOK_IDENT;
     }
 
+    scanner_err(s, "unrecognized token");
     /* Discard rest of line. */
     scanner_skip_to_eol(s);
-
-    scanner_err(s, "unrecognized token");
     return TOK_ERROR;
 }
 
@@ -527,9 +526,15 @@ parse(struct xkb_compose_table *table, struct scanner *s,
     enum { MAX_ERRORS = 10 };
     int num_errors = 0;
 
-    /* Skip UTF-8 encoded BOM (U+FEFF) */
-    /* See: https://www.unicode.org/faq/utf_bom.html#bom5 */
-    scanner_str(s, "\xef\xbb\xbf", 3);
+    /* Basic detection of wrong character encoding.
+       The first character relevant to the grammar must be ASCII:
+       whitespace, include, modifier list, keysym, comment */
+    if (!scanner_check_supported_char_encoding(s)) {
+        scanner_err(s,
+                    "This could be a file encoding issue. "
+                    "Supported file encodings are ASCII and UTF-8.");
+        goto fail;
+    }
 
 initial:
     production.len = 0;
diff --git a/src/scanner-utils.h b/src/scanner-utils.h
index 674ecaa..d9d2b42 100644
--- a/src/scanner-utils.h
+++ b/src/scanner-utils.h
@@ -212,4 +212,33 @@ scanner_hex(struct scanner *s, uint8_t *out)
     return i > 0;
 }
 
+/* Basic detection of wrong character encoding based on the first bytes */
+static inline bool
+scanner_check_supported_char_encoding(struct scanner *scanner)
+{
+    /* Skip UTF-8 encoded BOM (U+FEFF)
+     * See: https://www.unicode.org/faq/utf_bom.html#bom5 */
+    if (scanner_str(scanner, "\xef\xbb\xbf", 3) || scanner->len < 2) {
+        /* Assume UTF-8 encoding or trivial short input */
+        return true;
+    }
+
+    /* Early detection of wrong file encoding, e.g. UTF-16 or UTF-32 */
+    if (scanner->s[0] == '\0' || scanner->s[1] == '\0') {
+        if (scanner->s[0] != '\0')
+            scanner->token_column++;
+        scanner_err(scanner, "unexpected NULL character.");
+        return false;
+    }
+    /* Enforce the first character to be ASCII.
+       See the note before the use of this function, which explains the
+       relevant parts of the grammars of rules, keymap components and Compose. */
+    if (!is_ascii(scanner->s[0])) {
+        scanner_err(scanner, "unexpected non-ASCII character.");
+        return false;
+    }
+
+    return true;
+}
+
 #endif
diff --git a/src/utils.h b/src/utils.h
index aa7969c..d6efa51 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -174,6 +174,12 @@ strndup(const char *s, size_t n)
 
 /* ctype.h is locale-dependent and has other oddities. */
 static inline bool
+is_ascii(char ch)
+{
+    return (ch & ~0x7f) == 0;
+}
+
+static inline bool
 is_space(char ch)
 {
     return ch == ' ' || (ch >= '\t' && ch <= '\r');
diff --git a/test/compose.c b/test/compose.c
index d7192f6..56bd889 100644
--- a/test/compose.c
+++ b/test/compose.c
@@ -175,12 +175,86 @@ test_compose_seq_buffer(struct xkb_context *ctx, const char *buffer, ...)
 static void
 test_compose_utf8_bom(struct xkb_context *ctx)
 {
-    const char *buffer = "\xef\xbb\xbf<A> : X";
+    const char buffer[] = "\xef\xbb\xbf<A> : X";
     assert(test_compose_seq_buffer(ctx, buffer,
         XKB_KEY_A, XKB_COMPOSE_FEED_ACCEPTED, XKB_COMPOSE_COMPOSED, "X", XKB_KEY_X,
         XKB_KEY_NoSymbol));
 }
 
+static void
+test_invalid_encodings(struct xkb_context *ctx)
+{
+    struct xkb_compose_table *table;
+
+    /* ISO 8859-1 (latin1) */
+    const char iso_8859_1[] = "<A> : \"\xe1\" acute";
+    assert(!test_compose_seq_buffer(ctx, iso_8859_1,
+        XKB_KEY_A, XKB_COMPOSE_FEED_ACCEPTED, XKB_COMPOSE_COMPOSED, "\xc3\xa1", XKB_KEY_acute,
+        XKB_KEY_NoSymbol));
+
+    /* UTF-16LE */
+    const char utf_16_le[] =
+        "<\0A\0>\0 \0:\0 \0X\0\n\0"
+        "<\0B\0>\0 \0:\0 \0Y\0";
+    table = xkb_compose_table_new_from_buffer(ctx,
+                                              utf_16_le, sizeof(utf_16_le), "",
+                                              XKB_COMPOSE_FORMAT_TEXT_V1,
+                                              XKB_COMPOSE_COMPILE_NO_FLAGS);
+    assert(!table);
+
+    /* UTF-16BE */
+    const char utf_16_be[] =
+        "\0<\0A\0>\0 \0:\0 \0X\0\n"
+        "\0<\0B\0>\0 \0:\0 \0Y";
+    table = xkb_compose_table_new_from_buffer(ctx,
+                                              utf_16_be, sizeof(utf_16_be), "",
+                                              XKB_COMPOSE_FORMAT_TEXT_V1,
+                                              XKB_COMPOSE_COMPILE_NO_FLAGS);
+    assert(!table);
+
+    /* UTF-16BE with BOM */
+    const char utf_16_be_bom[] =
+        "\xfe\xff"
+        "\0<\0A\0>\0 \0:\0 \0X\0\n"
+        "\0<\0B\0>\0 \0:\0 \0Y";
+    table = xkb_compose_table_new_from_buffer(ctx,
+                                              utf_16_be_bom, sizeof(utf_16_be_bom), "",
+                                              XKB_COMPOSE_FORMAT_TEXT_V1,
+                                              XKB_COMPOSE_COMPILE_NO_FLAGS);
+    assert(!table);
+
+    /* UTF-32LE */
+    const char utf_32_le[] =
+        "<\0\0\0A\0\0\0>\0\0\0 \0\0\0:\0\0\0 \0\0\0X\0\0\0\n\0\0\0"
+        "<\0\0\0B\0\0\0>\0\0\0 \0\0\0:\0\0\0 \0\0\0Y\0\0\0";
+    table = xkb_compose_table_new_from_buffer(ctx,
+                                              utf_32_le, sizeof(utf_32_le), "",
+                                              XKB_COMPOSE_FORMAT_TEXT_V1,
+                                              XKB_COMPOSE_COMPILE_NO_FLAGS);
+    assert(!table);
+
+    /* UTF-32LE with BOM */
+    const char utf_32_le_bom[] =
+        "\xff\xfe\0\0"
+        "<\0\0\0A\0\0\0>\0\0\0 \0\0\0:\0\0\0 \0\0\0X\0\0\0\n\0\0\0"
+        "<\0\0\0B\0\0\0>\0\0\0 \0\0\0:\0\0\0 \0\0\0Y\0\0\0";
+    table = xkb_compose_table_new_from_buffer(ctx,
+                                              utf_32_le_bom, sizeof(utf_32_le_bom), "",
+                                              XKB_COMPOSE_FORMAT_TEXT_V1,
+                                              XKB_COMPOSE_COMPILE_NO_FLAGS);
+    assert(!table);
+
+    /* UTF-32BE */
+    const char utf_32_be[] =
+        "\0\0\0<\0\0\0A\0\0\0>\0\0\0 \0\0\0:\0\0\0 \0\0\0X\0\0\0\n\0\0\0"
+        "<\0\0\0B\0\0\0>\0\0\0 \0\0\0:\0\0\0 \0\0\0Y";
+    table = xkb_compose_table_new_from_buffer(ctx,
+                                              utf_32_be, sizeof(utf_32_be), "",
+                                              XKB_COMPOSE_FORMAT_TEXT_V1,
+                                              XKB_COMPOSE_COMPILE_NO_FLAGS);
+    assert(!table);
+}
+
 
 static void
 test_seqs(struct xkb_context *ctx)
@@ -734,6 +808,7 @@ main(int argc, char *argv[])
 #endif
 
     test_compose_utf8_bom(ctx);
+    test_invalid_encodings(ctx);
     test_seqs(ctx);
     test_conflicting(ctx);
     test_XCOMPOSEFILE(ctx);