Commit 3aaa4e2a534267dda2a22e97db3efcb8e4757536

Pierre Le Marre 2023-10-30T15:51:34

rules: early detection of invalid encoding

diff --git a/src/xkbcomp/rules.c b/src/xkbcomp/rules.c
index daa4f3d..918fe31 100644
--- a/src/xkbcomp/rules.c
+++ b/src/xkbcomp/rules.c
@@ -1084,29 +1084,36 @@ read_rules_file(struct xkb_context *ctx,
                 FILE *file,
                 const char *path)
 {
-    bool ret = false;
+    bool ret;
     char *string;
     size_t size;
     struct scanner scanner;
 
-    ret = map_file(file, &string, &size);
-    if (!ret) {
+    if (!map_file(file, &string, &size)) {
         log_err(ctx, XKB_LOG_MESSAGE_NO_ID,
                 "Couldn't read rules file \"%s\": %s\n",
                 path, strerror(errno));
-        goto out;
+        return false;
     }
 
     scanner_init(&scanner, matcher->ctx, string, size, path, NULL);
 
-    /* Skip UTF-8 encoded BOM (U+FEFF) */
-    /* See: https://www.unicode.org/faq/utf_bom.html#bom5 */
-    scanner_str(&scanner, "\xef\xbb\xbf", 3);
+    /* Basic detection of an unsupported character encoding.
+       The first character relevant to the grammar must be ASCII:
+       whitespace, '!' or '/' (which starts a comment). */
+    if (!scanner_check_supported_char_encoding(&scanner)) {
+        scanner_err(&scanner,
+            "This could be a file encoding issue. "
+            "Supported encodings must be backward compatible with ASCII.");
+        scanner_err(&scanner,
+            "E.g. ISO/CEI 8859 and UTF-8 are supported "
+            "but UTF-16, UTF-32 and CP1026 are not.");
+        unmap_file(string, size);
+        return false;
+    }
 
     ret = matcher_match(matcher, &scanner, include_depth, string, size, path);
-
     unmap_file(string, size);
-out:
     return ret;
 }
 
diff --git a/test/data/rules/utf-16be_with_bom b/test/data/rules/utf-16be_with_bom
new file mode 100644
index 0000000..ea44bab
Binary files /dev/null and b/test/data/rules/utf-16be_with_bom differ
diff --git a/test/data/rules/utf-16le_with_bom b/test/data/rules/utf-16le_with_bom
new file mode 100644
index 0000000..9faf37e
Binary files /dev/null and b/test/data/rules/utf-16le_with_bom differ
diff --git a/test/data/rules/utf-32be b/test/data/rules/utf-32be
new file mode 100644
index 0000000..588e32e
Binary files /dev/null and b/test/data/rules/utf-32be differ
diff --git a/test/rules-file.c b/test/rules-file.c
index 302aa68..726ec89 100644
--- a/test/rules-file.c
+++ b/test/rules-file.c
@@ -106,6 +106,42 @@ main(int argc, char *argv[])
     };
     assert(test_rules(ctx, &test_utf_8_with_bom));
 
+    struct test_data test_utf_16le_with_bom = {
+        .rules = "utf-16le_with_bom",
+
+        .model = "my_model", .layout = "my_layout", .variant = "my_variant",
+        .options = "my_option",
+
+        .keycodes = "my_keycodes", .types = "my_types",
+        .compat = "my_compat|some:compat",
+        .symbols = "my_symbols+extra_variant",
+    };
+    assert(!test_rules(ctx, &test_utf_16le_with_bom));
+
+    struct test_data test_utf_16be_with_bom = {
+        .rules = "utf-16be_with_bom",
+
+        .model = "my_model", .layout = "my_layout", .variant = "my_variant",
+        .options = "my_option",
+
+        .keycodes = "my_keycodes", .types = "my_types",
+        .compat = "my_compat|some:compat",
+        .symbols = "my_symbols+extra_variant",
+    };
+    assert(!test_rules(ctx, &test_utf_16be_with_bom));
+
+    struct test_data test_utf_32be = {
+        .rules = "utf-32be",
+
+        .model = "my_model", .layout = "my_layout", .variant = "my_variant",
+        .options = "my_option",
+
+        .keycodes = "my_keycodes", .types = "my_types",
+        .compat = "my_compat|some:compat",
+        .symbols = "my_symbols+extra_variant",
+    };
+    assert(!test_rules(ctx, &test_utf_32be));
+
     struct test_data test1 = {
         .rules = "simple",