Commit d0e3ed79bffc29c3750a3d2b4f5c1f363ec9198a

Martin Mitas 2020-03-12T22:45:32

md2html: Skip UTF-8 BOM, if present in the input.

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e828bdd..c875352 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,18 @@
 # MD4C Change Log
 
 
+## Next Version (Work in Progress)
+
+New features:
+
+ * With `MD_RENDER_FLAG_SKIP_UTF8_BOM`, the HTML renderer now skips the UTF-8
+   byte order mark (BOM) if the input begins with it, before passing the input
+   to the Markdown parser.
+
+   The `md2html` utility automatically enables the flag (unless it is
+   custom-built with `-DMD4C_USE_ASCII`).
+
+
 ## Version 0.4.3
 
 New features:
diff --git a/md2html/md2html.c b/md2html/md2html.c
index 8f887fb..01e947d 100644
--- a/md2html/md2html.c
+++ b/md2html/md2html.c
@@ -35,7 +35,11 @@
 
 /* Global options. */
 static unsigned parser_flags = 0;
-static unsigned renderer_flags = MD_RENDER_FLAG_DEBUG;
+#ifndef MD4C_USE_ASCII
+    static unsigned renderer_flags = MD_RENDER_FLAG_DEBUG | MD_RENDER_FLAG_SKIP_UTF8_BOM;
+#else
+    static unsigned renderer_flags = MD_RENDER_FLAG_DEBUG;
+#endif
 static int want_fullhtml = 0;
 static int want_stat = 0;
 
diff --git a/md2html/render_html.c b/md2html/render_html.c
index 896b37f..42b6fff 100644
--- a/md2html/render_html.c
+++ b/md2html/render_html.c
@@ -556,6 +556,15 @@ md_render_html(const MD_CHAR* input, MD_SIZE input_size,
             render.escape_map[i] |= NEED_URL_ESC_FLAG;
     }
 
+    /* Consider skipping UTF-8 byte order mark (BOM). */
+    if(renderer_flags & MD_RENDER_FLAG_SKIP_UTF8_BOM  &&  sizeof(MD_CHAR) == 1) {
+        static const MD_CHAR bom[3] = { 0xef, 0xbb, 0xbf };
+        if(input_size >= sizeof(bom)  &&  memcmp(input, bom, sizeof(bom)) == 0) {
+            input += sizeof(bom);
+            input_size -= sizeof(bom);
+        }
+    }
+
     return md_parse(input, input_size, &parser, (void*) &render);
 }
 
diff --git a/md2html/render_html.h b/md2html/render_html.h
index 968dc8e..2e9a77b 100644
--- a/md2html/render_html.h
+++ b/md2html/render_html.h
@@ -36,6 +36,7 @@
 /* If set, debug output from md_parse() is sent to stderr. */
 #define MD_RENDER_FLAG_DEBUG                0x0001
 #define MD_RENDER_FLAG_VERBATIM_ENTITIES    0x0002
+#define MD_RENDER_FLAG_SKIP_UTF8_BOM        0x0004
 
 
 /* Render Markdown into HTML.
diff --git a/test/pathological_tests.py b/test/pathological_tests.py
index 6dd2b40..ad9252e 100755
--- a/test/pathological_tests.py
+++ b/test/pathological_tests.py
@@ -21,6 +21,12 @@ cmark = CMark(prog=args.program, library_dir=args.library_dir)
 # list of pairs consisting of input and a regex that must match the output.
 pathological = {
     # note - some pythons have limit of 65535 for {num-matches} in re.
+    "U+0000":
+                 ("abc\u0000de\u0000",
+                  re.compile("abc\ufffd?de\ufffd?")),
+    "U+FEFF (Unicode BOM)":
+                 ("\ufefffoo",
+                  re.compile("<p>foo</p>")),
     "nested strong emph":
                 (("*a **a " * 65000) + "b" + (" a** a*" * 65000),
                  re.compile("(<em>a <strong>a ){65000}b( a</strong> a</em>){65000}")),
@@ -57,9 +63,6 @@ pathological = {
     "nested block quotes":
                  ((("> " * 50000) + "a"),
                   re.compile("(<blockquote>\r?\n){50000}")),
-    "U+0000 in input":
-                 ("abc\u0000de\u0000",
-                  re.compile("abc\ufffd?de\ufffd?")),
     "backticks":
                  ("".join(map(lambda x: ("e" + "`" * x), range(1,1000))),
                   re.compile("^<p>[e`]*</p>\r?\n$")),