md2html: Skip UTF-8 BOM, if present in the input.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e828bdd..c875352 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,18 @@
# MD4C Change Log
+## Next Version (Work in Progress)
+
+New features:
+
+ * With `MD_RENDER_FLAG_SKIP_UTF8_BOM`, the HTML renderer now skips the UTF-8
+ byte order mark (BOM), if the input begins with one, before passing the
+ input to the Markdown parser.
+
+ The `md2html` utility enables the flag automatically (unless it is
+ custom-built with `-DMD4C_USE_ASCII`).
+
+
## Version 0.4.3
New features:
diff --git a/md2html/md2html.c b/md2html/md2html.c
index 8f887fb..01e947d 100644
--- a/md2html/md2html.c
+++ b/md2html/md2html.c
@@ -35,7 +35,11 @@
/* Global options. */
static unsigned parser_flags = 0;
-static unsigned renderer_flags = MD_RENDER_FLAG_DEBUG;
+#ifndef MD4C_USE_ASCII
+ static unsigned renderer_flags = MD_RENDER_FLAG_DEBUG | MD_RENDER_FLAG_SKIP_UTF8_BOM;
+#else
+ static unsigned renderer_flags = MD_RENDER_FLAG_DEBUG;
+#endif
static int want_fullhtml = 0;
static int want_stat = 0;
diff --git a/md2html/render_html.c b/md2html/render_html.c
index 896b37f..42b6fff 100644
--- a/md2html/render_html.c
+++ b/md2html/render_html.c
@@ -556,6 +556,15 @@ md_render_html(const MD_CHAR* input, MD_SIZE input_size,
render.escape_map[i] |= NEED_URL_ESC_FLAG;
}
+ /* Consider skipping UTF-8 byte order mark (BOM). */
+ if(renderer_flags & MD_RENDER_FLAG_SKIP_UTF8_BOM && sizeof(MD_CHAR) == 1) {
+ static const MD_CHAR bom[3] = { 0xef, 0xbb, 0xbf };
+ if(input_size >= sizeof(bom) && memcmp(input, bom, sizeof(bom)) == 0) {
+ input += sizeof(bom);
+ input_size -= sizeof(bom);
+ }
+ }
+
return md_parse(input, input_size, &parser, (void*) &render);
}
diff --git a/md2html/render_html.h b/md2html/render_html.h
index 968dc8e..2e9a77b 100644
--- a/md2html/render_html.h
+++ b/md2html/render_html.h
@@ -36,6 +36,7 @@
/* If set, debug output from md_parse() is sent to stderr. */
#define MD_RENDER_FLAG_DEBUG 0x0001
#define MD_RENDER_FLAG_VERBATIM_ENTITIES 0x0002
+#define MD_RENDER_FLAG_SKIP_UTF8_BOM 0x0004
/* Render Markdown into HTML.
diff --git a/test/pathological_tests.py b/test/pathological_tests.py
index 6dd2b40..ad9252e 100755
--- a/test/pathological_tests.py
+++ b/test/pathological_tests.py
@@ -21,6 +21,12 @@ cmark = CMark(prog=args.program, library_dir=args.library_dir)
# list of pairs consisting of input and a regex that must match the output.
pathological = {
# note - some pythons have limit of 65535 for {num-matches} in re.
+ "U+0000":
+ ("abc\u0000de\u0000",
+ re.compile("abc\ufffd?de\ufffd?")),
+ "U+FEFF (Unicode BOM)":
+ ("\ufefffoo",
+ re.compile("<p>foo</p>")),
"nested strong emph":
(("*a **a " * 65000) + "b" + (" a** a*" * 65000),
re.compile("(<em>a <strong>a ){65000}b( a</strong> a</em>){65000}")),
@@ -57,9 +63,6 @@ pathological = {
"nested block quotes":
((("> " * 50000) + "a"),
re.compile("(<blockquote>\r?\n){50000}")),
- "U+0000 in input":
- ("abc\u0000de\u0000",
- re.compile("abc\ufffd?de\ufffd?")),
"backticks":
("".join(map(lambda x: ("e" + "`" * x), range(1,1000))),
re.compile("^<p>[e`]*</p>\r?\n$")),