Fix UTF-16 surrogate decoding (with -DMD4C_USE_WIN_UNICODE). See https://github.com/mity/md4c/pull/1#issuecomment-264842360
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
diff --git a/README.md b/README.md
index 93f8f4f..c34a9e7 100644
--- a/README.md
+++ b/README.md
@@ -44,8 +44,8 @@ MD4C is C Markdown parser with the following features:
be fairly simple to make it run also on most other systems.
* **Encoding:** MD4C can be compiled to recognize ASCII-only control characters,
- UTF-8 and, on Windows, also UTF-16 little endian, i.e. what is on Windows
- commonly called just "Unicode". See more details below.
+ UTF-8 and, on Windows, also UTF-16, i.e. what is on Windows commonly called
+ just "Unicode". See more details below.
* **Permissive license:** MD4C is available under the MIT license.
@@ -126,9 +126,9 @@ matters is determined by preprocessor macros:
in the specific situations.
* On Windows, if preprocessor macro `MD4C_USE_WIN_UNICODE` is defined, MD4C
- assumes little-endian UTF-16 and uses `WCHAR` instead of `char`. This allows
- usage of MD4C directly within Unicode applications on Windows, without any
- text conversion.
+ assumes UTF-16 and uses `WCHAR` instead of `char`. This allows usage of
+ MD4C directly within Unicode applications on Windows, without any text
+ conversions.
* When none of the macros is defined, ASCII-only approach is used even in
the listed situations. This effectively means that non-ASCII whitespace or
diff --git a/md4c/md4c.c b/md4c/md4c.c
index 5660525..db220a6 100644
--- a/md4c/md4c.c
+++ b/md4c/md4c.c
@@ -686,21 +686,21 @@ struct MD_UNICODE_FOLD_INFO_tag {
#if defined MD4C_USE_WIN_UNICODE
- #define IS_UTF16_SURROGATE_HI(word) (((WORD)(word) & 0xfc) == 0xd800)
- #define IS_UTF16_SURROGATE_LO(word) (((WORD)(word) & 0xfc) == 0xdc00)
- #define UTF16_COMPUTE_SURROGATE(hi, lo) ((((unsigned)(hi) & 0x3ff) << 10) | (((unsigned)(lo) & 0x3ff) << 0))
+ /* The encoding known on Windows simply as "Unicode" is actually
+ * UTF-16. */
+
+ #define IS_UTF16_SURROGATE_HI(word) (((WORD)(word) & 0xfc) == 0xd800)
+ #define IS_UTF16_SURROGATE_LO(word) (((WORD)(word) & 0xfc) == 0xdc00)
+ #define UTF16_DECODE_SURROGATE(hi, lo) ((((unsigned)(hi) & 0x3ff) << 10) | (((unsigned)(lo) & 0x3ff) << 0))
static int
md_decode_utf16le__(const CHAR* str, SZ str_size, SZ* p_size)
{
- /* The encoding known called on Windows simply as "Unicode" is actually
- * little-endian UTF-16, i.e. the low surrogate precedes the high
- * surrogate. */
- if(IS_UTF16_SURROGATE_LO(str[0])) {
- if(1 < str_size && IS_UTF16_SURROGATE_HI(str[1])) {
+ if(IS_UTF16_SURROGATE_HI(str[0])) {
+ if(1 < str_size && IS_UTF16_SURROGATE_LO(str[1])) {
if(p_size != NULL)
*p_size = 2;
- return UTF16_COMPUTE_SURROGATE(str[1], str[0]);
+ return UTF16_DECODE_SURROGATE(str[0], str[1]);
}
}
@@ -712,8 +712,8 @@ struct MD_UNICODE_FOLD_INFO_tag {
static int
md_decode_utf16le_before__(MD_CTX* ctx, OFF off)
{
- if(off > 2 && IS_UTF16_SURROGATE_LO(CH(off-2)) && IS_UTF16_SURROGATE_HI(CH(off-1)))
- return UTF16_COMPUTE_SURROGATE(CH(off-1), CH(off-2));
+ if(off > 2 && IS_UTF16_SURROGATE_HI(CH(off-2)) && IS_UTF16_SURROGATE_LO(CH(off-1)))
+ return UTF16_DECODE_SURROGATE(CH(off-2), CH(off-1));
return CH(off);
}