Commit 7d20152c39dbf094a774bbf34a808bf689dd2b6a

Martin Mitas 2016-12-05T13:45:57

Fix UTF-16 surrogate decoding (with -DMD4C_USE_WIN_UNICODE). See https://github.com/mity/md4c/pull/1#issuecomment-264842360

diff --git a/README.md b/README.md
index 93f8f4f..c34a9e7 100644
--- a/README.md
+++ b/README.md
@@ -44,8 +44,8 @@ MD4C is C Markdown parser with the following features:
   be fairly simple to make it run also on most other systems.
 
 * **Encoding:** MD4C can be compiled to recognize ASCII-only control characters,
-  UTF-8 and, on Windows, also UTF-16 little endian, i.e. what is on Windows
-  commonly called just "Unicode". See more details below.
+  UTF-8 and, on Windows, also UTF-16, i.e. what is on Windows commonly called
+  just "Unicode". See more details below.
 
 * **Permissive license:** MD4C is available under the MIT license.
 
@@ -126,9 +126,9 @@ matters is determined by preprocessor macros:
    in the specific situations.
 
  * On Windows, if preprocessor macro `MD4C_USE_WIN_UNICODE` is defined, MD4C
-   assumes little-endian UTF-16 and uses `WCHAR` instead of `char`. This allows
-   usage of MD4C directly within Unicode applications on Windows, without any
-   text conversion.
+   assumes UTF-16 and uses `WCHAR` instead of `char`. This allows usage of
+   MD4C directly within Unicode applications on Windows, without any text
+   conversions.
 
  * When none of the macros is defined, ASCII-only approach is used even in
    the listed situations. This effectively means that non-ASCII whitespace or
diff --git a/md4c/md4c.c b/md4c/md4c.c
index 5660525..db220a6 100644
--- a/md4c/md4c.c
+++ b/md4c/md4c.c
@@ -686,21 +686,21 @@ struct MD_UNICODE_FOLD_INFO_tag {
 
 
 #if defined MD4C_USE_WIN_UNICODE
-    #define IS_UTF16_SURROGATE_HI(word)         (((WORD)(word) & 0xfc) == 0xd800)
-    #define IS_UTF16_SURROGATE_LO(word)         (((WORD)(word) & 0xfc) == 0xdc00)
-    #define UTF16_COMPUTE_SURROGATE(hi, lo)     ((((unsigned)(hi) & 0x3ff) << 10) | (((unsigned)(lo) & 0x3ff) << 0))
+    /* The encoding commonly called simply "Unicode" on Windows is actually
+     * UTF-16. */
+
+    #define IS_UTF16_SURROGATE_HI(word)     (((WORD)(word) & 0xfc) == 0xd800)
+    #define IS_UTF16_SURROGATE_LO(word)     (((WORD)(word) & 0xfc) == 0xdc00)
+    #define UTF16_DECODE_SURROGATE(hi, lo)  ((((unsigned)(hi) & 0x3ff) << 10) | (((unsigned)(lo) & 0x3ff) << 0))
 
     static int
     md_decode_utf16le__(const CHAR* str, SZ str_size, SZ* p_size)
     {
-        /* The encoding known called on Windows simply as "Unicode" is actually
-         * little-endian UTF-16, i.e. the low surrogate precedes the high
-         * surrogate. */
-        if(IS_UTF16_SURROGATE_LO(str[0])) {
-            if(1 < str_size && IS_UTF16_SURROGATE_HI(str[1])) {
+        if(IS_UTF16_SURROGATE_HI(str[0])) {
+            if(1 < str_size && IS_UTF16_SURROGATE_LO(str[1])) {
                 if(p_size != NULL)
                     *p_size = 2;
-                return UTF16_COMPUTE_SURROGATE(str[1], str[0]);
+                return UTF16_DECODE_SURROGATE(str[0], str[1]);
             }
         }
 
@@ -712,8 +712,8 @@ struct MD_UNICODE_FOLD_INFO_tag {
     static int
     md_decode_utf16le_before__(MD_CTX* ctx, OFF off)
     {
-        if(off > 2 && IS_UTF16_SURROGATE_LO(CH(off-2)) && IS_UTF16_SURROGATE_HI(CH(off-1)))
-            return UTF16_COMPUTE_SURROGATE(CH(off-1), CH(off-2));
+        if(off > 2 && IS_UTF16_SURROGATE_HI(CH(off-2)) && IS_UTF16_SURROGATE_LO(CH(off-1)))
+            return UTF16_DECODE_SURROGATE(CH(off-2), CH(off-1));
 
         return CH(off);
     }