Commit 6c90b37f1280a1061a82325fe02efefabfa8535b

Martin Mitas 2016-12-05T21:17:45

More fixes and enhancements to Windows Unicode support (issue #3). * Rename MD4C_USE_WIN_UNICODE to MD4C_USE_UTF16. * Update and improve related documentation in README.md.

diff --git a/README.md b/README.md
index 212d8a9..281fc47 100644
--- a/README.md
+++ b/README.md
@@ -111,6 +111,10 @@ inspection Unicode is actually used on very few occasions:
   * Unicode case folding. This is used to perform case-independent matching
     of link labels when resolving reference links.
 
+  * Translating HTML entities and numeric character references (e.g. `&amp;`,
+    `&#35;`). However MD4C leaves the translation to the renderer/application,
+    as the renderer is the one which really knows the output encoding.
+
 MD4C uses this property of the standard and its implementation is, to a large
 degree, encoding-agnostic. Most of the code only assumes that the encoding of
 your choice is compatible with ASCII, i.e. that the codepoints below 128 have
@@ -119,21 +123,20 @@ the same numeric values as ASCII.
 All input MD4C does not understand is seen as a text and sent to the callbacks
 unchanged.
 
-The behavior of MD4C in the isolated situations where the encoding really
-matters is determined by preprocessor macros:
+The behavior of MD4C in the isolated listed situations where the encoding
+really matters is determined by preprocessor macros:
 
  * If preprocessor macro `MD4C_USE_UTF8` is defined, MD4C assumes UTF-8
    in the specific situations.
 
- * On Windows, if preprocessor macro `MD4C_USE_WIN_UNICODE` is defined, MD4C
-   assumes UTF-16 and uses `WCHAR` instead of `char`. This allows usage of
-   MD4C directly within Unicode applications on Windows, without any text
-   conversions.
+ * On Windows, if preprocessor macro `MD4C_USE_UTF16` is defined, MD4C assumes
+   UTF-16 and uses `WCHAR` instead of `char`. (UTF-16 is what Windows
+   developers usually call just "Unicode" and what Win32API works with.)
 
- * When none of the macros is defined, ASCII-only approach is used even in
-   the listed situations. This effectively means that non-ASCII whitespace or
-   punctuation characters won't be recognized as such and that case-folding is
-   performed only on ASCII letters (i.e. `[a-zA-Z]`).
+ * By default (when none of the macros is defined), ASCII-only mode is used
+   even in the situations listed above. This effectively means that non-ASCII
+   whitespace or punctuation characters won't be recognized as such and that
+   case-folding is performed only on ASCII letters (i.e. `[a-zA-Z]`).
 
 (Adding support for yet another encodings should be relatively simple due
 the isolation of the respective code.)
diff --git a/md4c/md4c.c b/md4c/md4c.c
index 9631729..d5e9137 100644
--- a/md4c/md4c.c
+++ b/md4c/md4c.c
@@ -45,7 +45,7 @@
 #ifdef _T
     #undef _T
 #endif
-#if defined MD4C_USE_WIN_UNICODE
+#if defined MD4C_USE_UTF16
     #define _T(x)           L##x
 #else
     #define _T(x)           x
@@ -432,7 +432,7 @@ struct MD_UNICODE_FOLD_INFO_tag {
 };
 
 
-#if defined MD4C_USE_WIN_UNICODE || defined MD4C_USE_UTF8
+#if defined MD4C_USE_UTF16 || defined MD4C_USE_UTF8
     static int
     md_is_unicode_whitespace__(int codepoint)
     {
@@ -441,7 +441,7 @@ struct MD_UNICODE_FOLD_INFO_tag {
             return ISWHITESPACE_(codepoint);
 
         /* Check for Unicode codepoints in Zs class above 127. */
-        if(codepoint == 0x00A0 || codepoint == 0x1680)
+        if(codepoint == 0x00a0 || codepoint == 0x1680)
             return TRUE;
         if(0x2000 <= codepoint && codepoint <= 0x200a)
             return TRUE;
@@ -685,13 +685,10 @@ struct MD_UNICODE_FOLD_INFO_tag {
 #endif
 
 
-#if defined MD4C_USE_WIN_UNICODE
-    /* The encoding known called on Windows simply as "Unicode" is actually
-     * UTF-16. */
-
-    #define IS_UTF16_SURROGATE_HI(word)     (((WORD)(word) & 0xfc) == 0xd800)
-    #define IS_UTF16_SURROGATE_LO(word)     (((WORD)(word) & 0xfc) == 0xdc00)
-    #define UTF16_DECODE_SURROGATE(hi, lo)  ((((unsigned)(hi) & 0x3ff) << 10) | (((unsigned)(lo) & 0x3ff) << 0))
+#if defined MD4C_USE_UTF16
+    #define IS_UTF16_SURROGATE_HI(word)     (((WORD)(word) & 0xfc00) == 0xd800)
+    #define IS_UTF16_SURROGATE_LO(word)     (((WORD)(word) & 0xfc00) == 0xdc00)
+    #define UTF16_DECODE_SURROGATE(hi, lo)  (0x10000 + ((((unsigned)(hi) & 0x3ff) << 10) | (((unsigned)(lo) & 0x3ff) << 0)))
 
     static int
     md_decode_utf16le__(const CHAR* str, SZ str_size, SZ* p_size)
diff --git a/md4c/md4c.h b/md4c/md4c.h
index fc66d0e..3048e17 100644
--- a/md4c/md4c.h
+++ b/md4c/md4c.h
@@ -31,14 +31,16 @@
 #endif
 
 
-/* Magic to support UTF16-LE (i.e. what is called Unicode among Windows
- * developers) input/output on Windows.
- */
-#if defined MD4C_USE_WIN_UNICODE
-    #include <windows.h>
-    typedef WCHAR   MD_CHAR;
+/* Magic to support UTF16. */
+#if defined MD4C_USE_UTF16
+    #ifdef _WIN32
+        #include <wchar.h>
+        typedef WCHAR       MD_CHAR;
+    #else
        #error MD4C_USE_UTF16 is only supported on Windows.
+    #endif
 #else
-    typedef char    MD_CHAR;
+    typedef char            MD_CHAR;
 #endif
 
 typedef unsigned MD_SIZE;