More fixes and enhancements to Windows Unicode support (issue #3). * Rename MD4C_USE_WIN_UNICODE to MD4C_USE_UTF16. * Update and improve related documentation in README.md.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123
diff --git a/README.md b/README.md
index 212d8a9..281fc47 100644
--- a/README.md
+++ b/README.md
@@ -111,6 +111,10 @@ inspection Unicode is actually used on very few occasions:
* Unicode case folding. This is used to perform case-independent matching
of link labels when resolving reference links.
+ * Translating HTML entities and numeric character references (e.g. `&amp;`,
+ `&#35;`). However, MD4C leaves the translation to the renderer/application,
+ as the renderer is the one expected to know the output encoding.
+
MD4C uses this property of the standard and its implementation is, to a large
degree, encoding-agnostic. Most of the code only assumes that the encoding of
your choice is compatible with ASCII, i.e. that the codepoints below 128 have
@@ -119,21 +123,20 @@ the same numeric values as ASCII.
All input MD4C does not understand is seen as a text and sent to the callbacks
unchanged.
-The behavior of MD4C in the isolated situations where the encoding really
-matters is determined by preprocessor macros:
+The behavior of MD4C in the isolated situations listed above, where the
+encoding really matters, is determined by preprocessor macros:
* If preprocessor macro `MD4C_USE_UTF8` is defined, MD4C assumes UTF-8
in the specific situations.
- * On Windows, if preprocessor macro `MD4C_USE_WIN_UNICODE` is defined, MD4C
- assumes UTF-16 and uses `WCHAR` instead of `char`. This allows usage of
- MD4C directly within Unicode applications on Windows, without any text
- conversions.
+ * On Windows, if preprocessor macro `MD4C_USE_UTF16` is defined, MD4C assumes
+ UTF-16 and uses `WCHAR` instead of `char`. (UTF-16 is what Windows
+ developers usually call just "Unicode" and what Win32API works with.)
- * When none of the macros is defined, ASCII-only approach is used even in
- the listed situations. This effectively means that non-ASCII whitespace or
- punctuation characters won't be recognized as such and that case-folding is
- performed only on ASCII letters (i.e. `[a-zA-Z]`).
+ * By default (when none of the macros is defined), ASCII-only mode is used
+ even in the situations listed above. This effectively means that non-ASCII
+ whitespace or punctuation characters won't be recognized as such and that
+ case-folding is performed only on ASCII letters (i.e. `[a-zA-Z]`).
(Adding support for yet another encoding should be relatively simple due to
the isolation of the respective code.)
diff --git a/md4c/md4c.c b/md4c/md4c.c
index 9631729..d5e9137 100644
--- a/md4c/md4c.c
+++ b/md4c/md4c.c
@@ -45,7 +45,7 @@
#ifdef _T
#undef _T
#endif
-#if defined MD4C_USE_WIN_UNICODE
+#if defined MD4C_USE_UTF16
#define _T(x) L##x
#else
#define _T(x) x
@@ -432,7 +432,7 @@ struct MD_UNICODE_FOLD_INFO_tag {
};
-#if defined MD4C_USE_WIN_UNICODE || defined MD4C_USE_UTF8
+#if defined MD4C_USE_UTF16 || defined MD4C_USE_UTF8
static int
md_is_unicode_whitespace__(int codepoint)
{
@@ -441,7 +441,7 @@ struct MD_UNICODE_FOLD_INFO_tag {
return ISWHITESPACE_(codepoint);
/* Check for Unicode codepoints in Zs class above 127. */
- if(codepoint == 0x00A0 || codepoint == 0x1680)
+ if(codepoint == 0x00a0 || codepoint == 0x1680)
return TRUE;
if(0x2000 <= codepoint && codepoint <= 0x200a)
return TRUE;
@@ -685,13 +685,10 @@ struct MD_UNICODE_FOLD_INFO_tag {
#endif
-#if defined MD4C_USE_WIN_UNICODE
- /* The encoding known called on Windows simply as "Unicode" is actually
- * UTF-16. */
-
- #define IS_UTF16_SURROGATE_HI(word) (((WORD)(word) & 0xfc) == 0xd800)
- #define IS_UTF16_SURROGATE_LO(word) (((WORD)(word) & 0xfc) == 0xdc00)
- #define UTF16_DECODE_SURROGATE(hi, lo) ((((unsigned)(hi) & 0x3ff) << 10) | (((unsigned)(lo) & 0x3ff) << 0))
+#if defined MD4C_USE_UTF16
+ #define IS_UTF16_SURROGATE_HI(word) (((WORD)(word) & 0xfc00) == 0xd800)
+ #define IS_UTF16_SURROGATE_LO(word) (((WORD)(word) & 0xfc00) == 0xdc00)
+ #define UTF16_DECODE_SURROGATE(hi, lo) (0x10000 + ((((unsigned)(hi) & 0x3ff) << 10) | (((unsigned)(lo) & 0x3ff) << 0)))
static int
md_decode_utf16le__(const CHAR* str, SZ str_size, SZ* p_size)
diff --git a/md4c/md4c.h b/md4c/md4c.h
index fc66d0e..3048e17 100644
--- a/md4c/md4c.h
+++ b/md4c/md4c.h
@@ -31,14 +31,16 @@
#endif
-/* Magic to support UTF16-LE (i.e. what is called Unicode among Windows
- * developers) input/output on Windows.
- */
-#if defined MD4C_USE_WIN_UNICODE
- #include <windows.h>
- typedef WCHAR MD_CHAR;
+/* Magic to support UTF16. */
+#if defined MD4C_USE_UTF16
+ #ifdef _WIN32
+ #include <wchar.h>
+ typedef WCHAR MD_CHAR;
+ #else
+ #error MD4C_USE_UTF16 is only supported on Windows.
+ #endif
#else
- typedef char MD_CHAR;
+ typedef char MD_CHAR;
#endif
typedef unsigned MD_SIZE;