Better Unicode support.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169
diff --git a/md4c/CMakeLists.txt b/md4c/CMakeLists.txt
index b18629e..6267d3e 100644
--- a/md4c/CMakeLists.txt
+++ b/md4c/CMakeLists.txt
@@ -1,4 +1,6 @@
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -DDEBUG")
+add_definitions(-DMD4C_USE_UNICODE)
+
add_library(md4c STATIC md4c.c md4c.h)
diff --git a/md4c/md4c.c b/md4c/md4c.c
index 7511478..c18f4d6 100644
--- a/md4c/md4c.c
+++ b/md4c/md4c.c
@@ -47,7 +47,7 @@
#ifdef _T
#undef _T
#endif
-#if defined _WIN32 && defined MD_WIN_UNICODE
+#if defined MD4C_USE_WIN_UNICODE
#define _T(x) L##x
#else
#define _T(x) x
@@ -65,9 +65,9 @@
************************/
/* These are omnipresent so lets save some typing. */
-typedef MD_CHAR CHAR;
-typedef MD_SIZE SZ;
-typedef MD_OFFSET OFF;
+#define CHAR MD_CHAR
+#define SZ MD_SIZE
+#define OFF MD_OFFSET
typedef struct MD_MARK_tag MD_MARK;
typedef struct MD_BLOCK_tag MD_BLOCK;
@@ -247,6 +247,94 @@ struct MD_VERBATIMLINE_tag {
#define ISANYOF(off, palette) ISANYOF_(CH(off), (palette))
+#if defined MD4C_USE_WIN_UNICODE
+ #include <ctype.h>
+
+ #define ISUNICODEWHITESPACE(off) iswspace(CH(off))
+ #define ISUNICODEPUNCT(off) iswpunct(CH(off))
+ #define ISUNICODEWHITESPACEBEFORE(off) iswspace(CH((off)-1))
+ #define ISUNICODEPUNCTBEFORE(off) iswpunct(CH((off)-1))
+#elif defined MD4C_USE_UNICODE
+ #ifdef _WIN32
+ /* Note Win32 supports only Unicode plane 0 but better than nothing. */
+ #include <ctype.h>
+ #else
+ #include <wctype.h>
+
+ #ifndef __STDC_ISO_10646__
+ #error "MD4C relies on wchar_t to support Unicode properly."
+ #endif
+ #endif
+
+ #define IS_UTF8_LEAD1(byte) ((unsigned char)(byte) <= 0x7f) /* 0xxxxxxx: byte is a complete 1-byte sequence */
+ #define IS_UTF8_LEAD2(byte) (((unsigned char)(byte) & 0xe0) == 0xc0) /* 110xxxxx: lead byte of a 2-byte sequence */
+ #define IS_UTF8_LEAD3(byte) (((unsigned char)(byte) & 0xf0) == 0xe0) /* 1110xxxx: lead byte of a 3-byte sequence */
+ #define IS_UTF8_LEAD4(byte) (((unsigned char)(byte) & 0xf8) == 0xf0) /* 11110xxx: lead byte of a 4-byte sequence */
+ #define IS_UTF8_TAIL(byte) (((unsigned char)(byte) & 0xc0) == 0x80) /* 10xxxxxx: continuation (non-lead) byte */
+
+ /* Decode the UTF-8 sequence starting at offset off into a codepoint.
+  * The result is meant only for character classification (iswspace() etc.).
+  *
+  * Returns U+FFFD (the Unicode replacement character) whenever the bytes
+  * at off do not form a well-formed sequence: a stray continuation byte or
+  * an invalid lead byte, a sequence cut short by the end of the input, a
+  * lead byte not followed by the required continuation bytes, or a value
+  * above U+10FFFF. Overlong encodings and surrogates are not rejected.
+  */
+ static int
+ md_decode_utf8(MD_CTX* ctx, OFF off)
+ {
+     int codepoint = 0xfffd;
+
+     if(IS_UTF8_LEAD1(CH(off))) {
+         codepoint = CH(off);
+     } else if(IS_UTF8_LEAD2(CH(off))) {
+         /* Every tail byte must really be a continuation byte; otherwise
+          * e.g. the malformed pair C2 61 would decode to U+00A1 and be
+          * misclassified as punctuation. */
+         if(off+1 < ctx->size && IS_UTF8_TAIL(CH(off+1)))
+             codepoint = (((unsigned int)CH(off) & 0x1f) << 6) |
+                         (((unsigned int)CH(off+1) & 0x3f) << 0);
+     } else if(IS_UTF8_LEAD3(CH(off))) {
+         if(off+2 < ctx->size && IS_UTF8_TAIL(CH(off+1)) &&
+            IS_UTF8_TAIL(CH(off+2)))
+             codepoint = (((unsigned int)CH(off) & 0x0f) << 12) |
+                         (((unsigned int)CH(off+1) & 0x3f) << 6) |
+                         (((unsigned int)CH(off+2) & 0x3f) << 0);
+     } else if(IS_UTF8_LEAD4(CH(off))) {
+         if(off+3 < ctx->size && IS_UTF8_TAIL(CH(off+1)) &&
+            IS_UTF8_TAIL(CH(off+2)) && IS_UTF8_TAIL(CH(off+3)))
+             codepoint = (((unsigned int)CH(off) & 0x07) << 18) |
+                         (((unsigned int)CH(off+1) & 0x3f) << 12) |
+                         (((unsigned int)CH(off+2) & 0x3f) << 6) |
+                         (((unsigned int)CH(off+3) & 0x3f) << 0);
+     }
+
+     /* Lead bytes F5..F7 pass IS_UTF8_LEAD4() but encode values beyond the
+      * Unicode range; treat them as invalid too. */
+     if(codepoint > 0x10ffff)
+         codepoint = 0xfffd;
+
+#ifdef _WIN32
+     /* On Windows, iswspace() et al. get garbage for codepoints above
+      * the Unicode plane 0. */
+     if(codepoint > 0xffff)
+         codepoint = 0xfffd;
+#endif
+
+     return codepoint;
+ }
+
+ /* Decode the codepoint whose UTF-8 sequence ends immediately before
+  * offset off, i.e. the character preceding off.
+  *
+  * Looks back one to four bytes for a lead byte whose sequence length
+  * matches the distance to off, and requires all bytes in between to be
+  * continuation bytes. Returns U+FFFD (the Unicode replacement character)
+  * when off is 0 or when no such well-formed sequence ends at off.
+  */
+ static int
+ md_decode_utf8_before(MD_CTX* ctx, OFF off)
+ {
+     if(off > 0 && IS_UTF8_LEAD1(CH(off-1)))
+         return CH(off-1);
+
+     /* Checking the tail bytes matters for malformed input: without it,
+      * e.g. the bytes C2 E0 right before off would be decoded as U+00A0
+      * and the preceding "character" misclassified as whitespace. */
+     if(off > 1 && IS_UTF8_LEAD2(CH(off-2)) &&
+        IS_UTF8_TAIL(CH(off-1)))
+         return md_decode_utf8(ctx, off-2);
+     if(off > 2 && IS_UTF8_LEAD3(CH(off-3)) &&
+        IS_UTF8_TAIL(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
+         return md_decode_utf8(ctx, off-3);
+     if(off > 3 && IS_UTF8_LEAD4(CH(off-4)) &&
+        IS_UTF8_TAIL(CH(off-3)) && IS_UTF8_TAIL(CH(off-2)) &&
+        IS_UTF8_TAIL(CH(off-1)))
+         return md_decode_utf8(ctx, off-4);
+
+     return 0xfffd;
+ }
+
+ #define ISUNICODEWHITESPACE(off) iswspace(md_decode_utf8(ctx, off)) /* codepoint starting at off is whitespace; needs `ctx` in scope */
+ #define ISUNICODEPUNCT(off) iswpunct(md_decode_utf8(ctx, off)) /* codepoint starting at off is punctuation; needs `ctx` in scope */
+ #define ISUNICODEWHITESPACEBEFORE(off) iswspace(md_decode_utf8_before(ctx, off)) /* codepoint ending just before off is whitespace; needs `ctx` in scope */
+ #define ISUNICODEPUNCTBEFORE(off) iswpunct(md_decode_utf8_before(ctx, off)) /* codepoint ending just before off is punctuation; needs `ctx` in scope */
+#else
+ #define ISUNICODEWHITESPACE(off) ISWHITESPACE(off)
+ #define ISUNICODEPUNCT(off) ISPUNCT(off)
+ #define ISUNICODEWHITESPACEBEFORE(off) ISWHITESPACE((off)-1)
+ #define ISUNICODEPUNCTBEFORE(off) ISPUNCT((off)-1)
+#endif
+
+
static inline const CHAR*
md_strchr(const CHAR* str, CHAR ch)
{
@@ -1103,16 +1191,16 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
while(tmp < line_end && CH(tmp) == ch)
tmp++;
- if(off == line->beg || ISWHITESPACE(off-1))
+ if(off == line->beg || ISUNICODEWHITESPACEBEFORE(off))
left_level = 0;
- else if(ISPUNCT(off-1))
+ else if(ISUNICODEPUNCTBEFORE(off))
left_level = 1;
else
left_level = 2;
- if(tmp == line_end || ISWHITESPACE(tmp))
+ if(tmp == line_end || ISUNICODEWHITESPACE(tmp))
right_level = 0;
- else if(ISPUNCT(tmp))
+ else if(ISUNICODEPUNCT(tmp))
right_level = 1;
else
right_level = 2;
diff --git a/md4c/md4c.h b/md4c/md4c.h
index ec8ad3f..03542cb 100644
--- a/md4c/md4c.h
+++ b/md4c/md4c.h
@@ -41,10 +41,8 @@
* On Windows, when UNICODE is defined, we by default switch to WCHAR.
* This behavior may be disabled by predefining MD4C_DISABLE_WIN_UNICODE.
*/
-#if defined _WIN32 && defined UNICODE && !defined MD4C_DISABLE_WIN_UNICODE
+#if defined MD4C_USE_WIN_UNICODE
#include <windows.h>
-
- #define MD4C_USE_WIN_UNICODE
typedef WCHAR MD_CHAR;
#else
typedef char MD_CHAR;