md_collect_marks: Optimize the function. Use a character map for a fast path and minimize the number of branches for specially handled characters. When profiling md2html on larger documents with output redirected to /dev/null to mitigate I/O, this function was quite a bottleneck. It consumed about 33% of CPU cycles on a longer document input; with this patch applied, that drops to 12%.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178
diff --git a/md4c/md4c.c b/md4c/md4c.c
index 2c14d63..60c4bed 100644
--- a/md4c/md4c.c
+++ b/md4c/md4c.c
@@ -103,6 +103,8 @@ struct MD_CTX_tag {
unsigned n_marks;
unsigned alloc_marks;
+ char mark_char_map[128];
+
/* For resolving of inline spans. */
MD_MARKCHAIN mark_chains[4];
#define BACKTICK_OPENERS ctx->mark_chains[0]
@@ -1045,6 +1047,37 @@ md_split_mark(MD_CTX* ctx, int mark_index, SZ n)
return mark_index + 1;
}
+static void
+md_build_mark_char_map(MD_CTX* ctx)
+{
+ memset(ctx->mark_char_map, 0, sizeof(ctx->mark_char_map));
+
+ ctx->mark_char_map['\\'] = 1;
+ ctx->mark_char_map['*'] = 1;
+ ctx->mark_char_map['_'] = 1;
+ ctx->mark_char_map['`'] = 1;
+ ctx->mark_char_map['&'] = 1;
+ ctx->mark_char_map[';'] = 1;
+ ctx->mark_char_map['<'] = 1;
+ ctx->mark_char_map['>'] = 1;
+ ctx->mark_char_map['\0'] = 1;
+
+ if(ctx->r.flags & MD_FLAG_PERMISSIVEURLAUTOLINKS)
+ ctx->mark_char_map[':'] = 1;
+
+ if(ctx->r.flags & MD_FLAG_PERMISSIVEEMAILAUTOLINKS)
+ ctx->mark_char_map['@'] = 1;
+
+ if(ctx->r.flags & MD_FLAG_COLLAPSEWHITESPACE) {
+ int i;
+
+ for(i = 0; i < sizeof(ctx->mark_char_map); i++) {
+ if(ISWHITESPACE_(i))
+ ctx->mark_char_map[i] = 1;
+ }
+ }
+}
+
static int
md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
{
@@ -1059,6 +1092,13 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
while(off < line_end) {
CHAR ch = CH(off);
+
+ /* Optimization: Fast path. */
+ if(ch >= sizeof(ctx->mark_char_map) || !ctx->mark_char_map[(int) ch]) {
+ off++;
+ continue;
+ }
+
/* A backslash escape.
* It can go beyond line->end as it may involve escaped new
* line to form a hard break. */
@@ -1076,20 +1116,6 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
continue;
}
- /* Turn non-trivial whitespace into single space. */
- if((ctx->r.flags & MD_FLAG_COLLAPSEWHITESPACE) && ISWHITESPACE_(ch)) {
- OFF tmp = off+1;
-
- while(tmp < line_end && ISWHITESPACE(tmp))
- tmp++;
-
- if(tmp - off > 1 || ch != _T(' ')) {
- PUSH_MARK(ch, off, tmp, MD_MARK_RESOLVED);
- off = tmp;
- continue;
- }
- }
-
/* A potential (string) emphasis start/end. */
if(ch == _T('*') || ch == _T('_')) {
OFF tmp = off+1;
@@ -1143,6 +1169,7 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
}
off = tmp;
+ continue;
}
/* A potential code span start/end. */
@@ -1174,24 +1201,24 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
/* A potential entity end. */
if(ch == _T(';')) {
/* We surely cannot be entity unless the previous mark is '&'. */
- if(ctx->n_marks > 0 && ctx->marks[ctx->n_marks-1].ch == _T('&')) {
+ if(ctx->n_marks > 0 && ctx->marks[ctx->n_marks-1].ch == _T('&'))
PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_CLOSER);
- off++;
- continue;
- }
+
+ off++;
+ continue;
}
/* A potential autolink or raw HTML start/end. */
if(ch == _T('<') || ch == _T('>')) {
- if(!(ctx->r.flags & MD_FLAG_NOHTMLSPANS)) {
+ if(!(ctx->r.flags & MD_FLAG_NOHTMLSPANS))
PUSH_MARK(ch, off, off+1, (ch == _T('<') ? MD_MARK_POTENTIAL_OPENER : MD_MARK_POTENTIAL_CLOSER));
- off++;
- continue;
- }
+
+ off++;
+ continue;
}
/* A potential permissive URL autolink. */
- if((ctx->r.flags & MD_FLAG_PERMISSIVEURLAUTOLINKS) && ch == _T(':')) {
+ if(ch == _T(':')) {
static struct {
const CHAR* scheme;
SZ scheme_size;
@@ -1223,19 +1250,37 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
continue;
}
}
+
+ off++;
+ continue;
}
/* A potential permissive e-mail autolink. */
- if((ctx->r.flags & MD_FLAG_PERMISSIVEEMAILAUTOLINKS) && ch == _T('@')) {
+ if(ch == _T('@')) {
if(line->beg + 1 <= off && ISALNUM(off-1) &&
off + 3 < line->end && ISALNUM(off+1))
{
PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER);
/* Push a dummy as a reserve for a closer. */
PUSH_MARK('D', off, off, 0);
- off++;
- continue;
}
+
+ off++;
+ continue;
+ }
+
+ /* Turn non-trivial whitespace into single space. */
+ if(ISWHITESPACE_(ch)) {
+ OFF tmp = off+1;
+
+ while(tmp < line_end && ISWHITESPACE(tmp))
+ tmp++;
+
+ if(tmp - off > 1 || ch != _T(' '))
+ PUSH_MARK(ch, off, tmp, MD_MARK_RESOLVED);
+
+ off = tmp;
+ continue;
}
/* NULL character. */
@@ -2589,6 +2634,8 @@ md_process_doc(MD_CTX *ctx)
OFF off = 0;
int ret = 0;
+ md_build_mark_char_map(ctx);
+
MD_ENTER_BLOCK(MD_BLOCK_DOC, NULL);
while(off < ctx->size) {