Move raw inline HTML detection from md_analyze_lt_qt() into md_collect_marks(). Fixes #58: For resolving raw inline HTML the function tried closer with all potential openers, because raw HTML can have '<' inside of an attribute. However this caused O(n^2) for input like "<><><><><><><>...". We solved by handling raw HTML in earlier stage, directly in md_collect_marks(), where we can scan linerary forward. Fixes #61: As a side effect, this also fixes the issue that MD_FLAG_NOHTMLSPANS disabled also recognition of CommonMark autolinks.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
diff --git a/md4c/md4c.c b/md4c/md4c.c
index 4c0ad5c..4476e8e 100644
--- a/md4c/md4c.c
+++ b/md4c/md4c.c
@@ -2893,9 +2893,33 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode)
/* A potential autolink or raw HTML start/end. */
if(ch == _T('<') || ch == _T('>')) {
- if(!(ctx->parser.flags & MD_FLAG_NOHTMLSPANS))
- PUSH_MARK(ch, off, off+1, (ch == _T('<') ? MD_MARK_POTENTIAL_OPENER : MD_MARK_POTENTIAL_CLOSER));
-
+ if(!(ctx->parser.flags & MD_FLAG_NOHTMLSPANS) && ch == _T('<')) {
+ int is_html;
+ OFF html_end;
+
+ /* Given the nature of the raw HTML, we have to recognize
+ * it here. Doing so later in md_analyze_lt_gt() could
+ * open can of worms of quadratic complexity. */
+ is_html = md_is_html_any(ctx, lines + i, n_lines - i, off,
+ lines[n_lines-1].end, &html_end);
+ if(is_html) {
+ PUSH_MARK(_T('<'), off, off, MD_MARK_OPENER | MD_MARK_RESOLVED);
+ PUSH_MARK(_T('>'), html_end, html_end, MD_MARK_CLOSER | MD_MARK_RESOLVED);
+ ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
+ ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
+ off = html_end;
+
+ /* Advance the current line accordingly. */
+ while(off > line_end) {
+ i++;
+ line++;
+ line_end = line->end;
+ }
+ continue;
+ }
+ }
+
+ PUSH_MARK(ch, off, off+1, (ch == _T('<') ? MD_MARK_POTENTIAL_OPENER : MD_MARK_POTENTIAL_CLOSER));
off++;
continue;
}
@@ -3203,61 +3227,24 @@ md_analyze_lt_gt(MD_CTX* ctx, int mark_index, const MD_LINE* lines, int n_lines)
return;
}
- /* Otherwise we are potential closer and we try to resolve with since all
- * the chained unresolved openers. */
+ /* Otherwise we are potential closer: Try the most recent opener to resolve
+ * an autolink. */
opener_index = LOWERTHEN_OPENERS.head;
- while(opener_index >= 0) {
+ if(opener_index >= 0) {
MD_MARK* opener = &ctx->marks[opener_index];
- OFF detected_end;
- int is_autolink = 0;
int is_missing_mailto = 0;
- int is_raw_html = 0;
-
- is_autolink = (md_is_autolink(ctx, opener->beg, mark->end, &is_missing_mailto));
- if(is_autolink) {
+ if(md_is_autolink(ctx, opener->beg, mark->end, &is_missing_mailto)) {
if(is_missing_mailto)
opener->ch = _T('@');
- } else {
- /* Identify the line where the opening mark lives. */
- int line_index = 0;
- while(1) {
- if(opener->beg < lines[line_index].end)
- break;
- line_index++;
- }
-
- is_raw_html = (md_is_html_any(ctx, lines + line_index,
- n_lines - line_index, opener->beg, mark->end, &detected_end));
- }
- /* Check whether the range forms a valid raw HTML. */
- if(is_autolink || is_raw_html) {
md_rollback(ctx, opener_index, mark_index, MD_ROLLBACK_ALL);
md_resolve_range(ctx, &LOWERTHEN_OPENERS, opener_index, mark_index);
- if(is_raw_html) {
- /* If this fails, it means we have missed some earlier opportunity
- * to resolve the opener of raw HTML. */
- MD_ASSERT(detected_end == mark->end);
-
- /* Make these marks zero width so the '<' and '>' are part of its
- * contents. */
- opener->end = opener->beg;
- mark->beg = mark->end;
-
- opener->flags &= ~MD_MARK_AUTOLINK;
- mark->flags &= ~MD_MARK_AUTOLINK;
- } else {
- opener->flags |= MD_MARK_AUTOLINK;
- mark->flags |= MD_MARK_AUTOLINK;
- }
-
- /* And we are done. */
- return;
+ /* Distinguish it from raw HTML spans. */
+ opener->flags |= MD_MARK_AUTOLINK;
+ mark->flags |= MD_MARK_AUTOLINK;
}
-
- opener_index = opener->next;
}
}
diff --git a/test/pathological_tests.py b/test/pathological_tests.py
index cf31408..0f9b55c 100755
--- a/test/pathological_tests.py
+++ b/test/pathological_tests.py
@@ -67,7 +67,10 @@ pathological = {
re.compile("(\[0\] ){49999}")),
"deeply nested lists":
("".join(map(lambda x: (" " * x + "* a\n"), range(0,1000))),
- re.compile("<ul>\r?\n(<li>a<ul>\r?\n){999}<li>a</li>\r?\n</ul>\r?\n(</li>\r?\n</ul>\r?\n){999}"))
+ re.compile("<ul>\r?\n(<li>a<ul>\r?\n){999}<li>a</li>\r?\n</ul>\r?\n(</li>\r?\n</ul>\r?\n){999}")),
+ "many autolink/html openers and closers":
+ (("<>" * 50000),
+ re.compile("(<>){50000}"))
}
whitespace_re = re.compile('/s+/')