Commit 0cb61205b12ed9256002a42df0f76e6c6a0125fa

Martin Mitas 2019-03-10T10:50:23

Move raw inline HTML detection from md_analyze_lt_qt() into md_collect_marks(). Fixes #58: For resolving raw inline HTML the function tried closer with all potential openers, because raw HTML can have '<' inside of an attribute. However this caused O(n^2) for input like "<><><><><><><>...". We solved by handling raw HTML in earlier stage, directly in md_collect_marks(), where we can scan linerary forward. Fixes #61: As a side effect, this also fixes the issue that MD_FLAG_NOHTMLSPANS disabled also recognition of CommonMark autolinks.

diff --git a/md4c/md4c.c b/md4c/md4c.c
index 4c0ad5c..4476e8e 100644
--- a/md4c/md4c.c
+++ b/md4c/md4c.c
@@ -2893,9 +2893,33 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode)
 
             /* A potential autolink or raw HTML start/end. */
             if(ch == _T('<') || ch == _T('>')) {
-                if(!(ctx->parser.flags & MD_FLAG_NOHTMLSPANS))
-                    PUSH_MARK(ch, off, off+1, (ch == _T('<') ? MD_MARK_POTENTIAL_OPENER : MD_MARK_POTENTIAL_CLOSER));
-
+                if(!(ctx->parser.flags & MD_FLAG_NOHTMLSPANS)  &&  ch == _T('<')) {
+                    int is_html;
+                    OFF html_end;
+
+                    /* Given the nature of the raw HTML, we have to recognize
+                     * it here. Doing so later in md_analyze_lt_gt() could
+                     * open can of worms of quadratic complexity. */
+                    is_html = md_is_html_any(ctx, lines + i, n_lines - i, off, 
+                                    lines[n_lines-1].end, &html_end);
+                    if(is_html) {
+                        PUSH_MARK(_T('<'), off, off, MD_MARK_OPENER | MD_MARK_RESOLVED);
+                        PUSH_MARK(_T('>'), html_end, html_end, MD_MARK_CLOSER | MD_MARK_RESOLVED);
+                        ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
+                        ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
+                        off = html_end;
+
+                        /* Advance the current line accordingly. */
+                        while(off > line_end) {
+                            i++;
+                            line++;
+                            line_end = line->end;
+                        }
+                        continue;
+                    }
+                }
+                
+                PUSH_MARK(ch, off, off+1, (ch == _T('<') ? MD_MARK_POTENTIAL_OPENER : MD_MARK_POTENTIAL_CLOSER));
                 off++;
                 continue;
             }
@@ -3203,61 +3227,24 @@ md_analyze_lt_gt(MD_CTX* ctx, int mark_index, const MD_LINE* lines, int n_lines)
         return;
     }
 
-    /* Otherwise we are potential closer and we try to resolve with since all
-     * the chained unresolved openers. */
+    /* Otherwise we are potential closer: Try the most recent opener to resolve
+     * an autolink. */
     opener_index = LOWERTHEN_OPENERS.head;
-    while(opener_index >= 0) {
+    if(opener_index >= 0) {
         MD_MARK* opener = &ctx->marks[opener_index];
-        OFF detected_end;
-        int is_autolink = 0;
         int is_missing_mailto = 0;
-        int is_raw_html = 0;
-
-        is_autolink = (md_is_autolink(ctx, opener->beg, mark->end, &is_missing_mailto));
 
-        if(is_autolink) {
+        if(md_is_autolink(ctx, opener->beg, mark->end, &is_missing_mailto)) {
             if(is_missing_mailto)
                 opener->ch = _T('@');
-        } else {
-            /* Identify the line where the opening mark lives. */
-            int line_index = 0;
-            while(1) {
-                if(opener->beg < lines[line_index].end)
-                    break;
-                line_index++;
-            }
-
-            is_raw_html = (md_is_html_any(ctx, lines + line_index,
-                    n_lines - line_index, opener->beg, mark->end, &detected_end));
-        }
 
-        /* Check whether the range forms a valid raw HTML. */
-        if(is_autolink || is_raw_html) {
             md_rollback(ctx, opener_index, mark_index, MD_ROLLBACK_ALL);
             md_resolve_range(ctx, &LOWERTHEN_OPENERS, opener_index, mark_index);
 
-            if(is_raw_html) {
-                /* If this fails, it means we have missed some earlier opportunity
-                 * to resolve the opener of raw HTML. */
-                MD_ASSERT(detected_end == mark->end);
-
-                /* Make these marks zero width so the '<' and '>' are part of its
-                 * contents. */
-                opener->end = opener->beg;
-                mark->beg = mark->end;
-
-                opener->flags &= ~MD_MARK_AUTOLINK;
-                mark->flags &= ~MD_MARK_AUTOLINK;
-            } else {
-                opener->flags |= MD_MARK_AUTOLINK;
-                mark->flags |= MD_MARK_AUTOLINK;
-            }
-
-            /* And we are done. */
-            return;
+            /* Distinguish it from raw HTML spans. */
+            opener->flags |= MD_MARK_AUTOLINK;
+            mark->flags |= MD_MARK_AUTOLINK;
         }
-
-        opener_index = opener->next;
     }
 }
 
diff --git a/test/pathological_tests.py b/test/pathological_tests.py
index cf31408..0f9b55c 100755
--- a/test/pathological_tests.py
+++ b/test/pathological_tests.py
@@ -67,7 +67,10 @@ pathological = {
                   re.compile("(\[0\] ){49999}")),
     "deeply nested lists":
                  ("".join(map(lambda x: ("  " * x + "* a\n"), range(0,1000))),
-                  re.compile("<ul>\r?\n(<li>a<ul>\r?\n){999}<li>a</li>\r?\n</ul>\r?\n(</li>\r?\n</ul>\r?\n){999}"))
+                  re.compile("<ul>\r?\n(<li>a<ul>\r?\n){999}<li>a</li>\r?\n</ul>\r?\n(</li>\r?\n</ul>\r?\n){999}")),
+    "many autolink/html openers and closers":
+                 (("<>" * 50000),
+                  re.compile("(&lt;&gt;){50000}"))
     }
 
 whitespace_re = re.compile('/s+/')