Commit b42e7f5ceac7f72601b9c28c88693e63df95e416

Martin Mitas 2022-01-10T11:41:25

md_resolve_links: Avoid link ref. def. lookup if... if we know that the bracket pair contains nested brackets. That makes the label invalid anyway, therefore we know that there is no link ref. def. to be found anyway. In case of heavily nested bracket pairs, the lookup could lead to quadratic parsing times. Fixes #172.

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4156e64..17b17cc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -45,6 +45,10 @@ Fixes:
    One dash (optionally with a leading or tailing `:` appended or prepended)
    is now sufficient. This improves compatibility with the GFM.
 
+ * [#172](https://github.com/mity/md4c/issues/172):
+   Fix quadratic time behavior caused by unnecessary lookup for link reference
+   definition even if the potential label contains nested brackets.
+
 
 ## Version 0.4.8
 
diff --git a/src/md4c.c b/src/md4c.c
index 47b7e64..42a3289 100644
--- a/src/md4c.c
+++ b/src/md4c.c
@@ -2487,6 +2487,7 @@ struct MD_MARK_tag {
 #define MD_MARK_EMPH_MOD3_MASK              (0x40 | 0x80)
 #define MD_MARK_AUTOLINK                    0x20  /* Distinguisher for '<', '>'. */
 #define MD_MARK_VALIDPERMISSIVEAUTOLINK     0x20  /* For permissive autolinks. */
+#define MD_MARK_HASNESTEDBRACKETS           0x20  /* For '[' to rule out invalid link labels early */
 
 static MD_MARKCHAIN*
 md_asterisk_chain(MD_CTX* ctx, unsigned flags)
@@ -3367,17 +3368,20 @@ md_analyze_bracket(MD_CTX* ctx, int mark_index)
      * or enclosing pair of brackets (if the inner is the link, the outer
      * one cannot be.)
      *
-     * Therefore we here only construct a list of resolved '[' ']' pairs
-     * ordered by position of the closer. This allows ur to analyze what is
-     * or is not link in the right order, from inside to outside in case
-     * of nested brackets.
+     * Therefore we here only construct a list of '[' ']' pairs ordered by
+     * position of the closer. This allows us to analyze what is or is not
+     * link in the right order, from inside to outside in case of nested
+     * brackets.
      *
-     * The resolving itself is deferred into md_resolve_links().
+     * The resolving itself is deferred to md_resolve_links().
      */
 
     MD_MARK* mark = &ctx->marks[mark_index];
 
     if(mark->flags & MD_MARK_POTENTIAL_OPENER) {
+        if(BRACKET_OPENERS.head != -1)
+            ctx->marks[BRACKET_OPENERS.tail].flags |= MD_MARK_HASNESTEDBRACKETS;
+
         md_mark_chain_append(ctx, &BRACKET_OPENERS, mark_index);
         return;
     }
@@ -3542,10 +3546,12 @@ md_resolve_links(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
         if(next_opener != NULL  &&  next_opener->beg == closer->end) {
             if(next_closer->beg > closer->end + 1) {
                 /* Might be full reference link. */
-                is_link = md_is_link_reference(ctx, lines, n_lines, next_opener->beg, next_closer->end, &attr);
+                if(!(next_opener->flags & MD_MARK_HASNESTEDBRACKETS))
+                    is_link = md_is_link_reference(ctx, lines, n_lines, next_opener->beg, next_closer->end, &attr);
             } else {
                 /* Might be shortcut reference link. */
-                is_link = md_is_link_reference(ctx, lines, n_lines, opener->beg, closer->end, &attr);
+                if(!(opener->flags & MD_MARK_HASNESTEDBRACKETS))
+                    is_link = md_is_link_reference(ctx, lines, n_lines, opener->beg, closer->end, &attr);
             }
 
             if(is_link < 0)
@@ -3602,7 +3608,8 @@ md_resolve_links(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
 
             if(!is_link) {
                 /* Might be collapsed reference link. */
-                is_link = md_is_link_reference(ctx, lines, n_lines, opener->beg, closer->end, &attr);
+                if(!(opener->flags & MD_MARK_HASNESTEDBRACKETS))
+                    is_link = md_is_link_reference(ctx, lines, n_lines, opener->beg, closer->end, &attr);
                 if(is_link < 0)
                     return -1;
             }
diff --git a/test/pathological_tests.py b/test/pathological_tests.py
index a7e1a33..76cb9df 100755
--- a/test/pathological_tests.py
+++ b/test/pathological_tests.py
@@ -92,7 +92,10 @@ pathological = {
             re.compile("(\[ \(\]\(){50000}")),
     "broken thematic break":
             (("* " * 50000 + "a"),
-            re.compile("<ul>\r?\n(<li><ul>\r?\n){49999}<li>a</li>\r?\n</ul>\r?\n(</li>\r?\n</ul>\r?\n){49999}"))
+            re.compile("<ul>\r?\n(<li><ul>\r?\n){49999}<li>a</li>\r?\n</ul>\r?\n(</li>\r?\n</ul>\r?\n){49999}")),
+    "nested invalid link references":
+            (("[" * 50000 + "]" * 50000 + "\n\n[a]: /b"),
+            re.compile("\[{50000}\]{50000}"))
 }
 
 whitespace_re = re.compile('/s+/')