Commit 83047d3eb1b928f7a5e0bedca1a46bd2be939448

Martin Mitas 2019-05-07T22:24:29

md_analyze_permissive_url_autolink: Improve. * Fix domain recognition so that it has to have at least two dot-delimited components. * Fix handling of parentheses so that they have to form balanced pairs; i.e. the first ')' not having a preceding opener ends the path. Fixes #76.

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1bce543..505de1c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,13 @@
 
 ## Next Version (Work in Progress)
 
+Changes:
+ * Make permissive URL autolink and permissive WWW autolink extensions stricter.
+
+   This brings the behavior closer to GFM and mitigates risk of false positives.
+   In particular, the domain has to contain at least one dot and parentheses
+   can be part of the link destination only if `(` and `)` are balanced.
+
 Fixes:
  * [#73](https://github.com/mity/md4c/issues/73):
    Some raw HTML inputs could lead to quadratic parsing times.
@@ -11,6 +18,8 @@ Fixes:
  * [#74](https://github.com/mity/md4c/issues/74):
    Fix input leading to a crash. Found by fuzzing.
 
+ * [#76](https://github.com/mity/md4c/issues/76):
+   Fix handling of parentheses in some corner cases.
 
 ## Version 0.3.2
 
diff --git a/md4c/md4c.c b/md4c/md4c.c
index 2b171df..f91348e 100644
--- a/md4c/md4c.c
+++ b/md4c/md4c.c
@@ -3628,55 +3628,57 @@ md_analyze_permissive_url_autolink(MD_CTX* ctx, int mark_index)
     MD_MARK* closer = &ctx->marks[closer_index];
     MD_MARK* next_resolved_mark;
     OFF off = opener->end;
-    int seen_dot = FALSE;
-    int seen_underscore_or_hyphen[2] = { FALSE, FALSE };
+    int n_dots = FALSE;
+    int has_underscore_in_last_seg = FALSE;
+    int has_underscore_in_next_to_last_seg = FALSE;
+    int n_opened_parenthesis = 0;
 
     /* Check for domain. */
     while(off < ctx->size) {
-        if(ISALNUM(off)) {
+        if(ISALNUM(off) || CH(off) == _T('-')) {
             off++;
         } else if(CH(off) == _T('.')) {
-            seen_dot = TRUE;
-            seen_underscore_or_hyphen[0] = seen_underscore_or_hyphen[1];
-            seen_underscore_or_hyphen[1] = FALSE;
+            /* We must see at least one period. */
+            n_dots++;
+            has_underscore_in_next_to_last_seg = has_underscore_in_last_seg;
+            has_underscore_in_last_seg = FALSE;
             off++;
-        } else if(ISANYOF2(off, _T('-'), _T('_'))) {
-            seen_underscore_or_hyphen[1] = TRUE;
+        } else if(CH(off) == _T('_')) {
+            /* No underscore may be present in the last two domain segments. */
+            has_underscore_in_last_seg = TRUE;
             off++;
         } else {
             break;
         }
     }
-
-    if(off <= opener->end || !seen_dot || seen_underscore_or_hyphen[0] || seen_underscore_or_hyphen[1])
+    if(off > opener->end  &&  CH(off-1) == _T('.')) {
+        off--;
+        n_dots--;
+    }
+    if(off <= opener->end || n_dots == 0 || has_underscore_in_next_to_last_seg || has_underscore_in_last_seg)
         return;
 
     /* Check for path. */
     next_resolved_mark = closer + 1;
     while(next_resolved_mark->ch == 'D' || !(next_resolved_mark->flags & MD_MARK_RESOLVED))
         next_resolved_mark++;
-    while(off < next_resolved_mark->beg  &&  CH(off) != _T('<')  &&  !ISWHITESPACE(off)  &&  !ISNEWLINE(off))
-        off++;
-
-    /* Path validation. */
-    if(ISANYOF(off-1, _T("?!.,:*_~)"))) {
-        if(CH(off-1) != _T(')')) {
-            off--;
-        } else {
-            int parenthesis_balance = 0;
-            OFF tmp;
-
-            for(tmp = opener->end; tmp < off; tmp++) {
-                if(CH(tmp) == _T('('))
-                    parenthesis_balance++;
-                else if(CH(tmp) == _T(')'))
-                    parenthesis_balance--;
-            }
-
-            if(parenthesis_balance < 0)
-                off--;
+    while(off < next_resolved_mark->beg  &&  CH(off) != _T('<')  &&  !ISWHITESPACE(off)  &&  !ISNEWLINE(off)) {
+        /* Parenthesis must be balanced. */
+        if(CH(off) == _T('(')) {
+            n_opened_parenthesis++;
+        } else if(CH(off) == _T(')')) {
+            if(n_opened_parenthesis > 0)
+                n_opened_parenthesis--;
+            else
+                break;
         }
+
+        off++;
     }
+    /* These cannot be the last char. In such a case they are more likely
+     * normal punctuation. */
+    if(ISANYOF(off-1, _T("?!.,:*_~")))
+        off--;
 
     /* Ok. Lets call it auto-link. Adapt opener and create closer to zero
      * length so all the contents becomes the link text. */
diff --git a/test/permissive-url-autolinks.txt b/test/permissive-url-autolinks.txt
index ebe1504..89f60c2 100644
--- a/test/permissive-url-autolinks.txt
+++ b/test/permissive-url-autolinks.txt
@@ -77,3 +77,13 @@ This is [link](http://github.com/)X
 .
 <p>This is <a href="http://github.com/">link</a>X</p>
 ````````````````````````````````
+
+
+## [Issue 76](https://github.com/mity/md4c/issues/76)
+```````````````````````````````` example
+*(http://example.com)*
+.
+<p><em>(<a href="http://example.com">http://example.com</a>)</em></p>
+````````````````````````````````
+
+
diff --git a/test/permissive-www-autolinks.txt b/test/permissive-www-autolinks.txt
index 71ccf93..2830722 100644
--- a/test/permissive-www-autolinks.txt
+++ b/test/permissive-www-autolinks.txt
@@ -64,9 +64,9 @@ the only parentheses are in the interior of the autolink, no special rules are
 applied:
 
 ```````````````````````````````` example
-www.google.com/search?q=(business))+ok
+www.google.com/search?q=(business)+ok
 .
-<p><a href="http://www.google.com/search?q=(business))+ok">www.google.com/search?q=(business))+ok</a></p>
+<p><a href="http://www.google.com/search?q=(business)+ok">www.google.com/search?q=(business)+ok</a></p>
 ````````````````````````````````
 
 If an autolink ends in a semicolon (`;`), we check to see if it appears to