Commit a7899c057b4332ad7fd661ced4ce85f49006bd18

Martin Mitas 2016-10-11T02:34:01

Implement autolinks.

diff --git a/README.md b/README.md
index ec48964..eac74ad 100644
--- a/README.md
+++ b/README.md
@@ -103,7 +103,7 @@ more or less forms our to do list.
   - [ ] 6.4 Emphasis and strong emphasis
   - [ ] 6.5 Links
   - [ ] 6.6 Images
-  - [ ] 6.7 Autolinks
+  - [x] 6.7 Autolinks
   - [x] 6.8 Raw HTML
   - [x] 6.9 Hard line breaks
   - [x] 6.10 Soft line breaks
diff --git a/md2html/md2html.c b/md2html/md2html.c
index ebc1634..2c3025e 100644
--- a/md2html/md2html.c
+++ b/md2html/md2html.c
@@ -149,6 +149,14 @@ open_code_block(struct membuffer* out, const MD_BLOCK_CODE_DETAIL* det)
     MEMBUF_APPEND_LITERAL(out, ">");
 }
 
+static void
+open_a_span(struct membuffer* out, MD_SPAN_A_DETAIL* det)
+{
+    MEMBUF_APPEND_LITERAL(out, "<a href=\"");
+    membuf_append_escaped(out, det->href, det->href_size);
+    MEMBUF_APPEND_LITERAL(out, "\">");
+}
+
 static unsigned
 hex_val(char ch)
 {
@@ -285,6 +293,7 @@ enter_span_callback(MD_SPANTYPE type, void* detail, void* userdata)
     struct membuffer* out = (struct membuffer*) userdata;
 
     switch(type) {
+        case MD_SPAN_A:         open_a_span(out, (MD_SPAN_A_DETAIL*) detail); break;
         case MD_SPAN_CODE:      MEMBUF_APPEND_LITERAL(out, "<code>"); break;
     }
 
@@ -297,6 +306,7 @@ leave_span_callback(MD_SPANTYPE type, void* detail, void* userdata)
     struct membuffer* out = (struct membuffer*) userdata;
 
     switch(type) {
+        case MD_SPAN_A:         MEMBUF_APPEND_LITERAL(out, "</a>"); break;
         case MD_SPAN_CODE:      MEMBUF_APPEND_LITERAL(out, "</code>"); break;
     }
 
diff --git a/md4c/md4c.c b/md4c/md4c.c
index b63dece..fad5610 100644
--- a/md4c/md4c.c
+++ b/md4c/md4c.c
@@ -100,8 +100,8 @@ struct MD_CTX_tag {
     MD_MARKCHAIN mark_chains[2];
     /* For md_analyze_backtick(). */
     #define BACKTICK_OPENERS        ctx->mark_chains[0]
-    /* For md_analyze_raw_html(). */
-    #define RAW_HTML_OPENERS        ctx->mark_chains[1]
+    /* For md_analyze_lt_gt(). */
+    #define LT_GT_OPENERS           ctx->mark_chains[1]
 
     /* For MD_BLOCK_QUOTE */
     unsigned quote_level;   /* Nesting level. */
@@ -684,6 +684,49 @@ md_is_html_any(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_
 }
 
 
+/******************************************
+ ***  Recognizing Some Complex Inlines  ***
+ ******************************************/
+
+static int
+md_is_autolink(MD_CTX* ctx, OFF beg, OFF end)
+{
+    OFF off;
+
+    MD_ASSERT(CH(beg) == _T('<'));
+    MD_ASSERT(CH(end-1) == _T('>'));
+
+    beg++;
+    end--;
+
+    /* Check for scheme. */
+    off = beg;
+    if(off >= end  ||  !ISASCII(off))
+        return -1;
+    off++;
+    while(1) {
+        if(off >= end)
+            return -1;
+        if(off - beg > 32)
+            return -1;
+        if(CH(off) == _T(':')  &&  off - beg >= 2)
+            break;
+        if(!ISALNUM(off) && CH(off) != _T('+') && CH(off) != _T('-') && CH(off) != _T('.'))
+            return -1;
+        off++;
+    }
+
+    /* Check the path after the scheme. */
+    while(off < end) {
+        if(ISWHITESPACE(off) || ISCNTRL(off) || CH(off) == _T('<') || CH(off) == _T('>'))
+            return -1;
+        off++;
+    }
+
+    return 0;
+}
+
+
 /******************************************************
  ***  Processing Sequence of Inlines (a.k.a Spans)  ***
  ******************************************************/
@@ -877,7 +920,7 @@ md_rollback(MD_CTX* ctx, int opener_index, int closer_index)
 
                 switch(opener->ch) {
                     case '`':   chain = &BACKTICK_OPENERS; break;
-                    case '<':   chain = &RAW_HTML_OPENERS; break;
+                    case '<':   chain = &LT_GT_OPENERS; break;
                     default:        MD_UNREACHABLE(); break;
                 }
 
@@ -1046,47 +1089,61 @@ md_analyze_backtick(MD_CTX* ctx, int mark_index)
 }
 
 static void
-md_analyze_raw_html(MD_CTX* ctx, int mark_index, const MD_LINE* lines, int n_lines)
+md_analyze_lt_gt(MD_CTX* ctx, int mark_index, const MD_LINE* lines, int n_lines)
 {
     MD_MARK* mark = &ctx->marks[mark_index];
     int opener_index;
 
     /* If it is an opener ('<'), remember it. */
     if(mark->flags & MD_MARK_POTENTIAL_OPENER) {
-        md_mark_chain_append(ctx, &RAW_HTML_OPENERS, mark_index);
+        md_mark_chain_append(ctx, &LT_GT_OPENERS, mark_index);
         return;
     }
 
     /* Otherwise we are potential closer and we try to resolve with since all
      * the chained unresolved openers. */
-    opener_index = RAW_HTML_OPENERS.head;
+    opener_index = LT_GT_OPENERS.head;
     while(opener_index >= 0) {
         MD_MARK* opener = &ctx->marks[opener_index];
-        int line_index = 0;
         OFF detected_end;
+        int is_autolink = 0;
+        int is_raw_html = 0;
 
-        /* Identify the line where the opening mark lives. */
-        while(1) {
-            if(opener->beg < lines[line_index].end)
-                break;
-            line_index++;
+        is_autolink = (md_is_autolink(ctx, opener->beg, mark->end) == 0);
+
+        if(!is_autolink) {
+            /* Identify the line where the opening mark lives. */
+            int line_index = 0;
+            while(1) {
+                if(opener->beg < lines[line_index].end)
+                    break;
+                line_index++;
+            }
+
+            is_raw_html = (md_is_html_any(ctx, lines + line_index,
+                    n_lines - line_index, opener->beg, mark->end, &detected_end) == 0);
         }
 
         /* Check whether the range forms a valid raw HTML. */
-        if(md_is_html_any(ctx, lines + line_index, n_lines - line_index,
-                opener->beg, mark->end, &detected_end) == 0)
-        {
-            /* If this fail, it means we have missed some earlier opportunity
+        if(is_autolink || is_raw_html) {
+            /* If this fails, it means we have missed some earlier opportunity
              * to resolve the opener. */
             MD_ASSERT(detected_end == mark->end);
 
             md_rollback(ctx, opener_index, mark_index);
-            md_resolve_range(ctx, &RAW_HTML_OPENERS, opener_index, mark_index);
+            md_resolve_range(ctx, &LT_GT_OPENERS, opener_index, mark_index);
 
-            /* Make these marks zero width so the '<' and '>' are part of its
-             * contents. */
-            opener->end = opener->beg;
-            mark->beg = mark->end;
+            if(is_raw_html) {
+                /* Make these marks zero width so the '<' and '>' are part of the
+                 * raw HTML text. */
+                opener->end = opener->beg;
+                mark->beg = mark->end;
+            } else {
+                /* Hack: This is to distinguish the autolink from raw HTML in
+                 * md_process_inlines(). */
+                opener->ch = 'A';
+                mark->ch = 'B';
+            }
 
             /* And we are done. */
             return;
@@ -1202,7 +1259,7 @@ md_analyze_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int precedence_
 
             case '<':
             case '>':
-                md_analyze_raw_html(ctx, i, lines, n_lines);
+                md_analyze_lt_gt(ctx, i, lines, n_lines);
                 break;
 
             case '&':
@@ -1251,6 +1308,9 @@ md_analyze_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
 static int
 md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
 {
+    union {
+        MD_SPAN_A_DETAIL a;
+    } det;
     MD_TEXTTYPE text_type;
     const MD_LINE* line = lines;
     const MD_MARK* prev_mark = NULL;
@@ -1302,6 +1362,18 @@ md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
                     }
                     break;
 
+                case 'A':       /* Autolink. */
+                    det.a.href = STR(mark->end);
+                    det.a.href_size = ctx->marks[mark->next].beg - mark->end;
+                    MD_ENTER_SPAN(MD_SPAN_A, (void*) &det);
+                    break;
+                case 'B':
+                    /* The detail already has to be initialized: There cannot
+                     * be any resolved mark between the autolink opener and
+                     * closer. */
+                    MD_LEAVE_SPAN(MD_SPAN_A, (void*) &det);
+                    break;
+
                 case '<':       /* Raw HTML. */
                     text_type = MD_TEXT_HTML;
                     break;
diff --git a/md4c/md4c.h b/md4c/md4c.h
index 9a0fe5c..d69d40b 100644
--- a/md4c/md4c.h
+++ b/md4c/md4c.h
@@ -89,6 +89,11 @@ enum MD_BLOCKTYPE_tag {
  * like paragraph or list item. */
 typedef enum MD_SPANTYPE_tag MD_SPANTYPE;
 enum MD_SPANTYPE_tag {
+    /* <a href="xxx">...</a>
+     * Detail: See structure MD_SPAN_A_DETAIL. */
+    MD_SPAN_A,
+
+    /* <code>...</code> */
     MD_SPAN_CODE
 };
 
@@ -131,6 +136,13 @@ enum MD_TEXTTYPE_tag {
 };
 
 
+/* Detailed info for MD_SPAN_A. */
+typedef struct MD_SPAN_A_DETAIL_tag MD_SPAN_A_DETAIL;
+struct MD_SPAN_A_DETAIL_tag {
+    const MD_CHAR* href;    /* Not zero-terminated, use href_size. */
+    MD_SIZE href_size;
+};
+
 /* Detailed info for MD_BLOCK_H. */
 typedef struct MD_BLOCK_H_DETAIL_tag MD_BLOCK_H_DETAIL;
 struct MD_BLOCK_H_DETAIL_tag {