Implement e-mail autolinks.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145
diff --git a/md4c/md4c.c b/md4c/md4c.c
index fe83b65..7511478 100644
--- a/md4c/md4c.c
+++ b/md4c/md4c.c
@@ -1323,18 +1323,11 @@ md_analyze_backtick(MD_CTX* ctx, int mark_index)
}
static int
-md_is_autolink(MD_CTX* ctx, OFF beg, OFF end)
+md_is_autolink_uri(MD_CTX* ctx, OFF beg, OFF end)
{
- OFF off;
-
- MD_ASSERT(CH(beg) == _T('<'));
- MD_ASSERT(CH(end-1) == _T('>'));
-
- beg++;
- end--;
+ OFF off = beg;
/* Check for scheme. */
- off = beg;
if(off >= end || !ISASCII(off))
return -1;
off++;
@@ -1360,6 +1353,74 @@ md_is_autolink(MD_CTX* ctx, OFF beg, OFF end)
return 0;
}
+static int
+md_is_autolink_email(MD_CTX* ctx, OFF beg, OFF end)
+{
+ OFF off = beg;
+ int label_len;
+ /* Returns 0 when the characters at offsets [beg, end) form an e-mail address, -1 otherwise. */
+ /* The code should correspond to this regexp:
+ /^[a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+
+ @[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?
+ (?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$/
+ */
+
+ /* Username (before '@'): one or more alnum or listed punctuation chars. */
+ while(off < end && (ISALNUM(off) || ISANYOF(off, _T(".!#$%&'*+/=?^_`{|}~-"))))
+ off++;
+ if(off <= beg)
+ return -1;
+
+ /* '@' must directly follow the username. */
+ if(off >= end || CH(off) != _T('@'))
+ return -1;
+ off++;
+
+ /* Labels delimited with '.'; each label is sequence of 1 - 63 alnum
+ * characters or '-', but '-' is not allowed as first or last char. */
+ label_len = 0;
+ while(off < end) {
+ if(ISALNUM(off))
+ label_len++;
+ else if(CH(off) == _T('-') && label_len > 0) /* '-' may not start a label */
+ label_len++;
+ else if(CH(off) == _T('.') && label_len > 0 && CH(off-1) != _T('-')) /* '.' closes a non-empty label not ending in '-' */
+ label_len = 0; /* start counting the next label */
+ else
+ return -1;
+
+ if(label_len > 63) /* label exceeds the 63-character limit */
+ return -1;
+
+ off++;
+ }
+
+ if(label_len <= 0 || CH(off-1) == _T('-')) /* last label is empty or ends with '-' */
+ return -1;
+
+ return 0;
+}
+/* Autolink test for a '<'...'>' span [beg, end): returns 0 if its interior is a URI or an e-mail address, -1 otherwise. */
+static int
+md_is_autolink(MD_CTX* ctx, OFF beg, OFF end, int* p_missing_mailto)
+{
+ MD_ASSERT(CH(beg) == _T('<'));
+ MD_ASSERT(CH(end-1) == _T('>'));
+ /* Drop the enclosing '<' and '>' so only the interior is examined. */
+ beg++;
+ end--;
+ /* The URI form is tried first. */
+ if(md_is_autolink_uri(ctx, beg, end) == 0)
+ return 0;
+ /* E-mail form: flag it via *p_missing_mailto, which is left untouched on every other path. */
+ if(md_is_autolink_email(ctx, beg, end) == 0) {
+ *p_missing_mailto = 1;
+ return 0;
+ }
+
+ return -1;
+}
+
static void
md_analyze_lt_gt(MD_CTX* ctx, int mark_index, const MD_LINE* lines, int n_lines)
{
@@ -1379,11 +1440,15 @@ md_analyze_lt_gt(MD_CTX* ctx, int mark_index, const MD_LINE* lines, int n_lines)
MD_MARK* opener = &ctx->marks[opener_index];
OFF detected_end;
int is_autolink = 0;
+ int is_missing_mailto = 0;
int is_raw_html = 0;
- is_autolink = (md_is_autolink(ctx, opener->beg, mark->end) == 0);
+ is_autolink = (md_is_autolink(ctx, opener->beg, mark->end, &is_missing_mailto) == 0);
- if(!is_autolink) {
+ if(is_autolink) {
+ if(is_missing_mailto)
+ opener->ch = _T('@');
+ } else {
/* Identify the line where the opening mark lives. */
int line_index = 0;
while(1) {
@@ -1590,6 +1655,10 @@ md_analyze_permissive_url_autolink(MD_CTX* ctx, int mark_index)
md_resolve_range(ctx, NULL, mark_index, closer_index);
}
+/* The permissive autolinks do not have to be enclosed in '<' '>', but we
+ * instead impose stricter rules on what is understood as an e-mail address
+ * here: any non-alphanumeric characters, with the exception of '.', are
+ * prohibited both in the username and after the '@'. */
static void
md_analyze_permissive_email_autolink(MD_CTX* ctx, int mark_index)
{
@@ -1616,10 +1685,14 @@ md_analyze_permissive_email_autolink(MD_CTX* ctx, int mark_index)
return;
}
- /* Accept any alphanumeric sequences delimited with dot after the '@'. */
+ /* Accept any alphanumeric sequences delimited with dot after the '@',
+ * limiting the sequences length by 64 characters. */
while(1) {
+ OFF label_start = end;
while(end + 1 < ctx->size && ISALNUM(end))
end++;
+ if(end - label_start > 63)
+ return;
if(end + 1 < ctx->size && CH(end) == _T('.') && ISALNUM(end+1)) {
right_dot_count++;