Commit 78829427081928cb89647b669304e688bb91d1d0

Martin Mitas 2024-01-13T02:59:35

Fix some emphasis parsing issues. * We incorrectly applied the infamous rule of three only to asterisk-encoded emphasis, it has to be applied to underscore as well. * We incorrectly applied the rule of three only if the opener and/or closer was inside a word. It has also to be applied if the mark is both preceded and followed by punctuation. Fixes #217.

diff --git a/src/md4c.c b/src/md4c.c
index e25d266..502d5a7 100644
--- a/src/md4c.c
+++ b/src/md4c.c
@@ -178,22 +178,27 @@ struct MD_CTX_tag {
 #endif
 
     /* For resolving of inline spans. */
-    MD_MARKCHAIN mark_chains[13];
+    MD_MARKCHAIN mark_chains[18];
 #define PTR_CHAIN                               (ctx->mark_chains[0])
 #define TABLECELLBOUNDARIES                     (ctx->mark_chains[1])
-#define ASTERISK_OPENERS_extraword_mod3_0       (ctx->mark_chains[2])
-#define ASTERISK_OPENERS_extraword_mod3_1       (ctx->mark_chains[3])
-#define ASTERISK_OPENERS_extraword_mod3_2       (ctx->mark_chains[4])
-#define ASTERISK_OPENERS_intraword_mod3_0       (ctx->mark_chains[5])
-#define ASTERISK_OPENERS_intraword_mod3_1       (ctx->mark_chains[6])
-#define ASTERISK_OPENERS_intraword_mod3_2       (ctx->mark_chains[7])
-#define UNDERSCORE_OPENERS                      (ctx->mark_chains[8])
-#define TILDE_OPENERS_1                         (ctx->mark_chains[9])
-#define TILDE_OPENERS_2                         (ctx->mark_chains[10])
-#define BRACKET_OPENERS                         (ctx->mark_chains[11])
-#define DOLLAR_OPENERS                          (ctx->mark_chains[12])
+#define ASTERISK_OPENERS_oo_mod3_0              (ctx->mark_chains[2])   /* Opener-only */
+#define ASTERISK_OPENERS_oo_mod3_1              (ctx->mark_chains[3])
+#define ASTERISK_OPENERS_oo_mod3_2              (ctx->mark_chains[4])
+#define ASTERISK_OPENERS_oc_mod3_0              (ctx->mark_chains[5])   /* Both opener and closer candidate */
+#define ASTERISK_OPENERS_oc_mod3_1              (ctx->mark_chains[6])
+#define ASTERISK_OPENERS_oc_mod3_2              (ctx->mark_chains[7])
+#define UNDERSCORE_OPENERS_oo_mod3_0            (ctx->mark_chains[8])   /* Opener-only */
+#define UNDERSCORE_OPENERS_oo_mod3_1            (ctx->mark_chains[9])
+#define UNDERSCORE_OPENERS_oo_mod3_2            (ctx->mark_chains[10])
+#define UNDERSCORE_OPENERS_oc_mod3_0            (ctx->mark_chains[11])  /* Both opener and closer candidate */
+#define UNDERSCORE_OPENERS_oc_mod3_1            (ctx->mark_chains[12])
+#define UNDERSCORE_OPENERS_oc_mod3_2            (ctx->mark_chains[13])
+#define TILDE_OPENERS_1                         (ctx->mark_chains[14])
+#define TILDE_OPENERS_2                         (ctx->mark_chains[15])
+#define BRACKET_OPENERS                         (ctx->mark_chains[16])
+#define DOLLAR_OPENERS                          (ctx->mark_chains[17])
 #define OPENERS_CHAIN_FIRST                     2  /* [0] and [1] are special. */
-#define OPENERS_CHAIN_LAST                      12
+#define OPENERS_CHAIN_LAST                      17
 
     int n_table_cell_boundaries;
 
@@ -2482,7 +2487,7 @@ struct MD_MARK_tag {
 #define MD_MARK_RESOLVED                    0x10  /* Resolved in any definite way. */
 
 /* Mark flags specific for various mark types (so they can share bits). */
-#define MD_MARK_EMPH_INTRAWORD              0x20  /* Helper for the "rule of 3". */
+#define MD_MARK_EMPH_OC                     0x20  /* Opener/closer mixed candidate. Helper for the "rule of 3". */
 #define MD_MARK_EMPH_MOD3_0                 0x40
 #define MD_MARK_EMPH_MOD3_1                 0x80
 #define MD_MARK_EMPH_MOD3_2                 (0x40 | 0x80)
@@ -2492,18 +2497,27 @@ struct MD_MARK_tag {
 #define MD_MARK_HASNESTEDBRACKETS           0x20  /* For '[' to rule out invalid link labels early */
 
 static MD_MARKCHAIN*
-md_asterisk_chain(MD_CTX* ctx, unsigned flags)
+md_emph_chain(MD_CTX* ctx, MD_CHAR ch, unsigned flags)
 {
-    switch(flags & (MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_MASK)) {
-        case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_0:  return &ASTERISK_OPENERS_intraword_mod3_0;
-        case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_1:  return &ASTERISK_OPENERS_intraword_mod3_1;
-        case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_2:  return &ASTERISK_OPENERS_intraword_mod3_2;
-        case MD_MARK_EMPH_MOD3_0:                           return &ASTERISK_OPENERS_extraword_mod3_0;
-        case MD_MARK_EMPH_MOD3_1:                           return &ASTERISK_OPENERS_extraword_mod3_1;
-        case MD_MARK_EMPH_MOD3_2:                           return &ASTERISK_OPENERS_extraword_mod3_2;
-        default:                                            MD_UNREACHABLE();
+    MD_MARKCHAIN* chain;
+
+    switch(ch) {
+        case '*':   chain = &ASTERISK_OPENERS_oo_mod3_0; break;
+        case '_':   chain = &UNDERSCORE_OPENERS_oo_mod3_0; break;
+        default:    MD_UNREACHABLE();
     }
-    return NULL;
+
+    if(flags & MD_MARK_EMPH_OC)
+        chain += 3;
+
+    switch(flags & MD_MARK_EMPH_MOD3_MASK) {
+        case MD_MARK_EMPH_MOD3_0:   chain += 0; break;
+        case MD_MARK_EMPH_MOD3_1:   chain += 1; break;
+        case MD_MARK_EMPH_MOD3_2:   chain += 2; break;
+        default:                    MD_UNREACHABLE();
+    }
+
+    return chain;
 }
 
 static MD_MARKCHAIN*
@@ -2512,11 +2526,14 @@ md_mark_chain(MD_CTX* ctx, int mark_index)
     MD_MARK* mark = &ctx->marks[mark_index];
 
     switch(mark->ch) {
-        case _T('*'):   return md_asterisk_chain(ctx, mark->flags);
-        case _T('_'):   return &UNDERSCORE_OPENERS;
+        case _T('*'):
+        case _T('_'):   return md_emph_chain(ctx, mark->ch, mark->flags);
+
         case _T('~'):   return (mark->end - mark->beg == 1) ? &TILDE_OPENERS_1 : &TILDE_OPENERS_2;
+
         case _T('!'):
         case _T('['):   return &BRACKET_OPENERS;
+
         default:        return NULL;
     }
 }
@@ -3075,8 +3092,8 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode)
                         flags |= MD_MARK_POTENTIAL_CLOSER;
                     if(right_level > 0  &&  right_level >= left_level)
                         flags |= MD_MARK_POTENTIAL_OPENER;
-                    if(left_level == 2  &&  right_level == 2)
-                        flags |= MD_MARK_EMPH_INTRAWORD;
+                    if(flags == (MD_MARK_POTENTIAL_OPENER | MD_MARK_POTENTIAL_CLOSER))
+                        flags |= MD_MARK_EMPH_OC;
 
                     /* For "the rule of three" we need to remember the original
                      * size of the mark (modulo three), before we potentially
@@ -3745,48 +3762,39 @@ static void
 md_analyze_emph(MD_CTX* ctx, int mark_index)
 {
     MD_MARK* mark = &ctx->marks[mark_index];
-    MD_MARKCHAIN* chain = md_mark_chain(ctx, mark_index);
 
     /* If we can be a closer, try to resolve with the preceding opener. */
     if(mark->flags & MD_MARK_POTENTIAL_CLOSER) {
         MD_MARK* opener = NULL;
         int opener_index = 0;
-
-        if(mark->ch == _T('*')) {
-            MD_MARKCHAIN* opener_chains[6];
-            int i, n_opener_chains;
-            unsigned flags = mark->flags;
-
-            /* Apply the "rule of three". */
-            n_opener_chains = 0;
-            opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_0;
-            if((flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_2)
-                opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_1;
-            if((flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_1)
-                opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_2;
-            opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_0;
-            if(!(flags & MD_MARK_EMPH_INTRAWORD)  ||  (flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_2)
-                opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_1;
-            if(!(flags & MD_MARK_EMPH_INTRAWORD)  ||  (flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_1)
-                opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_2;
-
-            /* Opener is the most recent mark from the allowed chains. */
-            for(i = 0; i < n_opener_chains; i++) {
-                if(opener_chains[i]->tail >= 0) {
-                    int tmp_index = opener_chains[i]->tail;
-                    MD_MARK* tmp_mark = &ctx->marks[tmp_index];
-                    if(opener == NULL  ||  tmp_mark->end > opener->end) {
-                        opener_index = tmp_index;
-                        opener = tmp_mark;
-                    }
+        MD_MARKCHAIN* opener_chains[6];
+        int i, n_opener_chains;
+        unsigned flags = mark->flags;
+
+        n_opener_chains = 0;
+
+        /* Apply the rule of 3 */
+        opener_chains[n_opener_chains++] = md_emph_chain(ctx, mark->ch, MD_MARK_EMPH_MOD3_0 | MD_MARK_EMPH_OC);
+        if((flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_2)
+            opener_chains[n_opener_chains++] = md_emph_chain(ctx, mark->ch, MD_MARK_EMPH_MOD3_1 | MD_MARK_EMPH_OC);
+        if((flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_1)
+            opener_chains[n_opener_chains++] = md_emph_chain(ctx, mark->ch, MD_MARK_EMPH_MOD3_2 | MD_MARK_EMPH_OC);
+        opener_chains[n_opener_chains++] = md_emph_chain(ctx, mark->ch, MD_MARK_EMPH_MOD3_0);
+        if(!(flags & MD_MARK_EMPH_OC)  ||  (flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_2)
+            opener_chains[n_opener_chains++] = md_emph_chain(ctx, mark->ch, MD_MARK_EMPH_MOD3_1);
+        if(!(flags & MD_MARK_EMPH_OC)  ||  (flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_1)
+            opener_chains[n_opener_chains++] = md_emph_chain(ctx, mark->ch, MD_MARK_EMPH_MOD3_2);
+
+        /* Opener is the most recent mark from the allowed chains. */
+        for(i = 0; i < n_opener_chains; i++) {
+            if(opener_chains[i]->tail >= 0) {
+                int tmp_index = opener_chains[i]->tail;
+                MD_MARK* tmp_mark = &ctx->marks[tmp_index];
+                if(opener == NULL  ||  tmp_mark->end > opener->end) {
+                    opener_index = tmp_index;
+                    opener = tmp_mark;
                 }
             }
-        } else {
-            /* Simple emph. mark */
-            if(chain->tail >= 0) {
-                opener_index = chain->tail;
-                opener = &ctx->marks[opener_index];
-            }
         }
 
         /* Resolve, if we have found matching opener. */
@@ -3810,7 +3818,7 @@ md_analyze_emph(MD_CTX* ctx, int mark_index)
 
     /* If we could not resolve as closer, we may be yet be an opener. */
     if(mark->flags & MD_MARK_POTENTIAL_OPENER)
-        md_mark_chain_append(ctx, chain, mark_index);
+        md_mark_chain_append(ctx, md_emph_chain(ctx, mark->ch, mark->flags), mark_index);
 }
 
 static void
diff --git a/test/coverage.txt b/test/coverage.txt
index 1d4fdf5..5d79bdb 100644
--- a/test/coverage.txt
+++ b/test/coverage.txt
@@ -420,6 +420,36 @@ x <!A>
 ````````````````````````````````
 
 
+### [Issue 217](https://github.com/mity/md4c/issues/217)
+
+```````````````````````````````` example
+__!_!__
+
+__!x!__
+
+**!*!**
+
+---
+
+_*__*_*
+
+_*xx*_*
+
+_*__-_-
+
+_*xx-_-
+.
+<p><strong>!_!</strong></p>
+<p><strong>!x!</strong></p>
+<p><strong>!*!</strong></p>
+<hr />
+<p><em><em>__</em></em>*</p>
+<p><em><em>xx</em></em>*</p>
+<p><em>*__-</em>-</p>
+<p><em>*xx-</em>-</p>
+````````````````````````````````
+
+
 ## Code coverage
 
 ### `md_is_unicode_whitespace__()`