Commit 82d7d087cc3c9b57f5490e6706f64ccec792ae6f

Martin Mitas 2020-01-10T15:48:00

Rework/improve recognition of strike-through spans. Closes #102.

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a83c44b..afd674c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,18 @@
 
 Changes:
 
+ * Recognition of strike-through spans (with the flag `MD_FLAG_STRIKETHROUGH`)
+   has become much stricter and, arguably, more reasonable.
+
+    - Only single tildes (`~`) and double tildes (`~~`) are recognized as
+      strike-through marks. Longer sequences are no longer recognized.
+    - The opener and closer marks have to be of the same length.
+    - The tildes cannot open a strike-through span if a whitespace follows.
+    - The tildes cannot close a strike-through span if a whitespace precedes.
+
+   This change follows the changes of behavior in cmark-gfm some time ago, so
+   it is also beneficial from a compatibility point of view.
+
  * When building MD4C by hand instead of using its CMake-based build, the UTF-8
    support was by default disabled, unless explicitly asked for by defining
    a preprocessor macro `MD4C_USE_UTF8`.
diff --git a/md4c/md4c.c b/md4c/md4c.c
index dc76fb8..c293424 100644
--- a/md4c/md4c.c
+++ b/md4c/md4c.c
@@ -133,7 +133,7 @@ struct MD_CTX_tag {
 #endif
 
     /* For resolving of inline spans. */
-    MD_MARKCHAIN mark_chains[12];
+    MD_MARKCHAIN mark_chains[13];
 #define PTR_CHAIN                               ctx->mark_chains[0]
 #define TABLECELLBOUNDARIES                     ctx->mark_chains[1]
 #define ASTERISK_OPENERS_extraword_mod3_0       ctx->mark_chains[2]
@@ -143,11 +143,12 @@ struct MD_CTX_tag {
 #define ASTERISK_OPENERS_intraword_mod3_1       ctx->mark_chains[6]
 #define ASTERISK_OPENERS_intraword_mod3_2       ctx->mark_chains[7]
 #define UNDERSCORE_OPENERS                      ctx->mark_chains[8]
-#define TILDE_OPENERS                           ctx->mark_chains[9]
-#define BRACKET_OPENERS                         ctx->mark_chains[10]
-#define DOLLAR_OPENERS                          ctx->mark_chains[11]
+#define TILDE_OPENERS_1                         ctx->mark_chains[9]
+#define TILDE_OPENERS_2                         ctx->mark_chains[10]
+#define BRACKET_OPENERS                         ctx->mark_chains[11]
+#define DOLLAR_OPENERS                          ctx->mark_chains[12]
 #define OPENERS_CHAIN_FIRST                     2
-#define OPENERS_CHAIN_LAST                      11
+#define OPENERS_CHAIN_LAST                      12
 
     int n_table_cell_boundaries;
 
@@ -2474,7 +2475,7 @@ md_mark_chain(MD_CTX* ctx, int mark_index)
     switch(mark->ch) {
         case _T('*'):   return md_asterisk_chain(ctx, mark->flags);
         case _T('_'):   return &UNDERSCORE_OPENERS;
-        case _T('~'):   return &TILDE_OPENERS;
+        case _T('~'):   return (mark->end - mark->beg == 1) ? &TILDE_OPENERS_1 : &TILDE_OPENERS_2;
         case _T('['):   return &BRACKET_OPENERS;
         case _T('|'):   return &TABLECELLBOUNDARIES;
         default:        return NULL;
@@ -3254,7 +3255,17 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode)
                 while(tmp < line_end  &&  CH(tmp) == _T('~'))
                     tmp++;
 
-                PUSH_MARK(ch, off, tmp, MD_MARK_POTENTIAL_OPENER | MD_MARK_POTENTIAL_CLOSER);
+                if(tmp - off < 3) {
+                    unsigned flags = 0;
+
+                    if(tmp < line_end  &&  !ISUNICODEWHITESPACE(tmp))
+                        flags |= MD_MARK_POTENTIAL_OPENER;
+                    if(off > line->beg  &&  !ISUNICODEWHITESPACEBEFORE(off))
+                        flags |= MD_MARK_POTENTIAL_CLOSER;
+                    if(flags != 0)
+                        PUSH_MARK(ch, off, tmp, flags);
+                }
+
                 off = tmp;
                 continue;
             }
@@ -3724,20 +3735,23 @@ md_analyze_emph(MD_CTX* ctx, int mark_index)
 static void
 md_analyze_tilde(MD_CTX* ctx, int mark_index)
 {
-    /* We attempt to be Github Flavored Markdown compatible here. GFM says
-     * that length of the tilde sequence is not important at all. Note that
-     * implies the TILDE_OPENERS chain can have at most one item. */
+    MD_MARK* mark = &ctx->marks[mark_index];
+    MD_MARKCHAIN* chain = md_mark_chain(ctx, mark_index);
+
+    /* We attempt to be Github Flavored Markdown compatible here. GFM accepts
+     * only tilde sequences of length 1 and 2, and the length of the opener
+     * and closer has to match. */
 
-    if(TILDE_OPENERS.head >= 0) {
-        /* The chain already contains an opener, so we may resolve the span. */
-        int opener_index = TILDE_OPENERS.head;
+    if((mark->flags & MD_MARK_POTENTIAL_CLOSER)  &&  chain->head >= 0) {
+        int opener_index = chain->head;
 
         md_rollback(ctx, opener_index, mark_index, MD_ROLLBACK_CROSSING);
-        md_resolve_range(ctx, &TILDE_OPENERS, opener_index, mark_index);
-    } else {
-        /* We can only be opener. */
-        md_mark_chain_append(ctx, &TILDE_OPENERS, mark_index);
+        md_resolve_range(ctx, chain, opener_index, mark_index);
+        return;
     }
+
+    if(mark->flags & MD_MARK_POTENTIAL_OPENER)
+        md_mark_chain_append(ctx, chain, mark_index);
 }
 
 static void
@@ -3997,8 +4011,10 @@ md_analyze_link_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
     ASTERISK_OPENERS_intraword_mod3_2.tail = -1;
     UNDERSCORE_OPENERS.head = -1;
     UNDERSCORE_OPENERS.tail = -1;
-    TILDE_OPENERS.head = -1;
-    TILDE_OPENERS.tail = -1;
+    TILDE_OPENERS_1.head = -1;
+    TILDE_OPENERS_1.tail = -1;
+    TILDE_OPENERS_2.head = -1;
+    TILDE_OPENERS_2.tail = -1;
     DOLLAR_OPENERS.head = -1;
     DOLLAR_OPENERS.tail = -1;
 }
diff --git a/test/strikethrough.txt b/test/strikethrough.txt
index 0b21b7e..884ce59 100644
--- a/test/strikethrough.txt
+++ b/test/strikethrough.txt
@@ -1,10 +1,10 @@
 
-# Strikethrough
+# Strike-Through
 
 With the flag `MD_FLAG_STRIKETHROUGH`, MD4C enables extension for recognition
 of strike-through spans.
 
-Strikethrough text is any text wrapped in tildes (~).
+Strike-through text is any text wrapped in one or two tildes (`~`).
 
 ```````````````````````````````` example
 ~Hi~ Hello, world!
@@ -12,17 +12,36 @@ Strikethrough text is any text wrapped in tildes (~).
 <p><del>Hi</del> Hello, world!</p>
 ````````````````````````````````
 
-Any number of tildes may be used on either side of the text; they do not need
-to match, and they cannot be nested.
+If the length of the opener and closer doesn't match, the strike-through is
+not recognized.
 
 ```````````````````````````````` example
-This ~text~~~~ is ~~~~curious~.
+This ~text~~ is curious.
 .
-<p>This <del>text</del> is <del>curious</del>.</p>
+<p>This ~text~~ is curious.</p>
 ````````````````````````````````
 
+A tilde sequence that is too long won't be recognized:
+
+```````````````````````````````` example
+foo ~~~bar~~~
+.
+<p>foo ~~~bar~~~</p>
+````````````````````````````````
+
+Also note the markers cannot open a strike-through span if they are followed
+by whitespace; and similarly, they cannot close the span if they are
+preceded by whitespace:
+
+```````````````````````````````` example
+~foo ~bar
+.
+<p>~foo ~bar</p>
+````````````````````````````````
+
+
 As with regular emphasis delimiters, a new paragraph will cause the cessation
-of parsing a strikethrough:
+of parsing a strike-through:
 
 ```````````````````````````````` example
 This ~~has a