Commit 8bac86aa4301b4287f624b76817b64eedd35aa0b

Tilman Roeder 2019-07-07T09:46:10

Added support for LaTeX math (#87) Addresses #86.

diff --git a/md2html/md2html.c b/md2html/md2html.c
index 0d5b38b..a8e47a5 100644
--- a/md2html/md2html.c
+++ b/md2html/md2html.c
@@ -208,6 +208,7 @@ static const option cmdline_options[] = {
     { "fcollapse-whitespace",        0,  'W', OPTION_ARG_NONE },
     { "ftables",                     0,  'T', OPTION_ARG_NONE },
     { "fstrikethrough",              0,  'S', OPTION_ARG_NONE },
+    { "flatex-math",                 0,  'L', OPTION_ARG_NONE },
     { "ftasklists",                  0,  'X', OPTION_ARG_NONE },
     { 0 }
 };
@@ -256,6 +257,7 @@ usage(void)
         "      --fno-html       Same as --fno-html-blocks --fno-html-spans\n"
         "      --ftables        Enable tables\n"
         "      --fstrikethrough Enable strikethrough spans\n"
+        "      --flatex-math    Enable LaTeX style mathematics spans (e.g. $a+b=c$ becomes <equation>a+b=c</equation>)\n"
         "      --ftasklists     Enable task lists\n"
     );
 }
@@ -304,6 +306,7 @@ cmdline_callback(int opt, char const* value, void* data)
         case 'V':   parser_flags |= MD_FLAG_PERMISSIVEAUTOLINKS; break;
         case 'T':   parser_flags |= MD_FLAG_TABLES; break;
         case 'S':   parser_flags |= MD_FLAG_STRIKETHROUGH; break;
+        case 'L':   parser_flags |= MD_FLAG_LATEXMATHSPANS; break;
         case 'X':   parser_flags |= MD_FLAG_TASKLISTS; break;
 
         default:
diff --git a/md2html/render_html.c b/md2html/render_html.c
index b8046e0..2442697 100644
--- a/md2html/render_html.c
+++ b/md2html/render_html.c
@@ -426,12 +426,14 @@ enter_span_callback(MD_SPANTYPE type, void* detail, void* userdata)
     }
 
     switch(type) {
-        case MD_SPAN_EM:        RENDER_LITERAL(r, "<em>"); break;
-        case MD_SPAN_STRONG:    RENDER_LITERAL(r, "<strong>"); break;
-        case MD_SPAN_A:         render_open_a_span(r, (MD_SPAN_A_DETAIL*) detail); break;
-        case MD_SPAN_IMG:       render_open_img_span(r, (MD_SPAN_IMG_DETAIL*) detail); break;
-        case MD_SPAN_CODE:      RENDER_LITERAL(r, "<code>"); break;
-        case MD_SPAN_DEL:       RENDER_LITERAL(r, "<del>"); break;
+        case MD_SPAN_EM:                RENDER_LITERAL(r, "<em>"); break;
+        case MD_SPAN_STRONG:            RENDER_LITERAL(r, "<strong>"); break;
+        case MD_SPAN_A:                 render_open_a_span(r, (MD_SPAN_A_DETAIL*) detail); break;
+        case MD_SPAN_IMG:               render_open_img_span(r, (MD_SPAN_IMG_DETAIL*) detail); break;
+        case MD_SPAN_CODE:              RENDER_LITERAL(r, "<code>"); break;
+        case MD_SPAN_DEL:               RENDER_LITERAL(r, "<del>"); break;
+        case MD_SPAN_LATEXMATH:         RENDER_LITERAL(r, "<equation>"); break;
+        case MD_SPAN_LATEXMATH_DISPLAY: RENDER_LITERAL(r, "<equation type=\"display\">"); break;
     }
 
     return 0;
@@ -451,12 +453,14 @@ leave_span_callback(MD_SPANTYPE type, void* detail, void* userdata)
     }
 
     switch(type) {
-        case MD_SPAN_EM:        RENDER_LITERAL(r, "</em>"); break;
-        case MD_SPAN_STRONG:    RENDER_LITERAL(r, "</strong>"); break;
-        case MD_SPAN_A:         RENDER_LITERAL(r, "</a>"); break;
-        case MD_SPAN_IMG:       /*noop, handled above*/ break;
-        case MD_SPAN_CODE:      RENDER_LITERAL(r, "</code>"); break;
-        case MD_SPAN_DEL:       RENDER_LITERAL(r, "</del>"); break;
+        case MD_SPAN_EM:                RENDER_LITERAL(r, "</em>"); break;
+        case MD_SPAN_STRONG:            RENDER_LITERAL(r, "</strong>"); break;
+        case MD_SPAN_A:                 RENDER_LITERAL(r, "</a>"); break;
+        case MD_SPAN_IMG:               /*noop, handled above*/ break;
+        case MD_SPAN_CODE:              RENDER_LITERAL(r, "</code>"); break;
+        case MD_SPAN_DEL:               RENDER_LITERAL(r, "</del>"); break;
+        case MD_SPAN_LATEXMATH:         /*fall through*/
+        case MD_SPAN_LATEXMATH_DISPLAY: RENDER_LITERAL(r, "</equation>"); break;
     }
 
     return 0;
diff --git a/md4c/md4c.c b/md4c/md4c.c
index d2b4313..6025659 100644
--- a/md4c/md4c.c
+++ b/md4c/md4c.c
@@ -127,7 +127,7 @@ struct MD_CTX_tag {
 #endif
 
     /* For resolving of inline spans. */
-    MD_MARKCHAIN mark_chains[11];
+    MD_MARKCHAIN mark_chains[12];
 #define PTR_CHAIN                               ctx->mark_chains[0]
 #define TABLECELLBOUNDARIES                     ctx->mark_chains[1]
 #define ASTERISK_OPENERS_extraword_mod3_0       ctx->mark_chains[2]
@@ -139,8 +139,9 @@ struct MD_CTX_tag {
 #define UNDERSCORE_OPENERS                      ctx->mark_chains[8]
 #define TILDE_OPENERS                           ctx->mark_chains[9]
 #define BRACKET_OPENERS                         ctx->mark_chains[10]
+#define DOLLAR_OPENERS                          ctx->mark_chains[11]
 #define OPENERS_CHAIN_FIRST                     2
-#define OPENERS_CHAIN_LAST                      10
+#define OPENERS_CHAIN_LAST                      11
 
     int n_table_cell_boundaries;
 
@@ -1128,7 +1129,7 @@ md_is_html_comment(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF 
     if(off+1 < lines[0].end  &&  CH(off) == _T('-')  &&  CH(off+1) == _T('>'))
         return FALSE;
 
-    /* HTML comment must not contyain "--", so we scan just for "--" instead
+    /* HTML comment must not contain "--", so we scan just for "--" instead
      * of "-->" and verify manually that '>' follows. */
     if(md_scan_for_html_closer(ctx, _T("--"), 2,
                 lines, n_lines, off, max_end, p_end, &ctx->html_comment_horizon))
@@ -2683,6 +2684,9 @@ md_build_mark_char_map(MD_CTX* ctx)
     if(ctx->parser.flags & MD_FLAG_STRIKETHROUGH)
         ctx->mark_char_map['~'] = 1;
 
+    if(ctx->parser.flags & MD_FLAG_LATEXMATHSPANS)
+        ctx->mark_char_map['$'] = 1;
+
     if(ctx->parser.flags & MD_FLAG_PERMISSIVEEMAILAUTOLINKS)
         ctx->mark_char_map['@'] = 1;
 
@@ -3251,6 +3255,21 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode)
                 continue;
             }
 
+            /* A potential equation start/end */
+            if(ch == _T('$')) {
+                /* We can have at most two consecutive $ signs,
+                 * where two dollar signs signify a display equation. */
+                OFF tmp = off+1;
+
+                while(tmp < line_end && CH(tmp) == _T('$'))
+                    tmp++;
+
+                if (tmp - off <= 2)
+                    PUSH_MARK(ch, off, tmp, MD_MARK_POTENTIAL_OPENER | MD_MARK_POTENTIAL_CLOSER);
+                off = tmp;
+                continue;
+            }
+
             /* Turn non-trivial whitespace into single space. */
             if(ISWHITESPACE_(ch)) {
                 OFF tmp = off+1;
@@ -3631,6 +3650,36 @@ md_analyze_tilde(MD_CTX* ctx, int mark_index)
 }
 
 static void
+md_analyze_dollar(MD_CTX* ctx, int mark_index)
+{
+    /* Decide whether the '$' mark at mark_index opens or closes a math span.
+     * This should mimic the way inline equations work in LaTeX: dollars can't
+     * be nested, so DOLLAR_OPENERS is meant to hold at most one pending
+     * opener. Similar in spirit to md_analyze_tilde(), except that an opener
+     * and its closer must have the same length ("$" vs. "$$").
+     *
+     * E.g. in "$abc$$def$$" the intent is that only "$$def$$" forms a span. */
+    if(DOLLAR_OPENERS.head >= 0) {
+        /* An opener is pending; the chain head is the candidate partner. */
+        MD_MARK* open = &ctx->marks[DOLLAR_OPENERS.head];
+        MD_MARK* close = &ctx->marks[mark_index];
+
+        int opener_index = DOLLAR_OPENERS.head;
+        md_rollback(ctx, opener_index, mark_index, MD_ROLLBACK_ALL);
+        if (open->end - open->beg == close->end - close->beg) {
+            /* Same number of '$' on both sides: we are the matching closer. */
+            md_resolve_range(ctx, &DOLLAR_OPENERS, opener_index, mark_index);
+        } else {
+            /* Lengths differ: this mark is appended as the new opener (old one presumably dropped by md_rollback() -- confirm). */
+            md_mark_chain_append(ctx, &DOLLAR_OPENERS, mark_index);
+        }
+    } else {
+        /* No pending opener, so this mark becomes the opener. */
+        md_mark_chain_append(ctx, &DOLLAR_OPENERS, mark_index);
+    }
+}
+
+static void
 md_analyze_permissive_url_autolink(MD_CTX* ctx, int mark_index)
 {
     MD_MARK* opener = &ctx->marks[mark_index];
@@ -3785,6 +3834,7 @@ md_analyze_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
             case '_':   /* Pass through. */
             case '*':   md_analyze_emph(ctx, i); break;
             case '~':   md_analyze_tilde(ctx, i); break;
+            case '$':   md_analyze_dollar(ctx, i); break;
             case '.':   /* Pass through. */
             case ':':   md_analyze_permissive_url_autolink(ctx, i); break;
             case '@':   md_analyze_permissive_email_autolink(ctx, i); break;
@@ -3841,7 +3891,7 @@ static void
 md_analyze_link_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
                          int mark_beg, int mark_end)
 {
-    md_analyze_marks(ctx, lines, n_lines, mark_beg, mark_end, _T("*_~@:."));
+    md_analyze_marks(ctx, lines, n_lines, mark_beg, mark_end, _T("*_~$@:."));
     ASTERISK_OPENERS_extraword_mod3_0.head = -1;
     ASTERISK_OPENERS_extraword_mod3_0.tail = -1;
     ASTERISK_OPENERS_extraword_mod3_1.head = -1;
@@ -3858,6 +3908,8 @@ md_analyze_link_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
     UNDERSCORE_OPENERS.tail = -1;
     TILDE_OPENERS.head = -1;
     TILDE_OPENERS.tail = -1;
+    DOLLAR_OPENERS.head = -1;
+    DOLLAR_OPENERS.tail = -1;
 }
 
 static int
@@ -3974,6 +4026,16 @@ md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
                         MD_LEAVE_SPAN(MD_SPAN_DEL, NULL);
                     break;
 
+                case '$':
+                    if(mark->flags & MD_MARK_OPENER) {
+                        MD_ENTER_SPAN((mark->end - off) % 2 ? MD_SPAN_LATEXMATH : MD_SPAN_LATEXMATH_DISPLAY, NULL);
+                        text_type = MD_TEXT_LATEXMATH;
+                    } else {
+                        MD_LEAVE_SPAN((mark->end - off) % 2 ? MD_SPAN_LATEXMATH : MD_SPAN_LATEXMATH_DISPLAY, NULL);
+                        text_type = MD_TEXT_NORMAL;
+                    }
+                    break;
+
                 case '[':       /* Link, image. */
                 case '!':
                 case ']':
@@ -4072,12 +4134,17 @@ md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
             if(off >= end)
                 break;
 
-            if(text_type == MD_TEXT_CODE) {
+            if(text_type == MD_TEXT_CODE || text_type == MD_TEXT_LATEXMATH) {
                 OFF tmp;
 
                 MD_ASSERT(prev_mark != NULL);
-                MD_ASSERT(prev_mark->ch == '`'  &&  (prev_mark->flags & MD_MARK_OPENER));
-                MD_ASSERT(mark->ch == '`'  &&  (mark->flags & MD_MARK_CLOSER));
+                if (text_type == MD_TEXT_CODE) {
+                    MD_ASSERT(prev_mark->ch == '`'  &&  (prev_mark->flags & MD_MARK_OPENER));
+                    MD_ASSERT(mark->ch == '`'  &&  (mark->flags & MD_MARK_CLOSER));
+                } else if (text_type == MD_TEXT_LATEXMATH) {
+                    MD_ASSERT(prev_mark->ch == '$'  &&  (prev_mark->flags & MD_MARK_OPENER));
+                    MD_ASSERT(mark->ch == '$'  &&  (mark->flags & MD_MARK_CLOSER));
+                }
 
                 /* Inside a code span, trailing line whitespace has to be
                  * outputted. */
@@ -4085,11 +4152,11 @@ md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
                 while(off < ctx->size  &&  ISBLANK(off))
                     off++;
                 if(off > tmp)
-                    MD_TEXT(MD_TEXT_CODE, STR(tmp), off-tmp);
+                    MD_TEXT(text_type, STR(tmp), off-tmp);
 
                 /* and new lines are transformed into single spaces. */
                 if(prev_mark->end < off  &&  off < mark->beg)
-                    MD_TEXT(MD_TEXT_CODE, _T(" "), 1);
+                    MD_TEXT(text_type, _T(" "), 1);
 
             } else if(text_type == MD_TEXT_HTML) {
                 /* Inside raw HTML, we output the new line verbatim, including
diff --git a/md4c/md4c.h b/md4c/md4c.h
index dcdadad..6d9fce5 100644
--- a/md4c/md4c.h
+++ b/md4c/md4c.h
@@ -129,7 +129,13 @@ typedef enum MD_SPANTYPE {
     /* <del>...</del>
      * Note: Recognized only when MD_FLAG_STRIKETHROUGH is enabled.
      */
-    MD_SPAN_DEL
+    MD_SPAN_DEL,
+
+    /* For recognizing inline ($) and display ($$) equations
+     * Note: Recognized only when MD_FLAG_LATEXMATHSPANS is enabled.
+     */
+    MD_SPAN_LATEXMATH,
+    MD_SPAN_LATEXMATH_DISPLAY
 } MD_SPANTYPE;
 
 /* Text is the actual textual contents of span. */
@@ -168,7 +174,11 @@ typedef enum MD_TEXTTYPE {
     /* Text is a raw HTML. If it is contents of a raw HTML block (i.e. not
      * an inline raw HTML), then MD_TEXT_BR and MD_TEXT_SOFTBR are not used.
      * The text contains verbatim '\n' for the new lines. */
-    MD_TEXT_HTML
+    MD_TEXT_HTML,
+
+    /* Text is inside an equation. This is processed the same way as inlined code
+     * spans (`code`). */
+    MD_TEXT_LATEXMATH
 } MD_TEXTTYPE;
 
 
@@ -275,6 +285,7 @@ typedef struct MD_SPAN_IMG_DETAIL {
 #define MD_FLAG_STRIKETHROUGH               0x0200  /* Enable strikethrough extension. */
 #define MD_FLAG_PERMISSIVEWWWAUTOLINKS      0x0400  /* Enable WWW autolinks (even without any scheme prefix, if they begin with 'www.') */
 #define MD_FLAG_TASKLISTS                   0x0800  /* Enable task list extension. */
+#define MD_FLAG_LATEXMATHSPANS              0x1000  /* Enable $ and $$ containing LaTeX equations. */
 
 #define MD_FLAG_PERMISSIVEAUTOLINKS         (MD_FLAG_PERMISSIVEEMAILAUTOLINKS | MD_FLAG_PERMISSIVEURLAUTOLINKS | MD_FLAG_PERMISSIVEWWWAUTOLINKS)
 #define MD_FLAG_NOHTML                      (MD_FLAG_NOHTMLBLOCKS | MD_FLAG_NOHTMLSPANS)
diff --git a/scripts/run-tests.sh b/scripts/run-tests.sh
index 7f5324f..b8e335b 100755
--- a/scripts/run-tests.sh
+++ b/scripts/run-tests.sh
@@ -59,5 +59,9 @@ echo "Task lists extension:"
 $PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/tasklists.txt" -p "$PROGRAM --ftasklists"
 
 echo
+echo "LaTeX extension:"
+$PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/latex.txt" -p "$PROGRAM --flatex-math"
+
+echo
 echo "Pathological input:"
 $PYTHON "$TEST_DIR/pathological_tests.py" -p "$PROGRAM"
diff --git a/test/latex.txt b/test/latex.txt
new file mode 100644
index 0000000..ed2b947
--- /dev/null
+++ b/test/latex.txt
@@ -0,0 +1,31 @@
+
+# LaTeX
+
+With the flag `MD_FLAG_LATEXMATHSPANS`, MD4C enables an extension for the
+recognition of LaTeX-style equation spans.
+
+An equation is any text wrapped in dollar signs ($ or $$).
+
+```````````````````````````````` example
+$a+b=c$ Hello, world!
+.
+<p><equation>a+b=c</equation> Hello, world!</p>
+````````````````````````````````
+
+If a double dollar sign is used, the equation is a display equation.
+
+```````````````````````````````` example
+This is a display equation: $$\int_a^b x dx$$.
+.
+<p>This is a display equation: <equation type="display">\int_a^b x dx</equation>.</p>
+````````````````````````````````
+
+Equations may span multiple lines.
+
+```````````````````````````````` example
+$$
+\int_a^b f(x) dx
+$$
+.
+<p><equation type="display">\int_a^b f(x) dx </equation></p>
+````````````````````````````````