Commit ef85cfc27804f0bf2095d96ac9b85b21b65a09cc

Martin Mitáš 2019-11-04T15:05:07

Simplify parsing of tables (#97) We do so by removing the function md_is_table_row(). md_is_table_row() did some crazy inline parsing to detect whether the line contains at least one pipe which is not inside a code span or other high-priority inline element. This was very complicated under the hood and it was actually breaking the clean design which separates block analysis from inline analysis of each block's contents. We now just use the table underline for determining that the block is a table and its properties, e.g. the column count. This means a paragraph now cannot interrupt a table. This is a change in behavior, but likely an acceptable one, as it actually brings the behavior closer to the behavior of tables in cmark-gfm in this regard. Last but not least, the old approach seems to prevent adoption of other useful features; for more about that, see the discussion in PR #92.

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4c11aa1..a5104e5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,13 @@
 
 Changes:
 
+ * Parsing of tables (with `MD_FLAG_TABLES`) is now closer to the way
+   cmark-gfm parses tables, as we do not require every row of the table to
+   contain a pipe `|` anymore.
+
+   As a consequence, paragraphs now cannot interrupt tables. A paragraph which
+   follows the table has to be delimited with a blank line.
+
  * With `MD_FLAG_LATEXMATHSPANS`, LaTeX math spans (`$...$`) and LaTeX display
    math spans (`$$...$$`) are recognized. (Note though that the HTML renderer
    outputs them verbatim.) Thanks for the feature belong to [Tilman Roeder](
diff --git a/md4c/md4c.c b/md4c/md4c.c
index 2b4bfd5..ecd9656 100644
--- a/md4c/md4c.c
+++ b/md4c/md4c.c
@@ -4251,7 +4251,7 @@ md_process_table_row(MD_CTX* ctx, MD_BLOCKTYPE cell_type, OFF beg, OFF end,
 {
     MD_LINE line;
     OFF* pipe_offs = NULL;
-    int i, j, n;
+    int i, j, k, n;
     int ret = 0;
 
     line.beg = beg;
@@ -4263,32 +4263,32 @@ md_process_table_row(MD_CTX* ctx, MD_BLOCKTYPE cell_type, OFF beg, OFF end,
 
     /* We have to remember the cell boundaries in local buffer because
      * ctx->marks[] shall be reused during cell contents processing. */
-    n = ctx->n_table_cell_boundaries;
+    n = ctx->n_table_cell_boundaries + 2;
     pipe_offs = (OFF*) malloc(n * sizeof(OFF));
     if(pipe_offs == NULL) {
         MD_LOG("malloc() failed.");
         ret = -1;
         goto abort;
     }
-    for(i = TABLECELLBOUNDARIES.head, j = 0; i >= 0; i = ctx->marks[i].next) {
+    j = 0;
+    pipe_offs[j++] = beg;
+    for(i = TABLECELLBOUNDARIES.head; i >= 0; i = ctx->marks[i].next) {
         MD_MARK* mark = &ctx->marks[i];
-        pipe_offs[j++] = mark->beg;
+        pipe_offs[j++] = mark->end;
     }
+    pipe_offs[j++] = end+1;
 
     /* Process cells. */
     MD_ENTER_BLOCK(MD_BLOCK_TR, NULL);
-    j = 0;
-    if(beg < pipe_offs[0]  &&  j < col_count)
-        MD_CHECK(md_process_table_cell(ctx, cell_type, align[j++], beg, pipe_offs[0]));
-    for(i = 0; i < n-1  &&  j < col_count; i++)
-        MD_CHECK(md_process_table_cell(ctx, cell_type, align[j++], pipe_offs[i]+1, pipe_offs[i+1]));
-    if(pipe_offs[n-1] < end-1  &&  j < col_count)
-        MD_CHECK(md_process_table_cell(ctx, cell_type, align[j++], pipe_offs[n-1]+1, end));
+    k = 0;
+    for(i = 0; i < j-1  &&  k < col_count; i++) {
+        if(pipe_offs[i] < pipe_offs[i+1]-1)
+            MD_CHECK(md_process_table_cell(ctx, cell_type, align[k++], pipe_offs[i], pipe_offs[i+1]-1));
+    }
     /* Make sure we call enough table cells even if the current table contains
      * too few of them. */
-    while(j < col_count)
-        MD_CHECK(md_process_table_cell(ctx, cell_type, align[j++], 0, 0));
-
+    while(k < col_count)
+        MD_CHECK(md_process_table_cell(ctx, cell_type, align[k++], 0, 0));
     MD_LEAVE_BLOCK(MD_BLOCK_TR, NULL);
 
 abort:
@@ -4340,38 +4340,6 @@ abort:
     return ret;
 }
 
-static int
-md_is_table_row(MD_CTX* ctx, OFF beg, OFF* p_end)
-{
-    MD_LINE line;
-    int i;
-    int ret = FALSE;
-
-    line.beg = beg;
-    line.end = beg;
-
-    /* Find end of line. */
-    while(line.end < ctx->size  &&  !ISNEWLINE(line.end))
-        line.end++;
-
-    MD_CHECK(md_analyze_inlines(ctx, &line, 1, TRUE));
-
-    if(TABLECELLBOUNDARIES.head >= 0) {
-        if(p_end != NULL)
-            *p_end = line.end;
-        ret = TRUE;
-    }
-
-abort:
-    /* Free any temporary memory blocks stored within some dummy marks. */
-    for(i = PTR_CHAIN.head; i >= 0; i = ctx->marks[i].next)
-        free(md_mark_get_ptr(ctx, i));
-    PTR_CHAIN.head = -1;
-    PTR_CHAIN.tail = -1;
-
-    return ret;
-}
-
 
 /**************************
  ***  Processing Block  ***
@@ -5803,9 +5771,7 @@ md_analyze_line(MD_CTX* ctx, OFF beg, OFF* p_end,
         }
 
         /* Check whether we are table continuation. */
-        if(pivot_line->type == MD_LINE_TABLE  &&  md_is_table_row(ctx, off, &off)  &&
-           n_parents == ctx->n_containers)
-        {
+        if(pivot_line->type == MD_LINE_TABLE  &&  n_parents == ctx->n_containers) {
             line->type = MD_LINE_TABLE;
             break;
         }
@@ -5859,8 +5825,7 @@ md_analyze_line(MD_CTX* ctx, OFF beg, OFF* p_end,
             unsigned col_count;
 
             if(ctx->current_block != NULL  &&  ctx->current_block->n_lines == 1  &&
-                md_is_table_underline(ctx, off, &off, &col_count)  &&
-                md_is_table_row(ctx, pivot_line->beg, NULL))
+                md_is_table_underline(ctx, off, &off, &col_count))
             {
                 line->data = col_count;
                 line->type = MD_LINE_TABLEUNDERLINE;
diff --git a/test/tables.txt b/test/tables.txt
index b883e82..de61f7d 100644
--- a/test/tables.txt
+++ b/test/tables.txt
@@ -85,8 +85,9 @@ quux     | quuz
 </table>
 ````````````````````````````````
 
-However for one-column table, at least one of those has to be used, otherwise
-it would be parsed as a Setext title followed by paragraph.
+However, for a one-column table, at least one pipe has to be used in the
+table header underline, otherwise it would be parsed as a Setext title
+followed by a paragraph.
 
 ```````````````````````````````` example
 Column 1
@@ -141,8 +142,7 @@ Lorem ipsum dolor sit amet.
 | quux     | quuz</p>
 ````````````````````````````````
 
-But paragraph or other block can interrupt a table as a line without any pipe
-ends the table.
+Similarly, a paragraph cannot interrupt a table:
 
 ```````````````````````````````` example
 Column 1 | Column 2
@@ -160,15 +160,15 @@ Lorem ipsum dolor sit amet.
 <tr><td>foo</td><td>bar</td></tr>
 <tr><td>baz</td><td>qux</td></tr>
 <tr><td>quux</td><td>quuz</td></tr>
+<tr><td>Lorem ipsum dolor sit amet.</td><td></td></tr>
 </tbody>
 </table>
-<p>Lorem ipsum dolor sit amet.</p>
 ````````````````````````````````
 
-The ruling line between head and body of the table must include the same amount
-of cells as the line with column names, and each cell has to consist of three
-dash (`-`) characters. However first, last or both dashes may be replaced with
-colon to denote column alignment.
+The underline of the table is crucial for recognition of the table, count of
+its columns and their alignment: The line has to contain at least one pipe,
+and it has to provide at least three dash (`-`) characters for every column in
+the table.
 
 Thus this is not a table because there are too few dashes for Column 2.
 
@@ -186,7 +186,9 @@ Thus this is not a table because there are too few dashes for Column 2.
 | quux     | quuz</p>
 ````````````````````````````````
 
-And this is a table where each of the four column uses different alignment.
+The first, the last or both the first and the last dash in each column
+underline can be replaced with a colon (`:`) to request left, right or middle
+alignment of the respective column:
 
 ```````````````````````````````` example
 | Column 1 | Column 2 | Column 3 | Column 4 |
@@ -263,12 +265,11 @@ quux     | quuz
 <tr><th>Column 1</th><th>Column 2</th></tr>
 </thead>
 <tbody>
+<tr><td><code>foo     | bar</code></td><td></td></tr>
+<tr><td>baz</td><td>qux</td></tr>
+<tr><td>quux</td><td>quuz</td></tr>
 </tbody>
 </table>
-<p><code>foo     | bar</code>
-baz      | qux
-quux     | quuz</p>
-
 ````````````````````````````````