Commit aa53f82c29457e31b220fe2fff9c4070491044fb

Martin Mitáš 2024-02-07T11:44:39

Introduce an overall limit on link reference definition instantiations. This prevents time and output size explosion for input patterns such as the one generated by: $ python -c 'N=1000; print("[x]: " + "x" * N + "\n[x]" * N)' We allow the instantiated link reference definitions to blow up the size of the input document roughly 16 times, or up to 1 MB, whichever is smaller. Once the threshold is reached, any following references to link reference definitions are left unresolved and sent to the output as plain text. Fixes #238.

diff --git a/src/md4c.c b/src/md4c.c
index 3038863..4a4e401 100644
--- a/src/md4c.c
+++ b/src/md4c.c
@@ -26,6 +26,7 @@
 #include "md4c.h"
 
 #include <limits.h>
+#include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -143,6 +144,9 @@
 #define SZ      MD_SIZE
 #define OFF     MD_OFFSET
 
+#define SZ_MAX      (sizeof(SZ) == 8 ? UINT64_MAX : UINT32_MAX)
+#define OFF_MAX     (sizeof(OFF) == 8 ? UINT64_MAX : UINT32_MAX)
+
 typedef struct MD_MARK_tag MD_MARK;
 typedef struct MD_BLOCK_tag MD_BLOCK;
 typedef struct MD_CONTAINER_tag MD_CONTAINER;
@@ -180,6 +184,7 @@ struct MD_CTX_tag {
     int alloc_ref_defs;
     void** ref_def_hashtable;
     int ref_def_hashtable_size;
+    SZ max_ref_def_output;
 
     /* Stack of inline/span markers.
      * This is only used for parsing a single block contents but by storing it
@@ -2283,11 +2288,14 @@ md_is_link_reference(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines,
     int is_multiline;
     CHAR* label;
     SZ label_size;
-    int ret;
+    int ret = FALSE;
 
     MD_ASSERT(CH(beg) == _T('[') || CH(beg) == _T('!'));
     MD_ASSERT(CH(end-1) == _T(']'));
 
+    if(ctx->max_ref_def_output == 0)
+        return FALSE;
+
     beg += (CH(beg) == _T('!') ? 2 : 1);
     end--;
 
@@ -2315,7 +2323,17 @@ md_is_link_reference(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines,
     if(is_multiline)
         free(label);
 
-    ret = (def != NULL);
+    if(def != NULL) {
+        /* See https://github.com/mity/md4c/issues/238 */
+        MD_SIZE output_size_estimation = def->label_size + def->title_size + def->dest_end - def->dest_beg;
+        if(output_size_estimation < ctx->max_ref_def_output) {
+            ctx->max_ref_def_output -= output_size_estimation;
+            ret = TRUE;
+        } else {
+            MD_LOG("Too many link reference definition instantiations.");
+            ctx->max_ref_def_output = 0;
+        }
+    }
 
 abort:
     return ret;
@@ -6470,6 +6488,7 @@ md_parse(const MD_CHAR* text, MD_SIZE size, const MD_PARSER* parser, void* userd
     ctx.code_indent_offset = (ctx.parser.flags & MD_FLAG_NOINDENTEDCODEBLOCKS) ? (OFF)(-1) : 4;
     md_build_mark_char_map(&ctx);
     ctx.doc_ends_with_newline = (size > 0  &&  ISNEWLINE_(text[size-1]));
+    ctx.max_ref_def_output = MIN(MIN(16 * (uint64_t)size, (uint64_t)(1024 * 1024)), (uint64_t)SZ_MAX);
 
     /* Reset all mark stacks and lists. */
     for(i = 0; i < (int) SIZEOF_ARRAY(ctx.opener_stacks); i++)
diff --git a/test/pathological-tests.py b/test/pathological-tests.py
index 924cbe9..66b85ad 100644
--- a/test/pathological-tests.py
+++ b/test/pathological-tests.py
@@ -102,7 +102,10 @@ pathological = {
             "--ftables"),
     "many broken links":
             (("]([\n" * 50000),
-            re.compile("<p>(\]\(\[\r?\n){49999}\]\(\[</p>"))
+            re.compile("<p>(\]\(\[\r?\n){49999}\]\(\[</p>")),
+    "many link ref. def. instantiations":
+            (("[x]: " + "x" * 50000 + "\n[x]" * 50000),
+            re.compile(""))
 }
 
 whitespace_re = re.compile('/s+/')