Commit e25ea3d1826dddf53ee5df9075cc43b91e3d8bbf

Martin Mitas 2024-01-11T03:34:24

Update list of named entities.

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 82cc2cf..655a016 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -26,6 +26,9 @@ Changes:
 
  * Make Unicode-specific code compliant to Unicode 15.1.
 
+ * Update list of entities known to the HTML renderer from
+   https://html.spec.whatwg.org/entities.json.
+
 New Features:
 
  * Add extension allowing to treat all soft break as hard ones. It has to be
diff --git a/scripts/build_entity_map.py b/scripts/build_entity_map.py
new file mode 100644
index 0000000..a1ee2c9
--- /dev/null
+++ b/scripts/build_entity_map.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import json
+import urllib.request
+
+
+url_str = "https://html.spec.whatwg.org/entities.json"
+
+with urllib.request.urlopen(url_str) as url:
+    entities = json.load(url)
+
+records = []
+
+for name in entities:
+    if name[-1] != ';':
+        continue
+
+    codepoints = entities[name]["codepoints"]
+
+    if len(codepoints) > 2:
+        print('Entity {} needs {} codepoints; may need to update the .c code '
+                'accordingly.'.format(name, len(codepoints)), file=sys.stderr)
+        sys.exit(1)
+
+    while len(codepoints) < 2:
+        codepoints.append(0)
+
+    codepoints_str = map(str, codepoints)
+    records.append("    { \"" + name + "\", { " + ", ".join(codepoints_str) + " } }")
+
+records.sort()
+
+sys.stdout.write("static const ENTITY ENTITY_MAP[] = {\n")
+sys.stdout.write(",\n".join(records))
+sys.stdout.write("\n};\n\n")
diff --git a/src/entity.c b/src/entity.c
index 9991ca1..38588a0 100644
--- a/src/entity.c
+++ b/src/entity.c
@@ -27,8 +27,8 @@
 #include <string.h>
 
 
-/* The table is generated from https://html.spec.whatwg.org/entities.json */
-static const struct entity entity_table[] = {
+/* Generated by scripts/build_entity_map.py. */
+static const ENTITY ENTITY_MAP[] = {
     { "&AElig;", { 198, 0 } },
     { "&AMP;", { 38, 0 } },
     { "&Aacute;", { 193, 0 } },
@@ -1040,17 +1040,14 @@ static const struct entity entity_table[] = {
     { "&fork;", { 8916, 0 } },
     { "&forkv;", { 10969, 0 } },
     { "&fpartint;", { 10765, 0 } },
-    { "&frac12", { 189, 0 } },
     { "&frac12;", { 189, 0 } },
     { "&frac13;", { 8531, 0 } },
-    { "&frac14", { 188, 0 } },
     { "&frac14;", { 188, 0 } },
     { "&frac15;", { 8533, 0 } },
     { "&frac16;", { 8537, 0 } },
     { "&frac18;", { 8539, 0 } },
     { "&frac23;", { 8532, 0 } },
     { "&frac25;", { 8534, 0 } },
-    { "&frac34", { 190, 0 } },
     { "&frac34;", { 190, 0 } },
     { "&frac35;", { 8535, 0 } },
     { "&frac38;", { 8540, 0 } },
@@ -1923,11 +1920,8 @@ static const struct entity entity_table[] = {
     { "&succsim;", { 8831, 0 } },
     { "&sum;", { 8721, 0 } },
     { "&sung;", { 9834, 0 } },
-    { "&sup1", { 185, 0 } },
     { "&sup1;", { 185, 0 } },
-    { "&sup2", { 178, 0 } },
     { "&sup2;", { 178, 0 } },
-    { "&sup3", { 179, 0 } },
     { "&sup3;", { 179, 0 } },
     { "&sup;", { 8835, 0 } },
     { "&supE;", { 10950, 0 } },
@@ -2163,7 +2157,8 @@ static const struct entity entity_table[] = {
 };
 
 
-struct entity_key {
+typedef struct ENTITY_KEY_tag ENTITY_KEY;
+struct ENTITY_KEY_tag {
     const char* name;
     size_t name_size;
 };
@@ -2171,20 +2166,20 @@ struct entity_key {
 static int
 entity_cmp(const void* p_key, const void* p_entity)
 {
-    struct entity_key* key = (struct entity_key*) p_key;
-    struct entity* ent = (struct entity*) p_entity;
+    ENTITY_KEY* key = (ENTITY_KEY*) p_key;
+    ENTITY* ent = (ENTITY*) p_entity;
 
     return strncmp(key->name, ent->name, key->name_size);
 }
 
-const struct entity*
+const ENTITY*
 entity_lookup(const char* name, size_t name_size)
 {
-    struct entity_key key = { name, name_size };
+    ENTITY_KEY key = { name, name_size };
 
     return bsearch(&key,
-                   entity_table,
-                   sizeof(entity_table) / sizeof(entity_table[0]),
-                   sizeof(struct entity),
+                   ENTITY_MAP,
+                   sizeof(ENTITY_MAP) / sizeof(ENTITY_MAP[0]),
+                   sizeof(ENTITY),
                    entity_cmp);
 }
diff --git a/src/entity.h b/src/entity.h
index 36395fe..0e2e254 100644
--- a/src/entity.h
+++ b/src/entity.h
@@ -31,12 +31,13 @@
 
 /* Most entities are formed by single Unicode codepoint, few by two codepoints.
  * Single-codepoint entities have codepoints[1] set to zero. */
-struct entity {
+typedef struct ENTITY_tag ENTITY;
+struct ENTITY_tag {
     const char* name;
     unsigned codepoints[2];
 };
 
-const struct entity* entity_lookup(const char* name, size_t name_size);
+const ENTITY* entity_lookup(const char* name, size_t name_size);
 
 
 #endif  /* MD4C_ENTITY_H */
diff --git a/src/md4c-html.c b/src/md4c-html.c
index f452146..ca799f5 100644
--- a/src/md4c-html.c
+++ b/src/md4c-html.c
@@ -231,7 +231,7 @@ render_entity(MD_HTML* r, const MD_CHAR* text, MD_SIZE size,
         return;
     } else {
         /* Named entity (e.g. "&nbsp;"). */
-        const struct entity* ent;
+        const ENTITY* ent;
 
         ent = entity_lookup(text, size);
         if(ent != NULL) {