Update list of named entities.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 82cc2cf..655a016 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -26,6 +26,9 @@ Changes:
* Make Unicode-specific code compliant with Unicode 15.1.
+ * Update list of entities known to the HTML renderer from
+ https://html.spec.whatwg.org/entities.json.
+
New Features:
* Add an extension that allows treating all soft breaks as hard ones. It has to be
diff --git a/scripts/build_entity_map.py b/scripts/build_entity_map.py
new file mode 100644
index 0000000..a1ee2c9
--- /dev/null
+++ b/scripts/build_entity_map.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import json
+import urllib.request
+
+
+url_str = "https://html.spec.whatwg.org/entities.json"
+
+with urllib.request.urlopen(url_str) as url:
+ entities = json.load(url)
+
+records = []
+
+for name in entities:
+ if name[-1] != ';':
+ continue
+
+ codepoints = entities[name]["codepoints"]
+
+ if len(codepoints) > 2:
+ print('Entity {} needs {} codepoints; may need to update the .c code '
+ 'accordingly.'.format(name, len(codepoints)), file=sys.stderr)
+ sys.exit(1)
+
+ while len(codepoints) < 2:
+ codepoints.append(0)
+
+ codepoints_str = map(str, codepoints)
+ records.append(" { \"" + name + "\", { " + ", ".join(codepoints_str) + " } }")
+
+records.sort()
+
+sys.stdout.write("static const ENTITY ENTITY_MAP[] = {\n")
+sys.stdout.write(",\n".join(records))
+sys.stdout.write("\n};\n\n")
diff --git a/src/entity.c b/src/entity.c
index 9991ca1..38588a0 100644
--- a/src/entity.c
+++ b/src/entity.c
@@ -27,8 +27,8 @@
#include <string.h>
-/* The table is generated from https://html.spec.whatwg.org/entities.json */
-static const struct entity entity_table[] = {
+/* Generated by scripts/build_entity_map.py. */
+static const ENTITY ENTITY_MAP[] = {
{ "&AElig;", { 198, 0 } },
{ "&AMP;", { 38, 0 } },
{ "&Aacute;", { 193, 0 } },
@@ -1040,17 +1040,14 @@ static const struct entity entity_table[] = {
{ "&fork;", { 8916, 0 } },
{ "&forkv;", { 10969, 0 } },
{ "&fpartint;", { 10765, 0 } },
- { "&frac12", { 189, 0 } },
{ "&frac12;", { 189, 0 } },
{ "&frac13;", { 8531, 0 } },
- { "&frac14", { 188, 0 } },
{ "&frac14;", { 188, 0 } },
{ "&frac15;", { 8533, 0 } },
{ "&frac16;", { 8537, 0 } },
{ "&frac18;", { 8539, 0 } },
{ "&frac23;", { 8532, 0 } },
{ "&frac25;", { 8534, 0 } },
- { "&frac34", { 190, 0 } },
{ "&frac34;", { 190, 0 } },
{ "&frac35;", { 8535, 0 } },
{ "&frac38;", { 8540, 0 } },
@@ -1923,11 +1920,8 @@ static const struct entity entity_table[] = {
{ "&succsim;", { 8831, 0 } },
{ "&sum;", { 8721, 0 } },
{ "&sung;", { 9834, 0 } },
- { "&sup1", { 185, 0 } },
{ "&sup1;", { 185, 0 } },
- { "&sup2", { 178, 0 } },
{ "&sup2;", { 178, 0 } },
- { "&sup3", { 179, 0 } },
{ "&sup3;", { 179, 0 } },
{ "&sup;", { 8835, 0 } },
{ "&supE;", { 10950, 0 } },
@@ -2163,7 +2157,8 @@ static const struct entity entity_table[] = {
};
-struct entity_key {
+typedef struct ENTITY_KEY_tag ENTITY_KEY;
+struct ENTITY_KEY_tag {
const char* name;
size_t name_size;
};
@@ -2171,20 +2166,20 @@ struct entity_key {
static int
entity_cmp(const void* p_key, const void* p_entity)
{
- struct entity_key* key = (struct entity_key*) p_key;
- struct entity* ent = (struct entity*) p_entity;
+ ENTITY_KEY* key = (ENTITY_KEY*) p_key;
+ ENTITY* ent = (ENTITY*) p_entity;
return strncmp(key->name, ent->name, key->name_size);
}
-const struct entity*
+const ENTITY*
entity_lookup(const char* name, size_t name_size)
{
- struct entity_key key = { name, name_size };
+ ENTITY_KEY key = { name, name_size };
return bsearch(&key,
- entity_table,
- sizeof(entity_table) / sizeof(entity_table[0]),
- sizeof(struct entity),
+ ENTITY_MAP,
+ sizeof(ENTITY_MAP) / sizeof(ENTITY_MAP[0]),
+ sizeof(ENTITY),
entity_cmp);
}
diff --git a/src/entity.h b/src/entity.h
index 36395fe..0e2e254 100644
--- a/src/entity.h
+++ b/src/entity.h
@@ -31,12 +31,13 @@
/* Most entities are formed by single Unicode codepoint, few by two codepoints.
* Single-codepoint entities have codepoints[1] set to zero. */
-struct entity {
+typedef struct ENTITY_tag ENTITY;
+struct ENTITY_tag {
const char* name;
unsigned codepoints[2];
};
-const struct entity* entity_lookup(const char* name, size_t name_size);
+const ENTITY* entity_lookup(const char* name, size_t name_size);
#endif /* MD4C_ENTITY_H */
diff --git a/src/md4c-html.c b/src/md4c-html.c
index f452146..ca799f5 100644
--- a/src/md4c-html.c
+++ b/src/md4c-html.c
@@ -231,7 +231,7 @@ render_entity(MD_HTML* r, const MD_CHAR* text, MD_SIZE size,
return;
} else {
/* Named entity (e.g. "&nbsp;"). */
- const struct entity* ent;
+ const ENTITY* ent;
ent = entity_lookup(text, size);
if(ent != NULL) {