md_build_ref_def_hashtable: Minor cleanup. * Add some comments. * Make complex hastable buckets start at 2 records instead of 4. Some experiments show there may be about 20% collisions in documents with too many reference links but the buckets rarely need more then the 2 records so it is just a memory wasting.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
diff --git a/md4c/md4c.c b/md4c/md4c.c
index bfa1b9a..f5a094e 100644
--- a/md4c/md4c.c
+++ b/md4c/md4c.c
@@ -1625,7 +1625,7 @@ md_ref_def_cmp(const void* a, const void* b)
}
static int
-md_ref_def_cmp_stable(const void* a, const void* b)
+md_ref_def_cmp_for_sort(const void* a, const void* b)
{
int cmp;
@@ -1678,6 +1678,7 @@ md_build_ref_def_hashtable(MD_CTX* ctx)
bucket = ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size];
if(bucket == NULL) {
+ /* The bucket is empty. Make it just point to the def. */
ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = def;
continue;
}
@@ -1689,12 +1690,12 @@ md_build_ref_def_hashtable(MD_CTX* ctx)
MD_REF_DEF* old_def = (MD_REF_DEF*) bucket;
if(md_link_label_cmp(def->label, def->label_size, old_def->label, old_def->label_size) == 0) {
- /* Ignore this ref. def. */
+ /* Duplicate label: Ignore this ref. def. */
continue;
}
- /* Make the bucket capable of holding more ref. defs. */
- list = (MD_REF_DEF_LIST*) malloc(sizeof(MD_REF_DEF_LIST) + 4 * sizeof(MD_REF_DEF*));
+ /* Make the bucket complex, i.e. able to hold more ref. defs. */
+ list = (MD_REF_DEF_LIST*) malloc(sizeof(MD_REF_DEF_LIST) + 2 * sizeof(MD_REF_DEF*));
if(list == NULL) {
MD_LOG("malloc() failed.");
goto abort;
@@ -1702,12 +1703,17 @@ md_build_ref_def_hashtable(MD_CTX* ctx)
list->ref_defs[0] = old_def;
list->ref_defs[1] = def;
list->n_ref_defs = 2;
- list->alloc_ref_defs = 4;
+ list->alloc_ref_defs = 2;
ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = list;
continue;
}
- /* Append the def to the bucket list. */
+ /* Append the def to the complex bucket list.
+ *
+ * Note in this case we ignore potential duplicates to avoid expensive
+ * iterating over the complex bucket. Below, we revisit all the complex
+ * buckets and handle it more cheaply after the complex bucket contents
+ * is sorted. */
list = (MD_REF_DEF_LIST*) bucket;
if(list->n_ref_defs >= list->alloc_ref_defs) {
MD_REF_DEF_LIST* list_tmp = (MD_REF_DEF_LIST*) realloc(list,
@@ -1736,9 +1742,12 @@ md_build_ref_def_hashtable(MD_CTX* ctx)
continue;
list = (MD_REF_DEF_LIST*) bucket;
- qsort(list->ref_defs, list->n_ref_defs, sizeof(MD_REF_DEF*), md_ref_def_cmp_stable);
+ qsort(list->ref_defs, list->n_ref_defs, sizeof(MD_REF_DEF*), md_ref_def_cmp_for_sort);
- /* Disable duplicates. */
+ /* Disable all duplicates in the complex bucket by forcing all such
+ * records to point to the 1st such ref. def. I.e. no matter which
+ * record is found during the lookup, it will always point to the right
+ * ref. def. in ctx->ref_defs[]. */
for(j = 1; j < list->n_ref_defs; j++) {
if(md_ref_def_cmp(&list->ref_defs[j-1], &list->ref_defs[j]) == 0)
list->ref_defs[j] = list->ref_defs[j-1];