Commit f2400a9c81c1574f7b917bcef90dde7b222fb9a0

Patrick Steinhardt 2020-07-13T20:56:08

config_entries: Avoid excessive map operations When appending config entries, we currently always first get the currently existing map entry and then afterwards update the map to contain the current config value. In the common scenario where keys aren't being overridden, this is the best we can do. But in case a key gets set multiple times, then we'll also perform these two map operations. In extreme cases, hashing the map keys will thus start to dominate performance. Let's optimize the pattern by using a separately allocated map entry. Currently, we always put the current list entry into the map and update it to get any overridden multivar. As these list entries are also used to iterate config entries, we cannot update them in-place in the map and are thus forced to always set the map to contain the new entry. But with a separately allocated map entry, we can now create one once per config key and insert it into the map. Whenever appending a new config value with the same key, we can now just update the map entry in-place instead of having to replace the map entry completely. This reduces calls to the hashing function by half and trades the improved runtime for one more allocation per unique config key. Given that the refactoring arguably improves code readability by splitting concerns of the `config_entry_list` type and not having to track it in two different structures, this alone would already be reason enough to take the trade. Given a pathological case of a gitconfig with 100.000 repeated keys and a section of length 10.000 characters, this reduces runtime by half from approximately 14 seconds to 7 seconds as expected.

diff --git a/src/config_entries.c b/src/config_entries.c
index 3ddd336..66aae09 100644
--- a/src/config_entries.c
+++ b/src/config_entries.c
@@ -11,9 +11,13 @@ typedef struct config_entry_list {
 	struct config_entry_list *next;
 	struct config_entry_list *last;
 	git_config_entry *entry;
-	bool first;
 } config_entry_list;
 
+typedef struct {
+	git_config_entry *entry;
+	bool multivar;
+} config_entry_map_head;
+
 typedef struct config_entries_iterator {
 	git_config_iterator parent;
 	git_config_entries *entries;
@@ -102,14 +106,16 @@ void git_config_entries_incref(git_config_entries *entries)
 static void config_entries_free(git_config_entries *entries)
 {
 	config_entry_list *list = NULL, *next;
+	config_entry_map_head *head;
 
+	git_strmap_foreach_value(entries->map, head,
+		git__free((char *) head->entry->name); git__free(head)
+	);
 	git_strmap_free(entries->map);
 
 	list = entries->list;
 	while (list != NULL) {
 		next = list->next;
-		if (list->first)
-			git__free((char *) list->entry->name);
 		git__free((char *) list->entry->value);
 		git__free(list->entry);
 		git__free(list);
@@ -127,40 +133,42 @@ void git_config_entries_free(git_config_entries *entries)
 
 int git_config_entries_append(git_config_entries *entries, git_config_entry *entry)
 {
-	config_entry_list *existing, *head;
-
-	head = git__calloc(1, sizeof(config_entry_list));
-	GIT_ERROR_CHECK_ALLOC(head);
-	head->entry = entry;
-
-	/*
-	 * This is a micro-optimization for configuration files
-	 * with a lot of same keys. As for multivars the entry's
-	 * key will be the same for all entries, we can just free
-	 * all except the first entry's name and just re-use it.
-	 */
-	if ((existing = git_strmap_get(entries->map, entry->name)) != NULL) {
+	config_entry_list *list_head;
+	config_entry_map_head *map_head;
+
+	if ((map_head = git_strmap_get(entries->map, entry->name)) != NULL) {
+		map_head->multivar = true;
+		/*
+		 * This is a micro-optimization for configuration files
+		 * with a lot of same keys. As for multivars the entry's
+		 * key will be the same for all entries, we can just free
+		 * all except the first entry's name and just re-use it.
+		 */
 		git__free((char *) entry->name);
-		entry->name = existing->entry->name;
+		entry->name = map_head->entry->name;
 	} else {
-		head->first = 1;
+		map_head = git__calloc(1, sizeof(*map_head));
+		if ((git_strmap_set(entries->map, entry->name, map_head)) < 0)
+			return -1;
 	}
+	map_head->entry = entry;
+
+	list_head = git__calloc(1, sizeof(config_entry_list));
+	GIT_ERROR_CHECK_ALLOC(list_head);
+	list_head->entry = entry;
 
 	if (entries->list)
-		entries->list->last->next = head;
+		entries->list->last->next = list_head;
 	else
-		entries->list = head;
-	entries->list->last = head;
-
-	if (git_strmap_set(entries->map, entry->name, head) < 0)
-		return -1;
+		entries->list = list_head;
+	entries->list->last = list_head;
 
 	return 0;
 }
 
 int git_config_entries_get(git_config_entry **out, git_config_entries *entries, const char *key)
 {
-	config_entry_list *entry;
+	config_entry_map_head *entry;
 	if ((entry = git_strmap_get(entries->map, key)) == NULL)
 		return GIT_ENOTFOUND;
 	*out = entry->entry;
@@ -169,12 +177,12 @@ int git_config_entries_get(git_config_entry **out, git_config_entries *entries, 
 
 int git_config_entries_get_unique(git_config_entry **out, git_config_entries *entries, const char *key)
 {
-	config_entry_list *entry;
+	config_entry_map_head *entry;
 
 	if ((entry = git_strmap_get(entries->map, key)) == NULL)
 		return GIT_ENOTFOUND;
 
-	if (!entry->first) {
+	if (entry->multivar) {
 		git_error_set(GIT_ERROR_CONFIG, "entry is not unique due to being a multivar");
 		return -1;
 	}