Commit 4d99c4cfc604bb141fd4e1423e934ebd3fb7e2a7

Brock Peabody 2016-11-23T18:32:48

Allow for caching of submodules. Added `git_repository_submodule_cache_all` to initialze a cache of submodules on the repository so that operations looking up N submodules are O(N) and not O(N^2). Added a `git_repository_submodule_cache_clear` function to remove the cache. Also optimized the function that loads all submodules as it was itself O(N^2) w.r.t the number of submodules, having to loop through the `.gitmodules` file once per submodule. I changed it to process the `.gitmodules` file once, into a map. Signed-off-by: David Turner <dturner@twosigma.com>

diff --git a/include/git2/sys/repository.h b/include/git2/sys/repository.h
index 800396c..0c91421 100644
--- a/include/git2/sys/repository.h
+++ b/include/git2/sys/repository.h
@@ -135,6 +135,35 @@ GIT_EXTERN(void) git_repository_set_index(git_repository *repo, git_index *index
  */
 GIT_EXTERN(int) git_repository_set_bare(git_repository *repo);
 
+/**
+ * Load and cache all submodules.
+ *
+ * Because the `.gitmodules` file is unstructured, loading submodules is an
+ * O(N) operation.  Any operation (such as `git_rebase_init`) that requires
+ * accessing all submodules is O(N^2) in the number of submodules, if it
+ * has to look each one up individually.  This function loads all submodules
+ * and caches them so that subsequent calls to `git_submodule_lookup` are O(1).
+ *
+ * @param repo the repository whose submodules will be cached.
+ */
+GIT_EXTERN(int) git_repository_submodule_cache_all(
+	git_repository *repo);
+
+/**
+ * Clear the submodule cache.
+ *
+ * Clear the submodule cache populated by `git_repository_submodule_cache_all`.
+ * If there is no cache, do nothing.
+ *
+ * The cache incorporates data from the repository's configuration, as well
+ * as the state of the working tree, the index, and HEAD.  So any time any
+ * of these has changed, the cache might become invalid.
+ *
+ * @param repo the repository whose submodule cache will be cleared
+ */
+GIT_EXTERN(int) git_repository_submodule_cache_clear(
+	git_repository *repo);
+
 /** @} */
 GIT_END_DECL
 #endif
diff --git a/src/repository.c b/src/repository.c
index 6029919..2185632 100644
--- a/src/repository.c
+++ b/src/repository.c
@@ -27,6 +27,10 @@
 #include "merge.h"
 #include "diff_driver.h"
 #include "annotated_commit.h"
+#include "submodule.h"
+
+GIT__USE_STRMAP
+#include "strmap.h"
 
 #ifdef GIT_WIN32
 # include "win32/w32_util.h"
@@ -109,6 +113,7 @@ void git_repository__cleanup(git_repository *repo)
 {
 	assert(repo);
 
+	git_repository_submodule_cache_clear(repo);
 	git_cache_clear(&repo->objects);
 	git_attr_cache_flush(repo);
 
@@ -2541,3 +2546,31 @@ int git_repository_set_ident(git_repository *repo, const char *name, const char 
 
 	return 0;
 }
+
+int git_repository_submodule_cache_all(git_repository *repo)
+{
+	int error;
+
+	assert(repo);
+
+	if ((error = git_strmap_alloc(&repo->submodule_cache)))
+		return error;
+
+	error = git_submodule__map(repo, repo->submodule_cache);
+	return error;
+}
+
+int git_repository_submodule_cache_clear(git_repository *repo)
+{
+	git_submodule *sm;
+	assert(repo);
+	if (repo->submodule_cache == NULL) {
+		return 0;
+	}
+	git_strmap_foreach_value(repo->submodule_cache, sm, {
+		git_submodule_free(sm);
+	});
+	git_strmap_free(repo->submodule_cache);
+	repo->submodule_cache = 0;
+	return 0;
+}
diff --git a/src/repository.h b/src/repository.h
index b259bea..9d276f3 100644
--- a/src/repository.h
+++ b/src/repository.h
@@ -143,6 +143,7 @@ struct git_repository {
 	git_atomic attr_session_key;
 
 	git_cvar_value cvar_cache[GIT_CVAR_CACHE_MAX];
+	git_strmap *submodule_cache;
 };
 
 GIT_INLINE(git_attr_cache *) git_repository_attr_cache(git_repository *repo)
diff --git a/src/submodule.c b/src/submodule.c
index 46462f1..6996006 100644
--- a/src/submodule.c
+++ b/src/submodule.c
@@ -149,40 +149,53 @@ static int find_by_path(const git_config_entry *entry, void *payload)
 }
 
 /**
- * Find out the name of a submodule from its path
+ * Release the name map returned by 'load_submodule_names'.
  */
-static int name_from_path(git_buf *out, git_config *cfg, const char *path)
+static void free_submodule_names(git_strmap *names)
+{
+	git_buf *name = 0;
+	if (names == NULL)
+		return;
+	git_strmap_foreach_value(names, name, {
+		git__free(name);
+	});
+	git_strmap_free(names);
+	return;
+}
+
+/**
+ * Map submodule paths to names.
+ * TODO: for some use-cases, this might need case-folding on a
+ * case-insensitive filesystem
+ */
+static int load_submodule_names(git_strmap *out, git_config *cfg)
 {
 	const char *key = "submodule\\..*\\.path";
 	git_config_iterator *iter;
 	git_config_entry *entry;
-	int error;
+	git_buf buf = GIT_BUF_INIT;
+	int rval;
+	int error = 0;
 
 	if ((error = git_config_iterator_glob_new(&iter, cfg, key)) < 0)
 		return error;
 
-	while ((error = git_config_next(&entry, iter)) == 0) {
+	while (git_config_next(&entry, iter) == 0) {
 		const char *fdot, *ldot;
-		/* TODO: this should maybe be strcasecmp on a case-insensitive fs */
-		if (strcmp(path, entry->value) != 0)
-			continue;
-
 		fdot = strchr(entry->name, '.');
 		ldot = strrchr(entry->name, '.');
 
-		git_buf_clear(out);
-		git_buf_put(out, fdot + 1, ldot - fdot - 1);
-		goto cleanup;
-	}
-
-	if (error == GIT_ITEROVER) {
-		giterr_set(GITERR_SUBMODULE, "could not find a submodule name for '%s'", path);
-		error = GIT_ENOTFOUND;
+		git_buf_put(&buf, fdot + 1, ldot - fdot - 1);
+		git_strmap_insert(out, entry->value, git_buf_detach(&buf), rval);
+		if (rval < 0) {
+			giterr_set(GITERR_NOMEMORY, "Error inserting submodule into hash table");
+			free_submodule_names(out);
+			return -1;
+		}
 	}
 
-cleanup:
 	git_config_iterator_free(iter);
-	return error;
+	return 0;
 }
 
 int git_submodule_lookup(
@@ -196,6 +209,17 @@ int git_submodule_lookup(
 
 	assert(repo && name);
 
+	if (repo->submodule_cache != NULL) {
+		khiter_t pos = git_strmap_lookup_index(repo->submodule_cache, name);
+		if (git_strmap_valid_index(repo->submodule_cache, pos)) {
+			if (out) {
+				*out = git_strmap_value_at(repo->submodule_cache, pos);
+				GIT_REFCOUNT_INC(*out);
+			}
+			return 0;
+		}
+	}
+
 	if ((error = submodule_alloc(&sm, repo, name)) < 0)
 		return error;
 
@@ -327,20 +351,18 @@ static int submodules_from_index(git_strmap *map, git_index *idx, git_config *cf
 	int error;
 	git_iterator *i;
 	const git_index_entry *entry;
-	git_buf name = GIT_BUF_INIT;
+	git_strmap *names = 0;
+	git_strmap_alloc(&names);
+	if ((error = load_submodule_names(names, cfg)))
+		goto done;
 
 	if ((error = git_iterator_for_index(&i, git_index_owner(idx), idx, NULL)) < 0)
-		return error;
+		goto done;
 
 	while (!(error = git_iterator_advance(&entry, i))) {
 		khiter_t pos = git_strmap_lookup_index(map, entry->path);
 		git_submodule *sm;
 
-		git_buf_clear(&name);
-		if (!name_from_path(&name, cfg, entry->path)) {
-			git_strmap_lookup_index(map, name.ptr);
-		}
-
 		if (git_strmap_valid_index(map, pos)) {
 			sm = git_strmap_value_at(map, pos);
 
@@ -349,7 +371,17 @@ static int submodules_from_index(git_strmap *map, git_index *idx, git_config *cf
 			else
 				sm->flags |= GIT_SUBMODULE_STATUS__INDEX_NOT_SUBMODULE;
 		} else if (S_ISGITLINK(entry->mode)) {
-			if (!submodule_get_or_create(&sm, git_index_owner(idx), map, name.ptr ? name.ptr : entry->path)) {
+			khiter_t name_pos;
+			const char *name;
+
+			name_pos = git_strmap_lookup_index(names, entry->path);
+			if (git_strmap_valid_index(names, name_pos)) {
+				name = git_strmap_value_at(names, name_pos);
+			} else {
+				name = entry->path;
+			}
+
+			if (!submodule_get_or_create(&sm, git_index_owner(idx), map, name)) {
 				submodule_update_from_index_entry(sm, entry);
 				git_submodule_free(sm);
 			}
@@ -359,8 +391,9 @@ static int submodules_from_index(git_strmap *map, git_index *idx, git_config *cf
 	if (error == GIT_ITEROVER)
 		error = 0;
 
-	git_buf_free(&name);
+done:
 	git_iterator_free(i);
+	free_submodule_names(names);
 
 	return error;
 }
@@ -370,20 +403,18 @@ static int submodules_from_head(git_strmap *map, git_tree *head, git_config *cfg
 	int error;
 	git_iterator *i;
 	const git_index_entry *entry;
-	git_buf name = GIT_BUF_INIT;
+	git_strmap *names = 0;
+	git_strmap_alloc(&names);
+	if ((error = load_submodule_names(names, cfg)))
+		goto done;
 
 	if ((error = git_iterator_for_tree(&i, head, NULL)) < 0)
-		return error;
+		goto done;
 
 	while (!(error = git_iterator_advance(&entry, i))) {
 		khiter_t pos = git_strmap_lookup_index(map, entry->path);
 		git_submodule *sm;
 
-		git_buf_clear(&name);
-		if (!name_from_path(&name, cfg, entry->path)) {
-			git_strmap_lookup_index(map, name.ptr);
-		}
-
 		if (git_strmap_valid_index(map, pos)) {
 			sm = git_strmap_value_at(map, pos);
 
@@ -392,7 +423,17 @@ static int submodules_from_head(git_strmap *map, git_tree *head, git_config *cfg
 			else
 				sm->flags |= GIT_SUBMODULE_STATUS__HEAD_NOT_SUBMODULE;
 		} else if (S_ISGITLINK(entry->mode)) {
-			if (!submodule_get_or_create(&sm, git_tree_owner(head), map, name.ptr ? name.ptr : entry->path)) {
+			khiter_t name_pos;
+			const char *name;
+
+			name_pos = git_strmap_lookup_index(names, entry->path);
+			if (git_strmap_valid_index(names, name_pos)) {
+				name = git_strmap_value_at(names, name_pos);
+			} else {
+				name = entry->path;
+			}
+
+			if (!submodule_get_or_create(&sm, git_tree_owner(head), map, name)) {
 				submodule_update_from_head_data(
 					sm, entry->mode, &entry->id);
 				git_submodule_free(sm);
@@ -403,8 +444,9 @@ static int submodules_from_head(git_strmap *map, git_tree *head, git_config *cfg
 	if (error == GIT_ITEROVER)
 		error = 0;
 
-	git_buf_free(&name);
+done:
 	git_iterator_free(i);
+	free_submodule_names(names);
 
 	return error;
 }
@@ -416,7 +458,7 @@ typedef struct {
 	git_repository *repo;
 } lfc_data;
 
-static int all_submodules(git_repository *repo, git_strmap *map)
+int git_submodule__map(git_repository *repo, git_strmap *map)
 {
 	int error = 0;
 	git_index *idx = NULL;
@@ -509,7 +551,7 @@ int git_submodule_foreach(
 	if ((error = git_strmap_alloc(&submodules)) < 0)
 		return error;
 
-	if ((error = all_submodules(repo, submodules)) < 0)
+	if ((error = git_submodule__map(repo, submodules)) < 0)
 		goto done;
 
 	if (!(error = git_vector_init(
@@ -1566,7 +1608,6 @@ int git_submodule_location(unsigned int *location, git_submodule *sm)
 		location, NULL, NULL, NULL, sm, GIT_SUBMODULE_IGNORE_ALL);
 }
 
-
 /*
  * INTERNAL FUNCTIONS
  */
diff --git a/src/submodule.h b/src/submodule.h
index 2ef2031..456a939 100644
--- a/src/submodule.h
+++ b/src/submodule.h
@@ -143,4 +143,7 @@ extern int git_submodule_parse_ignore(
 extern int git_submodule_parse_update(
 	git_submodule_update_t *out, const char *value);
 
+extern int git_submodule__map(
+	git_repository *repo,
+	git_strmap *map);
 #endif
diff --git a/tests/submodule/lookup.c b/tests/submodule/lookup.c
index 148f927..e36fc44 100644
--- a/tests/submodule/lookup.c
+++ b/tests/submodule/lookup.c
@@ -388,3 +388,28 @@ void test_submodule_lookup__renamed(void)
 	cl_git_pass(git_submodule_foreach(g_repo, sm_lookup_cb, &data));
 	cl_assert_equal_i(8, data.count);
 }
+
+void test_submodule_lookup_cached(void) {
+	git_submodule *sm;
+	git_submodule *sm2;
+	/* See that the simple tests still pass. */
+
+	git_repository_submodule_cache_all(g_repo);
+	test_submodule_lookup__simple_lookup();
+	git_repository_submodule_cache_clear(g_repo);
+	test_submodule_lookup__simple_lookup();
+
+	/* Check that subsequent calls return different objects when cached. */
+	git_repository_submodule_cache_all(g_repo);
+	cl_git_pass(git_submodule_lookup(&sm, g_repo, "sm_unchanged"));
+	cl_git_pass(git_submodule_lookup(&sm2, g_repo, "sm_unchanged"));
+	cl_assert_equal_p(sm, sm2);
+	git_submodule_free(sm2);
+
+	/* and that we get new objects again after clearing the cache. */
+	git_repository_submodule_cache_clear(g_repo);
+	cl_git_pass(git_submodule_lookup(&sm2, g_repo, "sm_unchanged"));
+	cl_assert(sm != sm2);
+	git_submodule_free(sm);
+	git_submodule_free(sm2);
+}