Commit bf9a2e98b83c8de2b8664a91167b5529ae1d9d39

Vicent Martí 2011-07-06T10:55:06

Merge pull request #296 from kiryl/index-optimization Index optimization

diff --git a/include/git2/index.h b/include/git2/index.h
index 83a18e1..d34ed0a 100644
--- a/include/git2/index.h
+++ b/include/git2/index.h
@@ -173,6 +173,13 @@ GIT_EXTERN(int) git_index_write(git_index *index);
 GIT_EXTERN(int) git_index_find(git_index *index, const char *path);
 
 /**
+ * Remove all entries with equal path except last added
+ *
+ * @param index an existing index object
+ */
+GIT_EXTERN(void) git_index_uniq(git_index *index);
+
+/**
  * Add or update an index entry from a file in disk
  *
  * The file `path` must be relative to the repository's
diff --git a/src/index.c b/src/index.c
index 06fa95c..dd33db9 100644
--- a/src/index.c
+++ b/src/index.c
@@ -348,6 +348,7 @@ static int index_insert(git_index *index, const git_index_entry *source_entry, i
 	git_index_entry *entry;
 	size_t path_length;
 	int position;
+	git_index_entry **entry_array;
 
 	assert(index && source_entry);
 
@@ -375,27 +376,28 @@ static int index_insert(git_index *index, const git_index_entry *source_entry, i
 	else
 		entry->flags |= GIT_IDXENTRY_NAMEMASK;;
 
+	/*
+	 * replacing is not requested: just insert entry at the end;
+	 * the index is no longer sorted
+	 */
+	if (!replace)
+		return git_vector_insert(&index->entries, entry);
 
 	/* look if an entry with this path already exists */
 	position = git_index_find(index, source_entry->path);
 
-	/* if no entry exists and replace is not set,
-	 * add the entry at the end;
-	 * the index is no longer sorted */
-	if (!replace || position == GIT_ENOTFOUND) {
-		if (git_vector_insert(&index->entries, entry) < GIT_SUCCESS)
-			return GIT_ENOMEM;
+	/*
+	 * if no entry exists add the entry at the end;
+	 * the index is no longer sorted
+	 */
+	if (position == GIT_ENOTFOUND)
+		return git_vector_insert(&index->entries, entry);
 
-	/* if a previous entry exists and replace is set,
-	 * replace it */
-	} else {
-		git_index_entry **entry_array = (git_index_entry **)index->entries.contents;
-
-		free((char *)entry_array[position]->path);
-		free(entry_array[position]);
-
-		entry_array[position] = entry;
-	}
+	/* exists, replace it */
+	entry_array = (git_index_entry **) index->entries.contents;
+	free((char *)entry_array[position]->path);
+	free(entry_array[position]);
+	entry_array[position] = entry;
 
 	return GIT_SUCCESS;
 }
@@ -485,6 +487,11 @@ int git_index_find(git_index *index, const char *path)
 	return git_vector_bsearch2(&index->entries, index_srch, path);
 }
 
+void git_index_uniq(git_index *index)
+{
+	git_vector_uniq(&index->entries);
+}
+
 const git_index_entry_unmerged *git_index_get_unmerged_bypath(git_index *index, const char *path)
 {
 	int pos;
diff --git a/src/util.c b/src/util.c
index 7bfc08b..03e9112 100644
--- a/src/util.c
+++ b/src/util.c
@@ -330,3 +330,67 @@ uint32_t git__hash(const void *key, int len, uint32_t seed)
 	return h1;
 }
 #endif
+
+/*
+ * A merge sort implementation, simplified from the qsort implementation
+ * by Mike Haertel, which is a part of the GNU C Library.
+ */
+static void msort_with_tmp(void *b, size_t n, size_t s,
+		int (*cmp)(const void *, const void *),
+		char *t)
+{
+	char *tmp;
+	char *b1, *b2;
+	size_t n1, n2;
+
+	if (n <= 1)
+		return;
+
+	n1 = n / 2;
+	n2 = n - n1;
+	b1 = b;
+	b2 = (char *)b + (n1 * s);
+
+	msort_with_tmp(b1, n1, s, cmp, t);
+	msort_with_tmp(b2, n2, s, cmp, t);
+
+	tmp = t;
+
+	while (n1 > 0 && n2 > 0) {
+		if (cmp(b1, b2) <= 0) {
+			memcpy(tmp, b1, s);
+			tmp += s;
+			b1 += s;
+			--n1;
+		} else {
+			memcpy(tmp, b2, s);
+			tmp += s;
+			b2 += s;
+			--n2;
+		}
+	}
+	if (n1 > 0)
+		memcpy(tmp, b1, n1 * s);
+	memcpy(b, t, (n - n2) * s);
+}
+
+int git__msort(void *b, size_t n, size_t s,
+		int (*cmp)(const void *, const void *))
+{
+	const size_t size = n * s;
+	char buf[1024];
+
+	if (size < sizeof(buf)) {
+		/* The temporary array fits on the small on-stack buffer. */
+		msort_with_tmp(b, n, s, cmp, buf);
+	} else {
+		/* It's somewhat large, so malloc it.  */
+		char *tmp = git__malloc(size);
+		if (tmp == NULL)
+			return GIT_ENOMEM;
+		msort_with_tmp(b, n, s, cmp, tmp);
+		free(tmp);
+	}
+
+	return GIT_SUCCESS;
+}
diff --git a/src/util.h b/src/util.h
index 0faf7f6..c9ca4de 100644
--- a/src/util.h
+++ b/src/util.h
@@ -118,4 +118,7 @@ extern int git__fnmatch(const char *pattern, const char *name, int flags);
 		} \
 	} while (0)
 
+extern int git__msort(void *b, size_t n, size_t s,
+		int (*cmp)(const void *, const void *));
+
 #endif /* INCLUDE_util_h__ */
diff --git a/src/vector.c b/src/vector.c
index 0451eb0..2fc051f 100644
--- a/src/vector.c
+++ b/src/vector.c
@@ -94,7 +94,7 @@ void git_vector_sort(git_vector *v)
 	if (v->sorted || v->_cmp == NULL)
 		return;
 
-	qsort(v->contents, v->length, sizeof(void *), v->_cmp);
+	git__msort(v->contents, v->length, sizeof(void *), v->_cmp);
 	v->sorted = 1;
 }
 
@@ -162,6 +162,26 @@ int git_vector_remove(git_vector *v, unsigned int idx)
 	return GIT_SUCCESS;
 }
 
+void git_vector_uniq(git_vector *v)
+{
+	git_vector_cmp cmp;
+	unsigned int i, j;
+
+	if (v->length <= 1)
+		return;
+
+	git_vector_sort(v);
+	cmp = v->_cmp ? v->_cmp : strict_comparison;
+
+	for (i = 0, j = 1 ; j < v->length; ++j)
+		if (!cmp(v->contents + i, v->contents + j))
+			v->contents[i] = v->contents[j];
+		else
+			v->contents[++i] = v->contents[j];
+
+	v->length -= j - i - 1;
+}
+
 void git_vector_clear(git_vector *v)
 {
 	assert(v);
diff --git a/src/vector.h b/src/vector.h
index 256452e..76778ba 100644
--- a/src/vector.h
+++ b/src/vector.h
@@ -32,5 +32,5 @@ GIT_INLINE(void *) git_vector_get(git_vector *v, unsigned int position)
 
 int git_vector_insert(git_vector *v, void *element);
 int git_vector_remove(git_vector *v, unsigned int idx);
-
+void git_vector_uniq(git_vector *v);
 #endif
diff --git a/tests/t00-core.c b/tests/t00-core.c
index c01518b..ee1eb6d 100644
--- a/tests/t00-core.c
+++ b/tests/t00-core.c
@@ -73,6 +73,28 @@ BEGIN_TEST(vector1, "don't read past array bounds on remove()")
   git_vector_free(&x);
 END_TEST
 
+static int test_cmp(const void *a, const void *b)
+{
+	int n1 = *(int *)a;
+	int n2 = *(int *)b;
+
+	return n1 - n2;
+}
+
+BEGIN_TEST(vector2, "remove duplicates")
+	git_vector x;
+	must_pass(git_vector_init(&x, 5, test_cmp));
+	must_pass(git_vector_insert(&x, (void *) 0xdeadbeef));
+	must_pass(git_vector_insert(&x, (void *) 0xcafebabe));
+	must_pass(git_vector_insert(&x, (void *) 0xcafebabe));
+	must_pass(git_vector_insert(&x, (void *) 0xdeadbeef));
+	must_pass(git_vector_insert(&x, (void *) 0xcafebabe));
+	must_be_true(x.length == 5);
+	git_vector_uniq(&x);
+	must_be_true(x.length == 2);
+	git_vector_free(&x);
+END_TEST
+
 
 BEGIN_TEST(path0, "get the dirname of a path")
 	char dir[64], *dir2;
@@ -531,6 +553,7 @@ BEGIN_SUITE(core)
 
 	ADD_TEST(vector0);
 	ADD_TEST(vector1);
+	ADD_TEST(vector2);
 
 	ADD_TEST(path0);
 	ADD_TEST(path1);