Commit 0e040c031edc6b61692e74a9b8ce0b0ff86d270a

Carlos Martín Nieto 2013-03-03T14:50:47

indexer: use a hashtable for keeping track of offsets These offsets are needed for REF_DELTA objects, which encode which object they use as a base, but not where it lies in the packfile, so we need a list. These objects are mostly from older packfiles, before OFS_DELTA was widely spread. The time spent in indexing these packfiles is greatly reduced, though remains above what git is able to do.

diff --git a/src/indexer.c b/src/indexer.c
index 1600d1c..c7e142b 100644
--- a/src/indexer.c
+++ b/src/indexer.c
@@ -17,6 +17,7 @@
 #include "posix.h"
 #include "pack.h"
 #include "filebuf.h"
+#include "oidmap.h"
 
 #define UINT31_MAX (0x7FFFFFFF)
 
@@ -122,14 +123,6 @@ static int objects_cmp(const void *a, const void *b)
 	return git_oid_cmp(&entrya->oid, &entryb->oid);
 }
 
-static int cache_cmp(const void *a, const void *b)
-{
-	const struct git_pack_entry *ea = a;
-	const struct git_pack_entry *eb = b;
-
-	return git_oid_cmp(&ea->sha1, &eb->sha1);
-}
-
 int git_indexer_stream_new(
 		git_indexer_stream **out,
 		const char *prefix,
@@ -271,7 +264,8 @@ static int crc_object(uint32_t *crc_out, git_mwindow_file *mwf, git_off_t start,
 
 static int store_object(git_indexer_stream *idx)
 {
-	int i;
+	int i, error;
+	khiter_t k;
 	git_oid oid;
 	struct entry *entry;
 	git_off_t entry_size;
@@ -296,11 +290,15 @@ static int store_object(git_indexer_stream *idx)
 
 	git_oid_cpy(&pentry->sha1, &oid);
 	pentry->offset = entry_start;
-	if (git_vector_insert(&idx->pack->cache, pentry) < 0) {
+
+	k = kh_put(oid, idx->pack->idx_cache, &pentry->sha1, &error);
+	if (!error) {
 		git__free(pentry);
 		goto on_error;
 	}
 
+	kh_value(idx->pack->idx_cache, k) = pentry;
+
 	git_oid_cpy(&entry->oid, &oid);
 
 	if (crc_object(&entry->crc, &idx->pack->mwf, entry_start, entry_size) < 0)
@@ -324,7 +322,8 @@ on_error:
 
 static int hash_and_save(git_indexer_stream *idx, git_rawobj *obj, git_off_t entry_start)
 {
-	int i;
+	int i, error;
+	khiter_t k;
 	git_oid oid;
 	size_t entry_size;
 	struct entry *entry;
@@ -351,11 +350,14 @@ static int hash_and_save(git_indexer_stream *idx, git_rawobj *obj, git_off_t ent
 
 	git_oid_cpy(&pentry->sha1, &oid);
 	pentry->offset = entry_start;
-	if (git_vector_insert(&idx->pack->cache, pentry) < 0) {
+	k = kh_put(oid, idx->pack->idx_cache, &pentry->sha1, &error);
+	if (!error) {
 		git__free(pentry);
 		goto on_error;
 	}
 
+	kh_value(idx->pack->idx_cache, k) = pentry;
+
 	git_oid_cpy(&entry->oid, &oid);
 	entry->crc = crc32(0L, Z_NULL, 0);
 
@@ -426,8 +428,8 @@ int git_indexer_stream_add(git_indexer_stream *idx, const void *data, size_t siz
 		/* for now, limit to 2^32 objects */
 		assert(idx->nr_objects == (size_t)((unsigned int)idx->nr_objects));
 
-		if (git_vector_init(&idx->pack->cache, (unsigned int)idx->nr_objects, cache_cmp) < 0)
-			return -1;
+		idx->pack->idx_cache = git_oidmap_alloc();
+		GITERR_CHECK_ALLOC(idx->pack->idx_cache);
 
 		idx->pack->has_cache = 1;
 		if (git_vector_init(&idx->objects, (unsigned int)idx->nr_objects, objects_cmp) < 0)
@@ -718,9 +720,9 @@ on_error:
 
 void git_indexer_stream_free(git_indexer_stream *idx)
 {
+	khiter_t k;
 	unsigned int i;
 	struct entry *e;
-	struct git_pack_entry *pe;
 	struct delta_info *delta;
 
 	if (idx == NULL)
@@ -729,11 +731,16 @@ void git_indexer_stream_free(git_indexer_stream *idx)
 	git_vector_foreach(&idx->objects, i, e)
 		git__free(e);
 	git_vector_free(&idx->objects);
+
 	if (idx->pack) {
-		git_vector_foreach(&idx->pack->cache, i, pe)
-			git__free(pe);
-		git_vector_free(&idx->pack->cache);
+		for (k = kh_begin(idx->pack->idx_cache); k != kh_end(idx->pack->idx_cache); k++) {
+			if (kh_exist(idx->pack->idx_cache, k))
+				git__free(kh_value(idx->pack->idx_cache, k));
+		}
+
+		git_oidmap_free(idx->pack->idx_cache);
 	}
+
 	git_vector_foreach(&idx->deltas, i, delta)
 		git__free(delta);
 	git_vector_free(&idx->deltas);
diff --git a/src/pack-objects.c b/src/pack-objects.c
index e4b6719..459201f 100644
--- a/src/pack-objects.c
+++ b/src/pack-objects.c
@@ -21,8 +21,6 @@
 #include "git2/indexer.h"
 #include "git2/config.h"
 
-GIT__USE_OIDMAP;
-
 struct unpacked {
 	git_pobject *object;
 	void *data;
diff --git a/src/pack.c b/src/pack.c
index f36f3cf..75ac981 100644
--- a/src/pack.c
+++ b/src/pack.c
@@ -760,13 +760,14 @@ git_off_t get_delta_base(
 	} else if (type == GIT_OBJ_REF_DELTA) {
 		/* If we have the cooperative cache, search in it first */
 		if (p->has_cache) {
-			size_t pos;
-			struct git_pack_entry key;
+			khiter_t k;
+			git_oid oid;
 
-			git_oid_fromraw(&key.sha1, base_info);
-			if (!git_vector_bsearch(&pos, &p->cache, &key)) {
+			git_oid_fromraw(&oid, base_info);
+			k = kh_get(oid, p->idx_cache, &oid);
+			if (k != kh_end(p->idx_cache)) {
 				*curpos += 20;
-				return ((struct git_pack_entry *)git_vector_get(&p->cache, pos))->offset;
+				return ((struct git_pack_entry *)kh_value(p->idx_cache, k))->offset;
 			}
 		}
 		/* The base entry _must_ be in the same pack */
diff --git a/src/pack.h b/src/pack.h
index 6c43d8f..8d7e33d 100644
--- a/src/pack.h
+++ b/src/pack.h
@@ -16,6 +16,7 @@
 #include "map.h"
 #include "mwindow.h"
 #include "odb.h"
+#include "oidmap.h"
 
 #define GIT_PACK_FILE_MODE 0444
 
@@ -62,6 +63,7 @@ typedef struct git_pack_cache_entry {
 #include "offmap.h"
 
 GIT__USE_OFFMAP;
+GIT__USE_OIDMAP;
 
 #define GIT_PACK_CACHE_MEMORY_LIMIT 16 * 1024 * 1024
 #define GIT_PACK_CACHE_SIZE_LIMIT 1024 * 1024 /* don't bother caching anything over 1MB */
@@ -86,7 +88,7 @@ struct git_pack_file {
 	git_time_t mtime;
 	unsigned pack_local:1, pack_keep:1, has_cache:1;
 	git_oid sha1;
-	git_vector cache;
+	git_oidmap *idx_cache;
 	git_oid **oids;
 
 	git_pack_cache bases; /* delta base cache */