Commit a7c60cfc6d3af29669ef5cba9df325719b9f30cf

Shawn O. Pearce 2009-01-03T02:41:26

Add basic support to read pack-*.idx v1 and v2 files The index data is mapped into memory and then scanned using a binary search algorithm to locate the matching entry for the supplied git_oid. The standard fanout hash trick is applied to reduce the search space by 8 iterations. Since the v1 and v2 file formats differ in their search function, due to the different layouts used for the object records, we use two different search implementations and a virtual function pointer to jump to the correct version of code for the current pack index. The single function jump per-pack should be faster then computing a branch point inside the inner loop of a common binary search. To improve concurrency during read operations the pack lock is only held while verifying the index is actually open, or while opening the index for the first time. This permits multiple concurrent readers to scan through the same index. If an invalid index file is opened we close it and mark the git_pack's invalid bit to true. The git_pack structure is kept around in its parent git_packlist, but the invalid bit will cause all future readers to skip over the pack entirely. Pruning the invalid entries is relatively unimportant because they shouldn't be very common, a $GIT_DIRECTORY/objects/pack directory tends to only have valid pack files. Signed-off-by: Shawn O. Pearce <spearce@spearce.org>

diff --git a/src/odb.c b/src/odb.c
index 412ec4e..158e2b7 100644
--- a/src/odb.c
+++ b/src/odb.c
@@ -28,15 +28,44 @@
 #include "git/zlib.h"
 #include "fileops.h"
 #include "hash.h"
+#include <arpa/inet.h>
 #include <stdio.h>
+#include "odb.h"
 
 #define GIT_PACK_NAME_MAX (5 + 40 + 1)
-typedef struct {
+struct git_pack {
+	git_odb *db;
 	git_lck lock;
 
+	/** Functions to access idx_map. */
+	int (*idx_search)(
+		off_t *,
+		struct git_pack *,
+		const git_oid *);
+
+	/** The .idx file, mapped into memory. */
+	git_file idx_fd;
+	gitfo_map idx_map;
+	uint32_t *im_crc;
+	uint32_t *im_offset32;
+	uint32_t *im_offset64;
+
+	/** Number of objects in this pack. */
+	uint32_t obj_cnt;
+
+	/** Number of git_packlist we appear in. */
 	unsigned int refcnt;
+
+	/** Number of active users of the idx_map data. */
+	unsigned int idxcnt;
+	unsigned
+		invalid:1 /* the pack is unable to be read by libgit2 */
+		;
+
+	/** Name of the pack file(s), without extension ("pack-abc"). */
 	char pack_name[GIT_PACK_NAME_MAX];
-} git_pack;
+};
+typedef struct git_pack git_pack;
 
 typedef struct {
 	size_t n_packs;
@@ -77,6 +106,17 @@ static struct {
 	{ "REF_DELTA", 0 }   /* 7 = GIT_OBJ_REF_DELTA */
 };
 
+GIT_INLINE(uint32_t) decode32(void *b)
+{
+	return ntohl(*((uint32_t*)b));
+}
+
+GIT_INLINE(uint64_t) decode64(void *b)
+{
+	uint32_t *p = b;
+	return (((uint64_t)ntohl(p[0])) << 32) | ntohl(p[1]);
+}
+
 const char *git_obj_type_to_string(git_otype type)
 {
 	if (type < 0 || type >= ARRAY_SIZE(obj_type_table))
@@ -455,6 +495,160 @@ static int open_alternates(git_odb *db)
 	return 0;
 }
 
+static int pack_openidx_map(git_pack *p)
+{
+	char pb[GIT_PATH_MAX];
+	off_t len;
+
+	if (git__fmt(pb, sizeof(pb), "%s/pack/%s.idx",
+			p->db->objects_dir,
+			p->pack_name) < 0)
+		return GIT_ERROR;
+
+	if ((p->idx_fd = gitfo_open(pb, O_RDONLY)) < 0)
+		return GIT_ERROR;
+
+	if ((len = gitfo_size(p->idx_fd)) < 0
+		|| !git__is_sizet(len)
+		|| gitfo_map_ro(&p->idx_map, p->idx_fd, 0, (size_t)len)) {
+		gitfo_close(p->idx_fd);
+		return GIT_ERROR;
+	}
+
+	return GIT_SUCCESS;
+}
+
+static int idxv1_search(off_t *out, git_pack *p, const git_oid *id)
+{
+	unsigned char *data = p->idx_map.data;
+	size_t lo = id->id[0] ? decode32(data + ((id->id[0]-1) << 2)) : 0;
+	size_t hi = decode32(data + (id->id[0] << 2));
+
+	data += 1024;
+	do {
+		size_t mid = (lo + hi) >> 1;
+		size_t pos = 24 * mid;
+		int cmp = memcmp(id->id, data + pos + 4, 20);
+		if (cmp < 0)
+			hi = mid;
+		else if (!cmp) {
+			*out = decode32(data + pos);
+			return GIT_SUCCESS;
+		} else
+			lo = mid + 1;
+	} while (lo < hi);
+	return GIT_ENOTFOUND;
+}
+
+static int pack_openidx_v1(git_pack *p)
+{
+	uint32_t *fanout = p->idx_map.data;
+	size_t expsz;
+	int j;
+
+	for (j = 1; j < 256; j++) {
+		if (decode32(&fanout[j]) < decode32(&fanout[j - 1]))
+			return GIT_ERROR;
+	}
+	p->obj_cnt = decode32(&fanout[255]);
+	expsz = 4 * 256 + 24 * p->obj_cnt + 2 * 20;
+	if (expsz != p->idx_map.len)
+		return GIT_ERROR;
+
+	p->idx_search = idxv1_search;
+	return GIT_SUCCESS;
+}
+
+static int idxv2_search(off_t *out, git_pack *p, const git_oid *id)
+{
+	unsigned char *data = ((unsigned char*)p->idx_map.data) + 8;
+	size_t lo = id->id[0] ? decode32(data + ((id->id[0]-1) << 2)) : 0;
+	size_t hi = decode32(data + (id->id[0] << 2));
+
+	data += 1024;
+	do {
+		size_t mid = (lo + hi) >> 1;
+		size_t pos = 20 * mid;
+		int cmp = memcmp(id->id, data + pos, 20);
+		if (cmp < 0)
+			hi = mid;
+		else if (!cmp) {
+			uint32_t o32 = decode32(p->im_offset32 + mid);
+			if (o32 & 0x80000000)
+				*out = decode64(p->im_offset64 + 2*(o32 & ~0x80000000));
+			else
+				*out = o32;
+			return GIT_SUCCESS;
+		} else
+			lo = mid + 1;
+	} while (lo < hi);
+	return GIT_ENOTFOUND;
+}
+
+static int pack_openidx_v2(git_pack *p)
+{
+	void *data = ((unsigned char*)p->idx_map.data) + 8;
+	uint32_t *fanout = data;
+	int j;
+
+	for (j = 1; j < 256; j++) {
+		if (decode32(&fanout[j]) < decode32(&fanout[j - 1]))
+			return GIT_ERROR;
+	}
+	p->obj_cnt = decode32(&fanout[255]);
+	p->idx_search = idxv2_search;
+	p->im_crc = (uint32_t*)(data + 1024 + (20 * p->obj_cnt));
+	p->im_offset32 = p->im_crc + p->obj_cnt;
+	p->im_offset64 = p->im_offset32 + p->obj_cnt;
+	return GIT_SUCCESS;
+}
+
+static int pack_openidx(git_pack *p)
+{
+	gitlck_lock(&p->lock);
+	if (p->invalid)
+		goto unlock_fail;
+	if (++p->idxcnt == 1 && !p->idx_search) {
+		uint32_t *data;
+
+		if (pack_openidx_map(p))
+			goto invalid_fail;
+		data = p->idx_map.data;
+
+		if (decode32(&data[0]) == PACK_TOC) {
+			switch (decode32(&data[1])) {
+			case 2:
+				if (pack_openidx_v2(p))
+					goto unmap_fail;
+				break;
+			default:
+				goto unmap_fail;
+			}
+		} else if (pack_openidx_v1(p))
+			goto unmap_fail;
+	}
+	gitlck_unlock(&p->lock);
+	return GIT_SUCCESS;
+
+unmap_fail:
+	gitfo_free_map(&p->idx_map);
+
+invalid_fail:
+	p->invalid = 1;
+	p->idxcnt--;
+
+unlock_fail:
+	gitlck_unlock(&p->lock);
+	return GIT_ERROR;
+}
+
+static void pack_decidx(git_pack *p)
+{
+	gitlck_lock(&p->lock);
+	p->idxcnt--;
+	gitlck_unlock(&p->lock);
+}
+
 static void pack_dec(git_pack *p)
 {
 	int need_free;
@@ -464,6 +658,11 @@ static void pack_dec(git_pack *p)
 	gitlck_unlock(&p->lock);
 
 	if (need_free) {
+		if (p->idx_search) {
+			gitfo_free_map(&p->idx_map);
+			gitfo_close(p->idx_fd);
+		}
+
 		gitlck_free(&p->lock);
 		free(p);
 	}
@@ -552,6 +751,7 @@ static git_packlist* scan_packs(git_odb *db)
 
 	for (cnt = 0, c = state; c; ) {
 		struct scanned_pack *n = c->next;
+		c->pack->db = db;
 		new_list->packs[cnt++] = c->pack;
 		free(c);
 		c = n;
@@ -676,7 +876,20 @@ int git_odb__read_loose(git_obj *out, git_odb *db, const git_oid *id)
 
 static int read_packed(git_obj *out, git_pack *p, const git_oid *id)
 {
-	return GIT_ENOTFOUND;
+	off_t pos;
+	int res;
+
+	if (pack_openidx(p))
+		return GIT_ERROR;
+	res = p->idx_search(&pos, p, id);
+	pack_decidx(p);
+
+	if (!res) {
+		/* TODO unpack object at pos */
+		res = GIT_ERROR;
+	}
+
+	return res;
 }
 
 int git_odb__read_packed(git_obj *out, git_odb *db, const git_oid *id)
diff --git a/src/odb.h b/src/odb.h
new file mode 100644
index 0000000..2f205b2
--- /dev/null
+++ b/src/odb.h
@@ -0,0 +1,19 @@
+#ifndef INCLUDE_odb_h__
+#define INCLUDE_odb_h__
+
+/** First 4 bytes of a pack-*.idx file header.
+ *
+ * Note this header exists only in idx v2 and later.  The idx v1
+ * file format does not have a magic sequence at the front, and
+ * must be detected by the first four bytes *not* being this value
+ * and the first 8 bytes matching the following expression:
+ *
+ *   uint32_t *fanout = ... the file data at offset 0 ...
+ *   ntohl(fanout[0]) < ntohl(fanout[1])
+ *
+ * The value chosen here for PACK_TOC is such that the above
+ * cannot be true for an idx v1 file.
+ */
+#define PACK_TOC 0xff744f63 /* -1tOc */
+
+#endif
diff --git a/src/util.h b/src/util.h
index 018fce5..8f0c4b5 100644
--- a/src/util.h
+++ b/src/util.h
@@ -31,6 +31,13 @@ extern int git__fmt(char *, size_t, const char *, ...)
 extern int git__prefixcmp(const char *str, const char *prefix);
 extern int git__suffixcmp(const char *str, const char *suffix);
 
+/** @return true if p fits into the range of a size_t */
+GIT_INLINE(int) git__is_sizet(off_t p)
+{
+	size_t r = (size_t)p;
+	return p == r;
+}
+
 /*
  * Realloc the buffer pointed at by variable 'x' so that it can hold
  * at least 'nr' entries; the number of entries currently allocated