Add basic support to read pack-*.idx v1 and v2 files The index data is mapped into memory and then scanned using a binary search algorithm to locate the matching entry for the supplied git_oid. The standard fanout hash trick is applied to reduce the search space by 8 iterations. Since the v1 and v2 file formats differ in their search function, due to the different layouts used for the object records, we use two different search implementations and a virtual function pointer to jump to the correct version of code for the current pack index. The single function jump per-pack should be faster then computing a branch point inside the inner loop of a common binary search. To improve concurrency during read operations the pack lock is only held while verifying the index is actually open, or while opening the index for the first time. This permits multiple concurrent readers to scan through the same index. If an invalid index file is opened we close it and mark the git_pack's invalid bit to true. The git_pack structure is kept around in its parent git_packlist, but the invalid bit will cause all future readers to skip over the pack entirely. Pruning the invalid entries is relatively unimportant because they shouldn't be very common, a $GIT_DIRECTORY/objects/pack directory tends to only have valid pack files. Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315
diff --git a/src/odb.c b/src/odb.c
index 412ec4e..158e2b7 100644
--- a/src/odb.c
+++ b/src/odb.c
@@ -28,15 +28,44 @@
#include "git/zlib.h"
#include "fileops.h"
#include "hash.h"
+#include <arpa/inet.h>
#include <stdio.h>
+#include "odb.h"
#define GIT_PACK_NAME_MAX (5 + 40 + 1)
-typedef struct {
+struct git_pack {
+ git_odb *db;
git_lck lock;
+ /** Functions to access idx_map. */
+ int (*idx_search)(
+ off_t *,
+ struct git_pack *,
+ const git_oid *);
+
+ /** The .idx file, mapped into memory. */
+ git_file idx_fd;
+ gitfo_map idx_map;
+ uint32_t *im_crc;
+ uint32_t *im_offset32;
+ uint32_t *im_offset64;
+
+ /** Number of objects in this pack. */
+ uint32_t obj_cnt;
+
+ /** Number of git_packlist we appear in. */
unsigned int refcnt;
+
+ /** Number of active users of the idx_map data. */
+ unsigned int idxcnt;
+ unsigned
+ invalid:1 /* the pack is unable to be read by libgit2 */
+ ;
+
+ /** Name of the pack file(s), without extension ("pack-abc"). */
char pack_name[GIT_PACK_NAME_MAX];
-} git_pack;
+};
+typedef struct git_pack git_pack;
typedef struct {
size_t n_packs;
@@ -77,6 +106,17 @@ static struct {
{ "REF_DELTA", 0 } /* 7 = GIT_OBJ_REF_DELTA */
};
+GIT_INLINE(uint32_t) decode32(void *b)
+{
+ return ntohl(*((uint32_t*)b));
+}
+
+GIT_INLINE(uint64_t) decode64(void *b)
+{
+ uint32_t *p = b;
+ return (((uint64_t)ntohl(p[0])) << 32) | ntohl(p[1]);
+}
+
const char *git_obj_type_to_string(git_otype type)
{
if (type < 0 || type >= ARRAY_SIZE(obj_type_table))
@@ -455,6 +495,160 @@ static int open_alternates(git_odb *db)
return 0;
}
+static int pack_openidx_map(git_pack *p)
+{
+ char pb[GIT_PATH_MAX];
+ off_t len;
+
+ if (git__fmt(pb, sizeof(pb), "%s/pack/%s.idx",
+ p->db->objects_dir,
+ p->pack_name) < 0)
+ return GIT_ERROR;
+
+ if ((p->idx_fd = gitfo_open(pb, O_RDONLY)) < 0)
+ return GIT_ERROR;
+
+ if ((len = gitfo_size(p->idx_fd)) < 0
+ || !git__is_sizet(len)
+ || gitfo_map_ro(&p->idx_map, p->idx_fd, 0, (size_t)len)) {
+ gitfo_close(p->idx_fd);
+ return GIT_ERROR;
+ }
+
+ return GIT_SUCCESS;
+}
+
+static int idxv1_search(off_t *out, git_pack *p, const git_oid *id)
+{
+ unsigned char *data = p->idx_map.data;
+ size_t lo = id->id[0] ? decode32(data + ((id->id[0]-1) << 2)) : 0;
+ size_t hi = decode32(data + (id->id[0] << 2));
+
+ data += 1024;
+ do {
+ size_t mid = (lo + hi) >> 1;
+ size_t pos = 24 * mid;
+ int cmp = memcmp(id->id, data + pos + 4, 20);
+ if (cmp < 0)
+ hi = mid;
+ else if (!cmp) {
+ *out = decode32(data + pos);
+ return GIT_SUCCESS;
+ } else
+ lo = mid + 1;
+ } while (lo < hi);
+ return GIT_ENOTFOUND;
+}
+
+static int pack_openidx_v1(git_pack *p)
+{
+ uint32_t *fanout = p->idx_map.data;
+ size_t expsz;
+ int j;
+
+ for (j = 1; j < 256; j++) {
+ if (decode32(&fanout[j]) < decode32(&fanout[j - 1]))
+ return GIT_ERROR;
+ }
+ p->obj_cnt = decode32(&fanout[255]);
+ expsz = 4 * 256 + 24 * p->obj_cnt + 2 * 20;
+ if (expsz != p->idx_map.len)
+ return GIT_ERROR;
+
+ p->idx_search = idxv1_search;
+ return GIT_SUCCESS;
+}
+
+static int idxv2_search(off_t *out, git_pack *p, const git_oid *id)
+{
+ unsigned char *data = ((unsigned char*)p->idx_map.data) + 8;
+ size_t lo = id->id[0] ? decode32(data + ((id->id[0]-1) << 2)) : 0;
+ size_t hi = decode32(data + (id->id[0] << 2));
+
+ data += 1024;
+ do {
+ size_t mid = (lo + hi) >> 1;
+ size_t pos = 20 * mid;
+ int cmp = memcmp(id->id, data + pos, 20);
+ if (cmp < 0)
+ hi = mid;
+ else if (!cmp) {
+ uint32_t o32 = decode32(p->im_offset32 + mid);
+ if (o32 & 0x80000000)
+ *out = decode64(p->im_offset64 + 2*(o32 & ~0x80000000));
+ else
+ *out = o32;
+ return GIT_SUCCESS;
+ } else
+ lo = mid + 1;
+ } while (lo < hi);
+ return GIT_ENOTFOUND;
+}
+
+static int pack_openidx_v2(git_pack *p)
+{
+ void *data = ((unsigned char*)p->idx_map.data) + 8;
+ uint32_t *fanout = data;
+ int j;
+
+ for (j = 1; j < 256; j++) {
+ if (decode32(&fanout[j]) < decode32(&fanout[j - 1]))
+ return GIT_ERROR;
+ }
+ p->obj_cnt = decode32(&fanout[255]);
+ p->idx_search = idxv2_search;
+ p->im_crc = (uint32_t*)(data + 1024 + (20 * p->obj_cnt));
+ p->im_offset32 = p->im_crc + p->obj_cnt;
+ p->im_offset64 = p->im_offset32 + p->obj_cnt;
+ return GIT_SUCCESS;
+}
+
+static int pack_openidx(git_pack *p)
+{
+ gitlck_lock(&p->lock);
+ if (p->invalid)
+ goto unlock_fail;
+ if (++p->idxcnt == 1 && !p->idx_search) {
+ uint32_t *data;
+
+ if (pack_openidx_map(p))
+ goto invalid_fail;
+ data = p->idx_map.data;
+
+ if (decode32(&data[0]) == PACK_TOC) {
+ switch (decode32(&data[1])) {
+ case 2:
+ if (pack_openidx_v2(p))
+ goto unmap_fail;
+ break;
+ default:
+ goto unmap_fail;
+ }
+ } else if (pack_openidx_v1(p))
+ goto unmap_fail;
+ }
+ gitlck_unlock(&p->lock);
+ return GIT_SUCCESS;
+
+unmap_fail:
+ gitfo_free_map(&p->idx_map);
+
+invalid_fail:
+ p->invalid = 1;
+ p->idxcnt--;
+
+unlock_fail:
+ gitlck_unlock(&p->lock);
+ return GIT_ERROR;
+}
+
+static void pack_decidx(git_pack *p)
+{
+ gitlck_lock(&p->lock);
+ p->idxcnt--;
+ gitlck_unlock(&p->lock);
+}
+
static void pack_dec(git_pack *p)
{
int need_free;
@@ -464,6 +658,11 @@ static void pack_dec(git_pack *p)
gitlck_unlock(&p->lock);
if (need_free) {
+ if (p->idx_search) {
+ gitfo_free_map(&p->idx_map);
+ gitfo_close(p->idx_fd);
+ }
+
gitlck_free(&p->lock);
free(p);
}
@@ -552,6 +751,7 @@ static git_packlist* scan_packs(git_odb *db)
for (cnt = 0, c = state; c; ) {
struct scanned_pack *n = c->next;
+ c->pack->db = db;
new_list->packs[cnt++] = c->pack;
free(c);
c = n;
@@ -676,7 +876,20 @@ int git_odb__read_loose(git_obj *out, git_odb *db, const git_oid *id)
static int read_packed(git_obj *out, git_pack *p, const git_oid *id)
{
- return GIT_ENOTFOUND;
+ off_t pos;
+ int res;
+
+ if (pack_openidx(p))
+ return GIT_ERROR;
+ res = p->idx_search(&pos, p, id);
+ pack_decidx(p);
+
+ if (!res) {
+ /* TODO unpack object at pos */
+ res = GIT_ERROR;
+ }
+
+ return res;
}
int git_odb__read_packed(git_obj *out, git_odb *db, const git_oid *id)
diff --git a/src/odb.h b/src/odb.h
new file mode 100644
index 0000000..2f205b2
--- /dev/null
+++ b/src/odb.h
@@ -0,0 +1,19 @@
+#ifndef INCLUDE_odb_h__
+#define INCLUDE_odb_h__
+
+/** First 4 bytes of a pack-*.idx file header.
+ *
+ * Note this header exists only in idx v2 and later. The idx v1
+ * file format does not have a magic sequence at the front, and
+ * must be detected by the first four bytes *not* being this value
+ * and the first 8 bytes matching the following expression:
+ *
+ * uint32_t *fanout = ... the file data at offset 0 ...
+ * ntohl(fanout[0]) < ntohl(fanout[1])
+ *
+ * The value chosen here for PACK_TOC is such that the above
+ * cannot be true for an idx v1 file.
+ */
+#define PACK_TOC 0xff744f63 /* -1tOc */
+
+#endif
diff --git a/src/util.h b/src/util.h
index 018fce5..8f0c4b5 100644
--- a/src/util.h
+++ b/src/util.h
@@ -31,6 +31,13 @@ extern int git__fmt(char *, size_t, const char *, ...)
extern int git__prefixcmp(const char *str, const char *prefix);
extern int git__suffixcmp(const char *str, const char *suffix);
+/** @return true if p fits into the range of a size_t */
+GIT_INLINE(int) git__is_sizet(off_t p)
+{
+ size_t r = (size_t)p;
+ return p == r;
+}
+
/*
* Realloc the buffer pointed at by variable 'x' so that it can hold
* at least 'nr' entries; the number of entries currently allocated