indexer: use a hashtable for keeping track of offsets These offsets are needed for REF_DELTA objects, which encode which object they use as a base, but not where it lies in the packfile, so we need a list. These objects are mostly from older packfiles, before OFS_DELTA was widely spread. The time spent in indexing these packfiles is greatly reduced, though remains above what git is able to do.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188
diff --git a/src/indexer.c b/src/indexer.c
index 1600d1c..c7e142b 100644
--- a/src/indexer.c
+++ b/src/indexer.c
@@ -17,6 +17,7 @@
#include "posix.h"
#include "pack.h"
#include "filebuf.h"
+#include "oidmap.h"
#define UINT31_MAX (0x7FFFFFFF)
@@ -122,14 +123,6 @@ static int objects_cmp(const void *a, const void *b)
return git_oid_cmp(&entrya->oid, &entryb->oid);
}
-static int cache_cmp(const void *a, const void *b)
-{
- const struct git_pack_entry *ea = a;
- const struct git_pack_entry *eb = b;
-
- return git_oid_cmp(&ea->sha1, &eb->sha1);
-}
-
int git_indexer_stream_new(
git_indexer_stream **out,
const char *prefix,
@@ -271,7 +264,8 @@ static int crc_object(uint32_t *crc_out, git_mwindow_file *mwf, git_off_t start,
static int store_object(git_indexer_stream *idx)
{
- int i;
+ int i, error;
+ khiter_t k;
git_oid oid;
struct entry *entry;
git_off_t entry_size;
@@ -296,11 +290,15 @@ static int store_object(git_indexer_stream *idx)
git_oid_cpy(&pentry->sha1, &oid);
pentry->offset = entry_start;
- if (git_vector_insert(&idx->pack->cache, pentry) < 0) {
+
+ k = kh_put(oid, idx->pack->idx_cache, &pentry->sha1, &error);
+ if (!error) {
git__free(pentry);
goto on_error;
}
+ kh_value(idx->pack->idx_cache, k) = pentry;
+
git_oid_cpy(&entry->oid, &oid);
if (crc_object(&entry->crc, &idx->pack->mwf, entry_start, entry_size) < 0)
@@ -324,7 +322,8 @@ on_error:
static int hash_and_save(git_indexer_stream *idx, git_rawobj *obj, git_off_t entry_start)
{
- int i;
+ int i, error;
+ khiter_t k;
git_oid oid;
size_t entry_size;
struct entry *entry;
@@ -351,11 +350,14 @@ static int hash_and_save(git_indexer_stream *idx, git_rawobj *obj, git_off_t ent
git_oid_cpy(&pentry->sha1, &oid);
pentry->offset = entry_start;
- if (git_vector_insert(&idx->pack->cache, pentry) < 0) {
+ k = kh_put(oid, idx->pack->idx_cache, &pentry->sha1, &error);
+ if (!error) {
git__free(pentry);
goto on_error;
}
+ kh_value(idx->pack->idx_cache, k) = pentry;
+
git_oid_cpy(&entry->oid, &oid);
entry->crc = crc32(0L, Z_NULL, 0);
@@ -426,8 +428,8 @@ int git_indexer_stream_add(git_indexer_stream *idx, const void *data, size_t siz
/* for now, limit to 2^32 objects */
assert(idx->nr_objects == (size_t)((unsigned int)idx->nr_objects));
- if (git_vector_init(&idx->pack->cache, (unsigned int)idx->nr_objects, cache_cmp) < 0)
- return -1;
+ idx->pack->idx_cache = git_oidmap_alloc();
+ GITERR_CHECK_ALLOC(idx->pack->idx_cache);
idx->pack->has_cache = 1;
if (git_vector_init(&idx->objects, (unsigned int)idx->nr_objects, objects_cmp) < 0)
@@ -718,9 +720,9 @@ on_error:
void git_indexer_stream_free(git_indexer_stream *idx)
{
+ khiter_t k;
unsigned int i;
struct entry *e;
- struct git_pack_entry *pe;
struct delta_info *delta;
if (idx == NULL)
@@ -729,11 +731,16 @@ void git_indexer_stream_free(git_indexer_stream *idx)
git_vector_foreach(&idx->objects, i, e)
git__free(e);
git_vector_free(&idx->objects);
+
if (idx->pack) {
- git_vector_foreach(&idx->pack->cache, i, pe)
- git__free(pe);
- git_vector_free(&idx->pack->cache);
+ for (k = kh_begin(idx->pack->idx_cache); k != kh_end(idx->pack->idx_cache); k++) {
+ if (kh_exist(idx->pack->idx_cache, k))
+ git__free(kh_value(idx->pack->idx_cache, k));
+ }
+
+ git_oidmap_free(idx->pack->idx_cache);
}
+
git_vector_foreach(&idx->deltas, i, delta)
git__free(delta);
git_vector_free(&idx->deltas);
diff --git a/src/pack-objects.c b/src/pack-objects.c
index e4b6719..459201f 100644
--- a/src/pack-objects.c
+++ b/src/pack-objects.c
@@ -21,8 +21,6 @@
#include "git2/indexer.h"
#include "git2/config.h"
-GIT__USE_OIDMAP;
-
struct unpacked {
git_pobject *object;
void *data;
diff --git a/src/pack.c b/src/pack.c
index f36f3cf..75ac981 100644
--- a/src/pack.c
+++ b/src/pack.c
@@ -760,13 +760,14 @@ git_off_t get_delta_base(
} else if (type == GIT_OBJ_REF_DELTA) {
/* If we have the cooperative cache, search in it first */
if (p->has_cache) {
- size_t pos;
- struct git_pack_entry key;
+ khiter_t k;
+ git_oid oid;
- git_oid_fromraw(&key.sha1, base_info);
- if (!git_vector_bsearch(&pos, &p->cache, &key)) {
+ git_oid_fromraw(&oid, base_info);
+ k = kh_get(oid, p->idx_cache, &oid);
+ if (k != kh_end(p->idx_cache)) {
*curpos += 20;
- return ((struct git_pack_entry *)git_vector_get(&p->cache, pos))->offset;
+ return ((struct git_pack_entry *)kh_value(p->idx_cache, k))->offset;
}
}
/* The base entry _must_ be in the same pack */
diff --git a/src/pack.h b/src/pack.h
index 6c43d8f..8d7e33d 100644
--- a/src/pack.h
+++ b/src/pack.h
@@ -16,6 +16,7 @@
#include "map.h"
#include "mwindow.h"
#include "odb.h"
+#include "oidmap.h"
#define GIT_PACK_FILE_MODE 0444
@@ -62,6 +63,7 @@ typedef struct git_pack_cache_entry {
#include "offmap.h"
GIT__USE_OFFMAP;
+GIT__USE_OIDMAP;
#define GIT_PACK_CACHE_MEMORY_LIMIT 16 * 1024 * 1024
#define GIT_PACK_CACHE_SIZE_LIMIT 1024 * 1024 /* don't bother caching anything over 1MB */
@@ -86,7 +88,7 @@ struct git_pack_file {
git_time_t mtime;
unsigned pack_local:1, pack_keep:1, has_cache:1;
git_oid sha1;
- git_vector cache;
+ git_oidmap *idx_cache;
git_oid **oids;
git_pack_cache bases; /* delta base cache */