Fix rename detection for tree-to-tree diffs The performance improvements I introduced for rename detection were not able to run successfully for tree-to-tree diffs because the blob size was not known early enough and so the file signature always had to be calculated nonetheless. This change separates loading blobs into memory from calculating the signature. I can't avoid having to load the large blobs into memory, but by moving it forward, I'm able to avoid the signature calculation if the blob won't come into play for renames.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258
diff --git a/src/diff_tform.c b/src/diff_tform.c
index cabcd1f..08b0d5c 100644
--- a/src/diff_tform.c
+++ b/src/diff_tform.c
@@ -408,54 +408,90 @@ GIT_INLINE(git_diff_file *) similarity_get_file(git_diff_list *diff, size_t idx)
return (idx & 1) ? &delta->new_file : &delta->old_file;
}
-static int similarity_calc(
- git_diff_list *diff,
- const git_diff_find_options *opts,
- size_t file_idx,
- void **cache)
+typedef struct {
+ size_t idx;
+ git_iterator_type_t src;
+ git_repository *repo;
+ git_diff_file *file;
+ git_buf data;
+ git_blob *blob;
+ int loaded;
+} similarity_info;
+
+static void similarity_init(
+ similarity_info *info, git_diff_list *diff, size_t file_idx)
+{
+ info->idx = file_idx;
+ info->src = (file_idx & 1) ? diff->new_src : diff->old_src;
+ info->repo = diff->repo;
+ info->file = similarity_get_file(diff, file_idx);
+ info->blob = NULL;
+ info->loaded = 0;
+ git_buf_init(&info->data, 0);
+}
+
+static int similarity_load(similarity_info *info)
{
int error = 0;
- git_diff_file *file = similarity_get_file(diff, file_idx);
- git_iterator_type_t src = (file_idx & 1) ? diff->new_src : diff->old_src;
+ git_diff_file *file = info->file;
- if (src == GIT_ITERATOR_TYPE_WORKDIR) { /* compute hashsig from file */
- git_buf path = GIT_BUF_INIT;
+ if (info->src == GIT_ITERATOR_TYPE_WORKDIR) {
+ error = git_buf_joinpath(
+ &info->data, git_repository_workdir(info->repo), file->path);
- /* TODO: apply wd-to-odb filters to file data if necessary */
+ /* if path is not a regular file, just skip this item */
+ if (!error && !git_path_isfile(info->data.ptr))
+ git_buf_free(&info->data);
+ } else if (git_blob_lookup(&info->blob, info->repo, &file->oid) < 0) {
+ /* if lookup fails, just skip this item in similarity calc */
+ giterr_clear();
+ } else {
+ if (!file->size)
+ file->size = git_blob_rawsize(info->blob);
+ assert(file->size == git_blob_rawsize(info->blob));
- if ((error = git_buf_joinpath(
- &path, git_repository_workdir(diff->repo), file->path)) < 0)
- return error;
+ info->data.size = (size_t)(git__is_sizet(file->size) ? file->size : -1);
+ info->data.ptr = (char *)git_blob_rawcontent(info->blob);
+ }
- /* if path is not a regular file, just skip this item */
- if (git_path_isfile(path.ptr))
- error = opts->metric->file_signature(
- &cache[file_idx], file, path.ptr, opts->metric->payload);
+ info->loaded = 1;
- git_buf_free(&path);
- } else { /* compute hashsig from blob buffer */
- git_blob *blob = NULL;
- git_off_t blobsize;
+ return error;
+}
- /* TODO: add max size threshold a la diff? */
+static void similarity_unload(similarity_info *info)
+{
+ if (info->blob)
+ git_blob_free(info->blob);
+ else
+ git_buf_free(&info->data);
- if (git_blob_lookup(&blob, diff->repo, &file->oid) < 0) {
- /* if lookup fails, just skip this item in similarity calc */
- giterr_clear();
- return 0;
- }
+ info->loaded = 0;
+}
- blobsize = git_blob_rawsize(blob);
- if (!file->size)
- file->size = blobsize;
- if (!git__is_sizet(blobsize)) /* ? what to do ? */
- blobsize = (size_t)-1;
+static int similarity_calc(
+ similarity_info *info,
+ const git_diff_find_options *opts,
+ void **cache)
+{
+ int error = 0;
- error = opts->metric->buffer_signature(
- &cache[file_idx], file, git_blob_rawcontent(blob),
- (size_t)blobsize, opts->metric->payload);
+ if (!info->loaded && (error = similarity_load(info)) < 0)
+ return error;
- git_blob_free(blob);
+ if (!info->data.size)
+ return 0;
+
+ if (info->src == GIT_ITERATOR_TYPE_WORKDIR) {
+ /* TODO: apply wd-to-odb filters to file data if necessary */
+
+ error = opts->metric->file_signature(
+ &cache[info->idx], info->file,
+ info->data.ptr, opts->metric->payload);
+ } else {
+ error = opts->metric->buffer_signature(
+ &cache[info->idx], info->file,
+ info->data.ptr, info->data.size, opts->metric->payload);
}
return error;
@@ -478,6 +514,8 @@ static int similarity_measure(
git_diff_file *a_file = similarity_get_file(diff, a_idx);
git_diff_file *b_file = similarity_get_file(diff, b_idx);
bool exact_match = FLAG_SET(opts, GIT_DIFF_FIND_EXACT_MATCH_ONLY);
+ int error = 0;
+ similarity_info a_info, b_info;
*score = -1;
@@ -512,26 +550,39 @@ static int similarity_measure(
return 0;
}
+ similarity_init(&a_info, diff, a_idx);
+ similarity_init(&b_info, diff, b_idx);
+
+ if (!a_file->size && (error = similarity_load(&a_info)) < 0)
+ goto done;
+ if (!b_file->size && (error = similarity_load(&b_info)) < 0)
+ goto done;
+
/* check if file sizes are nowhere near each other */
if (a_file->size > 127 &&
b_file->size > 127 &&
(a_file->size > (b_file->size << 4) ||
b_file->size > (a_file->size << 4)))
- return 0;
+ goto done;
/* update signature cache if needed */
- if (!cache[a_idx] && similarity_calc(diff, opts, a_idx, cache) < 0)
- return -1;
- if (!cache[b_idx] && similarity_calc(diff, opts, b_idx, cache) < 0)
- return -1;
+ if (!cache[a_idx] && (error = similarity_calc(&a_info, opts, cache)) < 0)
+ goto done;
+ if (!cache[b_idx] && (error = similarity_calc(&b_info, opts, cache)) < 0)
+ goto done;
- /* some metrics may not wish to process this file (too big / too small) */
- if (!cache[a_idx] || !cache[b_idx])
- return 0;
+ /* calculate similarity provided that the metric choose to process
+ * both the a and b files (some may not if file is too big, etc).
+ */
+ if (cache[a_idx] && cache[b_idx])
+ error = opts->metric->similarity(
+ score, cache[a_idx], cache[b_idx], opts->metric->payload);
- /* compare signatures */
- return opts->metric->similarity(
- score, cache[a_idx], cache[b_idx], opts->metric->payload);
+done:
+ similarity_unload(&a_info);
+ similarity_unload(&b_info);
+
+ return error;
}
static int calc_self_similarity(
diff --git a/tests-clar/diff/rename.c b/tests-clar/diff/rename.c
index ed58b7a..79c89e3 100644
--- a/tests-clar/diff/rename.c
+++ b/tests-clar/diff/rename.c
@@ -1126,7 +1126,7 @@ void test_diff_rename__unmodified_can_be_renamed(void)
void test_diff_rename__many_files(void)
{
git_index *index;
- git_tree *tree;
+ git_tree *tree, *new_tree;
git_diff_list *diff = NULL;
diff_expects exp;
git_diff_options diffopts = GIT_DIFF_OPTIONS_INIT;
@@ -1176,6 +1176,52 @@ void test_diff_rename__many_files(void)
cl_assert_equal_i(51, exp.files);
git_diff_list_free(diff);
- git_index_free(index);
+
+ {
+ git_object *parent;
+ git_signature *sig;
+ git_oid tree_id, commit_id;
+ git_reference *ref;
+
+ cl_git_pass(git_index_write_tree(&tree_id, index));
+ cl_git_pass(git_tree_lookup(&new_tree, g_repo, &tree_id));
+
+ cl_git_pass(git_revparse_ext(&parent, &ref, g_repo, "HEAD"));
+ cl_git_pass(git_signature_new(
+ &sig, "Sm Test", "sm@tester.test", 1372350000, 480));
+
+ cl_git_pass(git_commit_create_v(
+ &commit_id, g_repo, git_reference_name(ref), sig, sig,
+ NULL, "yoyoyo", new_tree, 1, parent));
+
+ git_object_free(parent);
+ git_reference_free(ref);
+ git_signature_free(sig);
+ }
+
+ cl_git_pass(git_diff_tree_to_tree(
+ &diff, g_repo, tree, new_tree, &diffopts));
+
+ memset(&exp, 0, sizeof(exp));
+ cl_git_pass(git_diff_foreach(
+ diff, diff_file_cb, NULL, NULL, &exp));
+ cl_assert_equal_i(1, exp.file_status[GIT_DELTA_DELETED]);
+ cl_assert_equal_i(51, exp.file_status[GIT_DELTA_ADDED]);
+ cl_assert_equal_i(52, exp.files);
+
+ opts.flags = GIT_DIFF_FIND_ALL;
+ cl_git_pass(git_diff_find_similar(diff, &opts));
+
+ memset(&exp, 0, sizeof(exp));
+ cl_git_pass(git_diff_foreach(
+ diff, diff_file_cb, NULL, NULL, &exp));
+ cl_assert_equal_i(1, exp.file_status[GIT_DELTA_RENAMED]);
+ cl_assert_equal_i(50, exp.file_status[GIT_DELTA_ADDED]);
+ cl_assert_equal_i(51, exp.files);
+
+ git_diff_list_free(diff);
+
+ git_tree_free(new_tree);
git_tree_free(tree);
+ git_index_free(index);
}