Merge pull request #5477 from pks-t/pks/rename-detection-negative-caches merge: cache negative cache results for similarity metrics
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157
diff --git a/src/merge.c b/src/merge.c
index 05a776e..afe69e5 100644
--- a/src/merge.c
+++ b/src/merge.c
@@ -68,6 +68,16 @@ struct merge_diff_df_data {
git_merge_diff *prev_conflict;
};
+/*
+ * This acts as a negative cache entry marker. In case we've tried to calculate
+ * similarity metrics for a given blob already but `git_hashsig` determined
+ * that it's too small in order to have a meaningful hash signature, we will
+ * insert the address of this marker instead of `NULL`. Like this, we can
+ * easily check whether we have checked a gien entry already and skip doing the
+ * calculation again and again.
+ */
+static int cache_invalid_marker;
+
/* Merge base computation */
int merge_bases_many(git_commit_list **out, git_revwalk **walk_out, git_repository *repo, size_t length, const git_oid input_array[])
@@ -1027,6 +1037,9 @@ static int index_entry_similarity_calc(
git_object_size_t blobsize;
int error;
+ if (*out || *out == &cache_invalid_marker)
+ return 0;
+
*out = NULL;
if ((error = git_blob_lookup(&blob, repo, &entry->id)) < 0)
@@ -1047,6 +1060,8 @@ static int index_entry_similarity_calc(
error = opts->metric->buffer_signature(out, &diff_file,
git_blob_rawcontent(blob), (size_t)blobsize,
opts->metric->payload);
+ if (error == GIT_EBUFS)
+ *out = &cache_invalid_marker;
git_blob_free(blob);
@@ -1069,18 +1084,16 @@ static int index_entry_similarity_inexact(
return 0;
/* update signature cache if needed */
- if (!cache[a_idx] && (error = index_entry_similarity_calc(&cache[a_idx], repo, a, opts)) < 0)
- return error;
- if (!cache[b_idx] && (error = index_entry_similarity_calc(&cache[b_idx], repo, b, opts)) < 0)
+ if ((error = index_entry_similarity_calc(&cache[a_idx], repo, a, opts)) < 0 ||
+ (error = index_entry_similarity_calc(&cache[b_idx], repo, b, opts)) < 0)
return error;
/* some metrics may not wish to process this file (too big / too small) */
- if (!cache[a_idx] || !cache[b_idx])
+ if (cache[a_idx] == &cache_invalid_marker || cache[b_idx] == &cache_invalid_marker)
return 0;
/* compare signatures */
- if (opts->metric->similarity(
- &score, cache[a_idx], cache[b_idx], opts->metric->payload) < 0)
+ if (opts->metric->similarity(&score, cache[a_idx], cache[b_idx], opts->metric->payload) < 0)
return -1;
/* clip score */
@@ -1550,7 +1563,7 @@ int git_merge_diff_list__find_renames(
done:
if (cache != NULL) {
for (i = 0; i < cache_size; ++i) {
- if (cache[i] != NULL)
+ if (cache[i] != NULL && cache[i] != &cache_invalid_marker)
opts->metric->free_signature(cache[i], opts->metric->payload);
}
diff --git a/tests/merge/trees/renames.c b/tests/merge/trees/renames.c
index e0b12af..c515aaf 100644
--- a/tests/merge/trees/renames.c
+++ b/tests/merge/trees/renames.c
@@ -274,3 +274,80 @@ void test_merge_trees_renames__submodules(void)
cl_assert(merge_test_index(index, merge_index_entries, 7));
git_index_free(index);
}
+
+void test_merge_trees_renames__cache_recomputation(void)
+{
+ git_oid blob, binary, ancestor_oid, theirs_oid, ours_oid;
+ git_merge_options opts = GIT_MERGE_OPTIONS_INIT;
+ git_buf path = GIT_BUF_INIT;
+ git_treebuilder *builder;
+ git_tree *ancestor_tree, *their_tree, *our_tree;
+ git_index *index;
+ size_t blob_size;
+ void *data;
+ size_t i;
+
+ cl_git_pass(git_oid_fromstr(&blob, "a2d8d1824c68541cca94ffb90f79291eba495921"));
+
+ /*
+ * Create a 50MB blob that consists of NUL bytes only. It is important
+ * that this blob is of a special format, most importantly it cannot
+ * contain more than four non-consecutive newlines or NUL bytes. This
+ * is because of git_hashsig's inner workings where all files with less
+ * than four "lines" are deemed to small.
+ */
+ blob_size = 50 * 1024 * 1024;
+ cl_assert(data = git__calloc(blob_size, 1));
+ cl_git_pass(git_blob_create_from_buffer(&binary, repo, data, blob_size));
+
+ /*
+ * Create the common ancestor, which has 1000 dummy blobs and the binary
+ * blob. The dummy blobs serve as potential rename targets for the
+ * dummy blob.
+ */
+ cl_git_pass(git_treebuilder_new(&builder, repo, NULL));
+ for (i = 0; i < 1000; i++) {
+ cl_git_pass(git_buf_printf(&path, "%"PRIuMAX".txt", i));
+ cl_git_pass(git_treebuilder_insert(NULL, builder, path.ptr, &blob, GIT_FILEMODE_BLOB));
+ git_buf_clear(&path);
+ }
+ cl_git_pass(git_treebuilder_insert(NULL, builder, "original.bin", &binary, GIT_FILEMODE_BLOB));
+ cl_git_pass(git_treebuilder_write(&ancestor_oid, builder));
+
+ /* We now the binary blob in our tree. */
+ cl_git_pass(git_treebuilder_remove(builder, "original.bin"));
+ cl_git_pass(git_treebuilder_insert(NULL, builder, "renamed.bin", &binary, GIT_FILEMODE_BLOB));
+ cl_git_pass(git_treebuilder_write(&ours_oid, builder));
+
+ git_treebuilder_free(builder);
+
+ /* And move everything into a subdirectory in their tree. */
+ cl_git_pass(git_treebuilder_new(&builder, repo, NULL));
+ cl_git_pass(git_treebuilder_insert(NULL, builder, "subdir", &ancestor_oid, GIT_FILEMODE_TREE));
+ cl_git_pass(git_treebuilder_write(&theirs_oid, builder));
+
+ /*
+ * Now merge ancestor, ours and theirs. As `git_hashsig` refuses to
+ * create a hash signature for the 50MB binary file, we historically
+ * didn't cache the hashsig computation for it. As a result, we now
+ * started looking up the 50MB blob and scanning it at least 1000
+ * times, which takes a long time.
+ *
+ * The number of 1000 blobs is chosen in such a way that it's
+ * noticeable when the bug creeps in again, as it takes around 12
+ * minutes on my machine to compute the following merge.
+ */
+ opts.target_limit = 5000;
+ cl_git_pass(git_tree_lookup(&ancestor_tree, repo, &ancestor_oid));
+ cl_git_pass(git_tree_lookup(&their_tree, repo, &theirs_oid));
+ cl_git_pass(git_tree_lookup(&our_tree, repo, &ours_oid));
+ cl_git_pass(git_merge_trees(&index, repo, ancestor_tree, our_tree, their_tree, &opts));
+
+ git_treebuilder_free(builder);
+ git_buf_dispose(&path);
+ git_index_free(index);
+ git_tree_free(ancestor_tree);
+ git_tree_free(their_tree);
+ git_tree_free(our_tree);
+ git__free(data);
+}