More git_diff_find_similar improvements

- Add a new GIT_DIFF_FIND_EXACT_MATCH_ONLY flag to do similarity matching without using the similarity metric (i.e., by comparing only the SHAs).
- Clean up the similarity measurement code to more rigorously distinguish between files that are not similar and files that are not comparable. (Previously, a 0 could mean either that the files could not be compared or that they were totally different.)
- When splitting a MODIFIED file into a DELETE/ADD pair, actually make a DELETED/UNTRACKED pair if the right side of the diff is from the working directory. This prevents an odd mix of ADDED and UNTRACKED files in workdir diffs.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303
diff --git a/include/git2/diff.h b/include/git2/diff.h
index 172aa11..31f6e05 100644
--- a/include/git2/diff.h
+++ b/include/git2/diff.h
@@ -441,6 +441,8 @@ typedef enum {
GIT_DIFF_FIND_IGNORE_WHITESPACE = (1 << 12),
/** measure similarity including all data */
GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE = (1 << 13),
+ /** measure similarity only by comparing SHAs (fast and cheap) */
+ GIT_DIFF_FIND_EXACT_MATCH_ONLY = (1 << 14),
} git_diff_find_t;
/**
diff --git a/src/diff_tform.c b/src/diff_tform.c
index 33268e4..d5e56ac 100644
--- a/src/diff_tform.c
+++ b/src/diff_tform.c
@@ -255,6 +255,16 @@ static int normalize_find_opts(
/* some flags imply others */
+ if (opts->flags & GIT_DIFF_FIND_EXACT_MATCH_ONLY) {
+ /* if we are only looking for exact matches, then don't turn
+ * MODIFIED items into ADD/DELETE pairs because it's too picky
+ */
+ opts->flags &= ~(GIT_DIFF_FIND_REWRITES | GIT_DIFF_BREAK_REWRITES);
+
+ /* similarly, don't look for self-rewrites to split */
+ opts->flags &= ~GIT_DIFF_FIND_RENAMES_FROM_REWRITES;
+ }
+
if (opts->flags & GIT_DIFF_FIND_RENAMES_FROM_REWRITES)
opts->flags |= GIT_DIFF_FIND_RENAMES;
@@ -373,7 +383,10 @@ static int apply_splits_and_deletes(
if (git_vector_insert(&onto, deleted) < 0)
goto on_error;
- delta->status = GIT_DELTA_ADDED;
+ if (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR)
+ delta->status = GIT_DELTA_UNTRACKED;
+ else
+ delta->status = GIT_DELTA_ADDED;
memset(&delta->old_file, 0, sizeof(delta->old_file));
delta->old_file.path = delta->new_file.path;
delta->old_file.flags |= GIT_DIFF_FLAG_VALID_OID;
@@ -460,22 +473,56 @@ static int similarity_calc(
return error;
}
+#define FLAG_SET(opts,flag_name) (((opts).flags & flag_name) != 0)
+
+/* - score < 0 means files cannot be compared
+ * - score >= 100 means files are exact match
+ * - score == 0 means files are completely different
+ */
static int similarity_measure(
+ int *score,
git_diff_list *diff,
git_diff_find_options *opts,
void **cache,
size_t a_idx,
size_t b_idx)
{
- int score = 0;
git_diff_file *a_file = similarity_get_file(diff, a_idx);
git_diff_file *b_file = similarity_get_file(diff, b_idx);
+ bool exact_match = FLAG_SET(*opts, GIT_DIFF_FIND_EXACT_MATCH_ONLY);
+
+ *score = -1;
+ /* don't try to compare files of different types */
if (GIT_MODE_TYPE(a_file->mode) != GIT_MODE_TYPE(b_file->mode))
return 0;
- if (git_oid__cmp(&a_file->oid, &b_file->oid) == 0)
- return 100;
+ /* if exact match is requested, force calculation of missing OIDs */
+ if (exact_match) {
+ if (git_oid_iszero(&a_file->oid) &&
+ diff->old_src == GIT_ITERATOR_TYPE_WORKDIR &&
+ !git_diff__oid_for_file(diff->repo, a_file->path,
+ a_file->mode, a_file->size, &a_file->oid))
+ a_file->flags |= GIT_DIFF_FLAG_VALID_OID;
+
+ if (git_oid_iszero(&b_file->oid) &&
+ diff->new_src == GIT_ITERATOR_TYPE_WORKDIR &&
+ !git_diff__oid_for_file(diff->repo, b_file->path,
+ b_file->mode, b_file->size, &b_file->oid))
+ b_file->flags |= GIT_DIFF_FLAG_VALID_OID;
+ }
+
+ /* check OID match as a quick test */
+ if (git_oid__cmp(&a_file->oid, &b_file->oid) == 0) {
+ *score = 100;
+ return 0;
+ }
+
+ /* don't calculate signatures if we are doing exact match */
+ if (exact_match) {
+ *score = 0;
+ return 0;
+ }
/* update signature cache if needed */
if (!cache[a_idx] && similarity_calc(diff, opts, a_idx, cache) < 0)
@@ -488,20 +535,33 @@ static int similarity_measure(
return 0;
/* compare signatures */
- if (opts->metric->similarity(
- &score, cache[a_idx], cache[b_idx], opts->metric->payload) < 0)
- return -1;
-
- /* clip score */
- if (score < 1)
- score = 1; /* zero means uncomparable, so use 1 for least similar */
- else if (score > 100)
- score = 100;
-
- return score;
+ return opts->metric->similarity(
+ score, cache[a_idx], cache[b_idx], opts->metric->payload);
}
-#define FLAG_SET(opts,flag_name) ((opts.flags & flag_name) != 0)
+static void convert_to_rename_and_add(
+ git_diff_list *diff,
+ git_diff_delta *from,
+ git_diff_delta *to,
+ int similarity)
+{
+ to->status = GIT_DELTA_RENAMED;
+ to->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; /* ensure no split */
+ to->similarity = (uint32_t)similarity;
+ memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
+ validate_delta(to);
+
+ if (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR)
+ from->status = GIT_DELTA_UNTRACKED;
+ else
+ from->status = GIT_DELTA_ADDED;
+ from->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; /* ensure no split */
+ from->similarity = 0;
+ memset(&from->old_file, 0, sizeof(from->old_file));
+ from->old_file.path = from->new_file.path;
+ from->old_file.flags |= GIT_DIFF_FLAG_VALID_OID;
+ validate_delta(from);
+}
typedef struct {
uint32_t idx;
@@ -542,21 +602,17 @@ int git_diff_find_similar(
continue;
/* skip things that aren't plain blobs */
- if (GIT_MODE_TYPE(from->old_file.mode) !=
- GIT_MODE_TYPE(GIT_FILEMODE_BLOB))
+ if (!GIT_MODE_ISBLOB(from->old_file.mode))
continue;
/* measure similarity from old_file to new_file */
- similarity = similarity_measure(
- diff, &opts, cache, 2 * i, 2 * i + 1);
-
- if (similarity < 0) {
- error = similarity;
+ if ((error = similarity_measure(
+ &similarity, diff, &opts, cache, 2 * i, 2 * i + 1)) < 0)
goto cleanup;
- }
- if (similarity > 0 &&
- similarity < (int)opts.break_rewrite_threshold) {
+ if (similarity < 0)
+ continue;
+ if (similarity < (int)opts.break_rewrite_threshold) {
from->similarity = (uint32_t)similarity;
from->flags |= GIT_DIFF_FLAG__TO_SPLIT;
num_rewrites++;
@@ -573,8 +629,7 @@ int git_diff_find_similar(
matches[i].similarity = 0;
/* skip things that aren't plain blobs */
- if (GIT_MODE_TYPE(from->old_file.mode) !=
- GIT_MODE_TYPE(GIT_FILEMODE_BLOB))
+ if (!GIT_MODE_ISBLOB(from->old_file.mode))
continue;
/* don't check UNMODIFIED files as source unless given option */
@@ -599,8 +654,7 @@ int git_diff_find_similar(
continue;
/* skip things that aren't blobs */
- if (GIT_MODE_TYPE(to->new_file.mode) !=
- GIT_MODE_TYPE(GIT_FILEMODE_BLOB))
+ if (!GIT_MODE_ISBLOB(to->new_file.mode))
continue;
/* only consider ADDED, RENAMED, COPIED, and split MODIFIED as
@@ -630,14 +684,13 @@ int git_diff_find_similar(
break;
/* calculate similarity for this pair and find best match */
- similarity = similarity_measure(
- diff, &opts, cache, 2 * i, 2 * j + 1);
-
- if (similarity < 0) {
- error = similarity;
+ if ((error = similarity_measure(
+ &similarity, diff, &opts, cache, 2 * i, 2 * j + 1)) < 0)
goto cleanup;
+ if (similarity < 0) {
+ --tried_targets;
+ continue;
}
-
if (matches[i].similarity < (uint32_t)similarity) {
matches[i].similarity = (uint32_t)similarity;
matches[i].idx = j;
@@ -687,18 +740,7 @@ int git_diff_find_similar(
if (similarity < (int)opts.rename_threshold)
continue;
- to->status = GIT_DELTA_RENAMED;
- to->similarity = (uint32_t)similarity;
- memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
- validate_delta(to);
-
- from->status = GIT_DELTA_ADDED;
- from->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
- from->similarity = 0; /* reset self-similarity */
- memset(&from->old_file, 0, sizeof(from->old_file));
- from->old_file.path = from->new_file.path;
- validate_delta(from);
-
+ convert_to_rename_and_add(diff, from, to, similarity);
num_rewrites--;
num_updates++;
continue;
@@ -712,28 +754,16 @@ int git_diff_find_similar(
FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) &&
similarity > (int)opts.rename_threshold)
{
- int self_similarity = similarity_measure(
- diff, &opts, cache, 2 * i, 2 * i + 1);
- if (self_similarity < 0) {
- error = self_similarity;
+ int self_similarity;
+
+ if ((error = similarity_measure(&self_similarity,
+ diff, &opts, cache, 2 * i, 2 * i + 1)) < 0)
goto cleanup;
- }
- if (self_similarity < (int)opts.rename_from_rewrite_threshold) {
- to->status = GIT_DELTA_RENAMED;
- to->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; /* ensure no split */
- to->similarity = (uint32_t)similarity;
- memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
- validate_delta(to);
-
- from->status = GIT_DELTA_ADDED;
- from->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; /* ensure no split */
- from->similarity = 0;
- memset(&from->old_file, 0, sizeof(from->old_file));
- from->old_file.path = from->new_file.path;
- from->old_file.flags |= GIT_DIFF_FLAG_VALID_OID;
- validate_delta(from);
+ if (self_similarity >= 0 &&
+ self_similarity < (int)opts.rename_from_rewrite_threshold) {
+ convert_to_rename_and_add(diff, from, to, similarity);
num_updates++;
continue;
}
@@ -754,13 +784,10 @@ int git_diff_find_similar(
num_updates++;
}
- if (num_rewrites > 0) {
- assert(num_rewrites < diff->deltas.length);
-
+ if (num_rewrites > 0)
error = apply_splits_and_deletes(
diff, diff->deltas.length - num_rewrites,
FLAG_SET(opts, GIT_DIFF_BREAK_REWRITES));
- }
if (num_rewrites > 0 || num_updates > 0)
git_vector_sort(&diff->deltas);
diff --git a/src/fileops.h b/src/fileops.h
index 627a692..3e214aa 100644
--- a/src/fileops.h
+++ b/src/fileops.h
@@ -223,6 +223,7 @@ extern git_off_t git_futils_filesize(git_file fd);
#define GIT_MODE_PERMS_MASK 0777
#define GIT_CANONICAL_PERMS(MODE) (((MODE) & 0100) ? 0755 : 0644)
#define GIT_MODE_TYPE(MODE) ((MODE) & ~GIT_MODE_PERMS_MASK)
+#define GIT_MODE_ISBLOB(MODE) (GIT_MODE_TYPE(MODE) == GIT_MODE_TYPE(GIT_FILEMODE_BLOB))
/**
* Convert a mode_t from the OS to a legal git mode_t value.