Commit 9be5be47fb1d9bc08e25b30c05dbf48739710062

Russell Belfer 2013-05-20T13:37:21

More git_diff_find_similar improvements - Add new GIT_DIFF_FIND_EXACT_MATCH_ONLY flag to do similarity matching without using the similarity metric (i.e. only compare the SHA). - Clean up the similarity measurement code to more rigorously distinguish between files that are not similar and files that are not comparable (previously, a 0 could either mean that the files could not be compared or that they were totally different). - When splitting a MODIFIED file into a DELETE/ADD pair, actually make a DELETED/UNTRACKED pair if the right side of the diff is from the working directory. This prevents an odd mix of ADDED and UNTRACKED files on workdir diffs.

diff --git a/include/git2/diff.h b/include/git2/diff.h
index 172aa11..31f6e05 100644
--- a/include/git2/diff.h
+++ b/include/git2/diff.h
@@ -441,6 +441,8 @@ typedef enum {
 	GIT_DIFF_FIND_IGNORE_WHITESPACE = (1 << 12),
 	/** measure similarity including all data */
 	GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE = (1 << 13),
+	/** measure similarity only by comparing SHAs (fast and cheap) */
+	GIT_DIFF_FIND_EXACT_MATCH_ONLY = (1 << 14),
 } git_diff_find_t;
 
 /**
diff --git a/src/diff_tform.c b/src/diff_tform.c
index 33268e4..d5e56ac 100644
--- a/src/diff_tform.c
+++ b/src/diff_tform.c
@@ -255,6 +255,16 @@ static int normalize_find_opts(
 
 	/* some flags imply others */
 
+	if (opts->flags & GIT_DIFF_FIND_EXACT_MATCH_ONLY) {
+		/* if we are only looking for exact matches, then don't turn
+		 * MODIFIED items into ADD/DELETE pairs because it's too picky
+		 */
+		opts->flags &= ~(GIT_DIFF_FIND_REWRITES | GIT_DIFF_BREAK_REWRITES);
+
+		/* similarly, don't look for self-rewrites to split */
+		opts->flags &= ~GIT_DIFF_FIND_RENAMES_FROM_REWRITES;
+	}
+
 	if (opts->flags & GIT_DIFF_FIND_RENAMES_FROM_REWRITES)
 		opts->flags |= GIT_DIFF_FIND_RENAMES;
 
@@ -373,7 +383,10 @@ static int apply_splits_and_deletes(
 			if (git_vector_insert(&onto, deleted) < 0)
 				goto on_error;
 
-			delta->status = GIT_DELTA_ADDED;
+			if (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR)
+				delta->status = GIT_DELTA_UNTRACKED;
+			else
+				delta->status = GIT_DELTA_ADDED;
 			memset(&delta->old_file, 0, sizeof(delta->old_file));
 			delta->old_file.path = delta->new_file.path;
 			delta->old_file.flags |= GIT_DIFF_FLAG_VALID_OID;
@@ -460,22 +473,56 @@ static int similarity_calc(
 	return error;
 }
 
+#define FLAG_SET(opts,flag_name) (((opts).flags & flag_name) != 0)
+
+/* - score < 0 means files cannot be compared
+ * - score >= 100 means files are exact match
+ * - score == 0 means files are completely different
+ */
 static int similarity_measure(
+	int *score,
 	git_diff_list *diff,
 	git_diff_find_options *opts,
 	void **cache,
 	size_t a_idx,
 	size_t b_idx)
 {
-	int score = 0;
 	git_diff_file *a_file = similarity_get_file(diff, a_idx);
 	git_diff_file *b_file = similarity_get_file(diff, b_idx);
+	bool exact_match = FLAG_SET(*opts, GIT_DIFF_FIND_EXACT_MATCH_ONLY);
+
+	*score = -1;
 
+	/* don't try to compare files of different types */
 	if (GIT_MODE_TYPE(a_file->mode) != GIT_MODE_TYPE(b_file->mode))
 		return 0;
 
-	if (git_oid__cmp(&a_file->oid, &b_file->oid) == 0)
-		return 100;
+	/* if exact match is requested, force calculation of missing OIDs */
+	if (exact_match) {
+		if (git_oid_iszero(&a_file->oid) &&
+			diff->old_src == GIT_ITERATOR_TYPE_WORKDIR &&
+			!git_diff__oid_for_file(diff->repo, a_file->path,
+				a_file->mode, a_file->size, &a_file->oid))
+			a_file->flags |= GIT_DIFF_FLAG_VALID_OID;
+
+		if (git_oid_iszero(&b_file->oid) &&
+			diff->new_src == GIT_ITERATOR_TYPE_WORKDIR &&
+			!git_diff__oid_for_file(diff->repo, b_file->path,
+				b_file->mode, b_file->size, &b_file->oid))
+			b_file->flags |= GIT_DIFF_FLAG_VALID_OID;
+	}
+
+	/* check OID match as a quick test */
+	if (git_oid__cmp(&a_file->oid, &b_file->oid) == 0) {
+		*score = 100;
+		return 0;
+	}
+
+	/* don't calculate signatures if we are doing exact match */
+	if (exact_match) {
+		*score = 0;
+		return 0;
+	}
 
 	/* update signature cache if needed */
 	if (!cache[a_idx] && similarity_calc(diff, opts, a_idx, cache) < 0)
@@ -488,20 +535,33 @@ static int similarity_measure(
 		return 0;
 
 	/* compare signatures */
-	if (opts->metric->similarity(
-			&score, cache[a_idx], cache[b_idx], opts->metric->payload) < 0)
-		return -1;
-
-	/* clip score */
-	if (score < 1)
-		score = 1; /* zero means uncomparable, so use 1 for least similar */
-	else if (score > 100)
-		score = 100;
-
-	return score;
+	return opts->metric->similarity(
+		score, cache[a_idx], cache[b_idx], opts->metric->payload);
 }
 
-#define FLAG_SET(opts,flag_name) ((opts.flags & flag_name) != 0)
+static void convert_to_rename_and_add(
+	git_diff_list *diff,
+	git_diff_delta *from,
+	git_diff_delta *to,
+	int similarity)
+{
+	to->status = GIT_DELTA_RENAMED;
+	to->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; /* ensure no split */
+	to->similarity = (uint32_t)similarity;
+	memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
+	validate_delta(to);
+
+	if (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR)
+		from->status = GIT_DELTA_UNTRACKED;
+	else
+		from->status = GIT_DELTA_ADDED;
+	from->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; /* ensure no split */
+	from->similarity = 0;
+	memset(&from->old_file, 0, sizeof(from->old_file));
+	from->old_file.path = from->new_file.path;
+	from->old_file.flags |= GIT_DIFF_FLAG_VALID_OID;
+	validate_delta(from);
+}
 
 typedef struct {
 	uint32_t idx;
@@ -542,21 +602,17 @@ int git_diff_find_similar(
 				continue;
 
 			/* skip things that aren't plain blobs */
-			if (GIT_MODE_TYPE(from->old_file.mode) !=
-				GIT_MODE_TYPE(GIT_FILEMODE_BLOB))
+			if (!GIT_MODE_ISBLOB(from->old_file.mode))
 				continue;
 
 			/* measure similarity from old_file to new_file */
-			similarity = similarity_measure(
-				diff, &opts, cache, 2 * i, 2 * i + 1);
-
-			if (similarity < 0) {
-				error = similarity;
+			if ((error = similarity_measure(
+					&similarity, diff, &opts, cache, 2 * i, 2 * i + 1)) < 0)
 				goto cleanup;
-			}
 
-			if (similarity > 0 &&
-				similarity < (int)opts.break_rewrite_threshold) {
+			if (similarity < 0)
+				continue;
+			if (similarity < (int)opts.break_rewrite_threshold) {
 				from->similarity = (uint32_t)similarity;
 				from->flags |= GIT_DIFF_FLAG__TO_SPLIT;
 				num_rewrites++;
@@ -573,8 +629,7 @@ int git_diff_find_similar(
 		matches[i].similarity = 0;
 
 		/* skip things that aren't plain blobs */
-		if (GIT_MODE_TYPE(from->old_file.mode) !=
-			GIT_MODE_TYPE(GIT_FILEMODE_BLOB))
+		if (!GIT_MODE_ISBLOB(from->old_file.mode))
 			continue;
 
 		/* don't check UNMODIFIED files as source unless given option */
@@ -599,8 +654,7 @@ int git_diff_find_similar(
 				continue;
 
 			/* skip things that aren't blobs */
-			if (GIT_MODE_TYPE(to->new_file.mode) !=
-				GIT_MODE_TYPE(GIT_FILEMODE_BLOB))
+			if (!GIT_MODE_ISBLOB(to->new_file.mode))
 				continue;
 
 			/* only consider ADDED, RENAMED, COPIED, and split MODIFIED as
@@ -630,14 +684,13 @@ int git_diff_find_similar(
 				break;
 
 			/* calculate similarity for this pair and find best match */
-			similarity = similarity_measure(
-				diff, &opts, cache, 2 * i, 2 * j + 1);
-
-			if (similarity < 0) {
-				error = similarity;
+			if ((error = similarity_measure(
+					&similarity, diff, &opts, cache, 2 * i, 2 * j + 1)) < 0)
 				goto cleanup;
+			if (similarity < 0) {
+				--tried_targets;
+				continue;
 			}
-
 			if (matches[i].similarity < (uint32_t)similarity) {
 				matches[i].similarity = (uint32_t)similarity;
 				matches[i].idx = j;
@@ -687,18 +740,7 @@ int git_diff_find_similar(
 			if (similarity < (int)opts.rename_threshold)
 				continue;
 
-			to->status = GIT_DELTA_RENAMED;
-			to->similarity = (uint32_t)similarity;
-			memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
-			validate_delta(to);
-
-			from->status = GIT_DELTA_ADDED;
-			from->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
-			from->similarity = 0; /* reset self-similarity */
-			memset(&from->old_file, 0, sizeof(from->old_file));
-			from->old_file.path = from->new_file.path;
-			validate_delta(from);
-
+			convert_to_rename_and_add(diff, from, to, similarity);
 			num_rewrites--;
 			num_updates++;
 			continue;
@@ -712,28 +754,16 @@ int git_diff_find_similar(
 			FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) &&
 			similarity > (int)opts.rename_threshold)
 		{
-			int self_similarity = similarity_measure(
-				diff, &opts, cache, 2 * i, 2 * i + 1);
-			if (self_similarity < 0) {
-				error = self_similarity;
+			int self_similarity;
+
+			if ((error = similarity_measure(&self_similarity,
+					diff, &opts, cache, 2 * i, 2 * i + 1)) < 0)
 				goto cleanup;
-			}
 
-			if (self_similarity < (int)opts.rename_from_rewrite_threshold) {
-				to->status = GIT_DELTA_RENAMED;
-				to->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; /* ensure no split */
-				to->similarity = (uint32_t)similarity;
-				memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
-				validate_delta(to);
-
-				from->status = GIT_DELTA_ADDED;
-				from->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; /* ensure no split */
-				from->similarity = 0;
-				memset(&from->old_file, 0, sizeof(from->old_file));
-				from->old_file.path = from->new_file.path;
-				from->old_file.flags |= GIT_DIFF_FLAG_VALID_OID;
-				validate_delta(from);
+			if (self_similarity >= 0 &&
+				self_similarity < (int)opts.rename_from_rewrite_threshold) {
 
+				convert_to_rename_and_add(diff, from, to, similarity);
 				num_updates++;
 				continue;
 			}
@@ -754,13 +784,10 @@ int git_diff_find_similar(
 		num_updates++;
 	}
 
-	if (num_rewrites > 0) {
-		assert(num_rewrites < diff->deltas.length);
-
+	if (num_rewrites > 0)
 		error = apply_splits_and_deletes(
 			diff, diff->deltas.length - num_rewrites,
 			FLAG_SET(opts, GIT_DIFF_BREAK_REWRITES));
-	}
 
 	if (num_rewrites > 0 || num_updates > 0)
 		git_vector_sort(&diff->deltas);
diff --git a/src/fileops.h b/src/fileops.h
index 627a692..3e214aa 100644
--- a/src/fileops.h
+++ b/src/fileops.h
@@ -223,6 +223,7 @@ extern git_off_t git_futils_filesize(git_file fd);
 #define GIT_MODE_PERMS_MASK			0777
 #define GIT_CANONICAL_PERMS(MODE)	(((MODE) & 0100) ? 0755 : 0644)
 #define GIT_MODE_TYPE(MODE)			((MODE) & ~GIT_MODE_PERMS_MASK)
+#define GIT_MODE_ISBLOB(MODE)		(GIT_MODE_TYPE(MODE) == GIT_MODE_TYPE(GIT_FILEMODE_BLOB))
 
 /**
  * Convert a mode_t from the OS to a legal git mode_t value.