Commit 89a3482829c77590b3cc4fe95a33b93eebaecff5

Patrick Steinhardt 2017-06-16T13:34:43

diff: implement function to calculate patch ID The upstream git project provides the ability to calculate a so-called patch ID. Quoting from git-patch-id(1): "A 'patch ID' is nothing but a sum of SHA-1 of the file diffs associated with a patch, with whitespace and line numbers ignored." Patch IDs can be used to identify two patches which are probably the same thing, e.g. when a patch has been cherry-picked to another branch. This commit implements a new function `git_diff_patchid`, which gets a diff and derives an OID from it. Note the different terminology here: a patch in libgit2 is the set of differences in a single file, and a diff can contain multiple patches for different files. The implementation matches the upstream implementation and should derive the same OID for the same diff. In fact, some code has been directly derived from the upstream implementation. The upstream implementation has two different modes to calculate patch IDs, the stable and the unstable mode. The old way of calculating patch IDs was unstable in the sense that a different ordering of the diffs led to different results. This oversight was fixed in git 1.9, but as git tries hard to never break existing workflows, the old and unstable way is still the default. The newer and stable way does not care about the ordering of the diff hunks, and in fact it is the mode that should probably be used today. So right now, we only implement the stable way of generating the patch ID.

diff --git a/include/git2/diff.h b/include/git2/diff.h
index 4f0871d..40536cb 100644
--- a/include/git2/diff.h
+++ b/include/git2/diff.h
@@ -1400,6 +1400,51 @@ GIT_EXTERN(int) git_diff_format_email_init_options(
 	git_diff_format_email_options *opts,
 	unsigned int version);
 
+/**
+ * Options structure for `git_diff_patchid`.
+ *
+ * Initialize with the `GIT_DIFF_PATCHID_OPTIONS_INIT` macro or with
+ * `git_diff_patchid_init_options` to set the version and defaults.
+ */
+typedef struct git_diff_patchid_options {
+	unsigned int version; /**< structure version; the only field so far */
+} git_diff_patchid_options;
+
+#define GIT_DIFF_PATCHID_OPTIONS_VERSION 1 /* current structure version */
+#define GIT_DIFF_PATCHID_OPTIONS_INIT { GIT_DIFF_PATCHID_OPTIONS_VERSION }
+
+/**
+ * Initialize a `git_diff_patchid_options` structure with defaults.
+ *
+ * Equivalent to creating an instance with
+ * `GIT_DIFF_PATCHID_OPTIONS_INIT`. Returns 0 on success.
+ */
+GIT_EXTERN(int) git_diff_patchid_init_options(
+	git_diff_patchid_options *opts,
+	unsigned int version);
+
+/**
+ * Calculate the patch ID for the given diff.
+ *
+ * Calculate a stable patch ID for the given diff by summing the
+ * hash of the file diffs, ignoring whitespace and line numbers.
+ * This can be used to derive whether two diffs are the same with
+ * a high probability.
+ *
+ * Currently, this function only calculates stable patch IDs, as
+ * defined in git-patch-id(1), and should in fact generate the
+ * same IDs as the upstream git project does.
+ *
+ * @param out Pointer where the calculated patch ID should be
+ *  stored
+ * @param diff The diff to calculate the ID for
+ * @param opts Options for how to calculate the patch ID, or NULL
+ *  for the defaults. This is intended for future changes, as
+ *  currently no options are available.
+ * @return 0 on success, an error code otherwise.
+ */
+GIT_EXTERN(int) git_diff_patchid(git_oid *out, git_diff *diff, git_diff_patchid_options *opts);
+
 GIT_END_DECL
 
 /** @} */
diff --git a/src/diff.c b/src/diff.c
index a93bd4c..bc40743 100644
--- a/src/diff.c
+++ b/src/diff.c
@@ -19,6 +19,12 @@
 #define DIFF_FLAG_SET(DIFF,FLAG,VAL) (DIFF)->opts.flags = \
 	(VAL) ? ((DIFF)->opts.flags | (FLAG)) : ((DIFF)->opts.flags & ~(VAL))
 
+struct patch_id_args {
+	git_hash_ctx ctx;	/* running hash of the file currently being fed */
+	git_oid result;		/* accumulated sum of the per-file hashes flushed so far */
+	int first_file;		/* nonzero until file_cb has been called once */
+};
+
 GIT_INLINE(const char *) diff_delta__path(const git_diff_delta *delta)
 {
 	const char *str = delta->old_file.path;
@@ -374,3 +380,141 @@ int git_diff_format_email_init_options(
 	return 0;
 }
 
+static int flush_hunk(git_oid *result, git_hash_ctx *ctx) /* add ctx's digest into *result */
+{
+	git_oid hash;
+	unsigned short carry = 0; /* running sum; bits above the low 8 carry into the next byte */
+	int error, i;
+
+	if ((error = git_hash_final(&hash, ctx)) < 0 ||
+	    (error = git_hash_init(ctx)) < 0) /* re-initialize ctx so it can hash the next file */
+		return error;
+
+	for (i = 0; i < GIT_OID_RAWSZ; i++) { /* multi-byte addition, starting at id[0] */
+		carry += result->id[i] + hash.id[i];
+		result->id[i] = carry; /* NOTE(review): relies on id[] elements being byte-sized so this truncates */
+		carry >>= 8;
+	}
+
+	return 0;
+}
+
+static void strip_spaces(git_buf *buf) /* remove every whitespace character, in place */
+{
+	char *src = buf->ptr, *dst = buf->ptr; /* read and write cursors over the same storage */
+	char c;
+	size_t len = 0; /* number of characters kept */
+
+	while ((c = *src++) != '\0') { /* stops at the first NUL, not at buf->size */
+		if (!git__isspace(c)) {
+			*dst++ = c;
+			len++;
+		}
+	}
+
+	git_buf_truncate(buf, len); /* shrink the buffer to the kept characters */
+}
+
+static int file_cb( /* per-file callback passed to git_diff_foreach: hash the file header */
+	const git_diff_delta *delta,
+	float progress,
+	void *payload) /* payload is the struct patch_id_args set up by git_diff_patchid */
+{
+	struct patch_id_args *args = (struct patch_id_args *) payload;
+	git_buf buf = GIT_BUF_INIT;
+	int error;
+
+	GIT_UNUSED(progress);
+
+	if (!args->first_file && /* a previous file exists: fold its hash into the result */
+	    (error = flush_hunk(&args->result, &args->ctx)) < 0)
+		goto out;
+	args->first_file = 0;
+
+	if ((error = git_buf_printf(&buf, /* header template is already written without whitespace */
+				    "diff--gita/%sb/%s---a/%s+++b/%s",
+				    delta->old_file.path,
+				    delta->new_file.path,
+				    delta->old_file.path,
+				    delta->new_file.path)) < 0)
+		goto out;
+
+	strip_spaces(&buf); /* the substituted paths may still contain whitespace */
+
+	if ((error = git_hash_update(&args->ctx, buf.ptr, buf.size)) < 0)
+		goto out;
+
+out:
+	git_buf_free(&buf);
+	return error;
+}
+
+static int line_cb( /* per-line callback passed to git_diff_foreach: hash one diff line */
+	const git_diff_delta *delta,
+	const git_diff_hunk *hunk,
+	const git_diff_line *line,
+	void *payload) /* payload is the struct patch_id_args holding the hash context */
+{
+	struct patch_id_args *args = (struct patch_id_args *) payload;
+	git_buf buf = GIT_BUF_INIT;
+	int error = 0;
+
+	GIT_UNUSED(delta);
+	GIT_UNUSED(hunk);
+
+	switch (line->origin) { /* prefix the +/- marker; context lines get none */
+	    case GIT_DIFF_LINE_ADDITION:
+		error = git_buf_putc(&buf, '+');
+		break;
+	    case GIT_DIFF_LINE_DELETION:
+		error = git_buf_putc(&buf, '-');
+		break;
+	    case GIT_DIFF_LINE_CONTEXT:
+		break;
+	    default:
+		giterr_set(GITERR_PATCH, "invalid line origin for patch");
+		return -1;
+	}
+
+	if (error < 0 || /* stop on a buffer failure instead of hashing a truncated line */
+	    (error = git_buf_put(&buf, line->content, line->content_len)) < 0)
+		goto out;
+
+	strip_spaces(&buf); /* whitespace must not influence the patch ID */
+	error = git_hash_update(&args->ctx, buf.ptr, buf.size);
+out:
+	git_buf_free(&buf);
+	return error;
+}
+
+int git_diff_patchid_init_options(git_diff_patchid_options *opts, unsigned int version)
+{
+	GIT_INIT_STRUCTURE_FROM_TEMPLATE( /* NOTE(review): macro defined outside this diff; presumably checks version and copies the template into *opts */
+		opts, version, git_diff_patchid_options, GIT_DIFF_PATCHID_OPTIONS_INIT);
+	return 0;
+}
+
+int git_diff_patchid(git_oid *out, git_diff *diff, git_diff_patchid_options *opts)
+{
+	struct patch_id_args args;
+	int error;
+
+	GITERR_CHECK_VERSION(
+		opts, GIT_DIFF_PATCHID_OPTIONS_VERSION, "git_diff_patchid_options");
+
+	memset(&args, 0, sizeof(args));
+	args.first_file = 1;
+	if ((error = git_hash_ctx_init(&args.ctx)) < 0)
+		return error; /* nothing was initialized, so there is nothing to clean up */
+
+	/* file_cb flushes the previous file's hash; the last file is flushed here */
+	if ((error = git_diff_foreach(diff, file_cb, NULL, NULL, line_cb, &args)) < 0 ||
+	    (error = flush_hunk(&args.result, &args.ctx)) < 0)
+		goto out;
+
+	git_oid_cpy(out, &args.result); /* *out is only written on success */
+
+out:
+	git_hash_ctx_cleanup(&args.ctx); /* pairs with git_hash_ctx_init; was missing before */
+	return error;
+}
diff --git a/tests/core/structinit.c b/tests/core/structinit.c
index 78503fc..8feba86 100644
--- a/tests/core/structinit.c
+++ b/tests/core/structinit.c
@@ -176,4 +176,8 @@ void test_core_structinit__compare(void)
 	CHECK_MACRO_FUNC_INIT_EQUAL( \
 		git_proxy_options, GIT_PROXY_OPTIONS_VERSION, \
 		GIT_PROXY_OPTIONS_INIT, git_proxy_init_options);
+
+	CHECK_MACRO_FUNC_INIT_EQUAL( \
+		git_diff_patchid_options, GIT_DIFF_PATCHID_OPTIONS_VERSION, \
+		GIT_DIFF_PATCHID_OPTIONS_INIT, git_diff_patchid_init_options);
 }
diff --git a/tests/diff/patchid.c b/tests/diff/patchid.c
new file mode 100644
index 0000000..75a2aa8
--- /dev/null
+++ b/tests/diff/patchid.c
@@ -0,0 +1,60 @@
+#include "clar_libgit2.h"
+#include "patch/patch_common.h"
+
+static void verify_patch_id(const char *diff_content, const char *expected_id)
+{	/* parse diff_content, compute its patch ID and assert it equals expected_id */
+	git_oid expected_oid, actual_oid;
+	git_diff *diff;
+
+	cl_git_pass(git_oid_fromstr(&expected_oid, expected_id)); /* expected_id is a hex OID string */
+	cl_git_pass(git_diff_from_buffer(&diff, diff_content, strlen(diff_content)));
+	cl_git_pass(git_diff_patchid(&actual_oid, diff, NULL)); /* NULL opts: use the defaults */
+
+	cl_assert_equal_oid(&expected_oid, &actual_oid);
+
+	git_diff_free(diff);
+}
+
+void test_diff_patchid__simple_commit(void) /* one file, one hunk, preceded by commit metadata */
+{
+	verify_patch_id(PATCH_SIMPLE_COMMIT, "06094b1948b878b7d9ff7560b4eae672a014b0ec");
+}
+
+void test_diff_patchid__filename_with_spaces(void) /* NOTE(review): PATCH_APPEND_NO_NL is defined outside this diff; confirm it contains a path with spaces */
+{
+	verify_patch_id(PATCH_APPEND_NO_NL, "f0ba05413beaef743b630e796153839462ee477a");
+}
+
+void test_diff_patchid__multiple_hunks(void) /* one file changed in two hunks */
+{
+	verify_patch_id(PATCH_MULTIPLE_HUNKS, "81e26c34643d17f521e57c483a6a637e18ba1f57");
+}
+
+void test_diff_patchid__multiple_files(void) /* two files, one hunk each */
+{
+	verify_patch_id(PATCH_MULTIPLE_FILES, "192d1f49d23f2004517963aecd3f8a6c467f50ff");
+}
+
+void test_diff_patchid__same_diff_with_differing_whitespace_has_same_id(void)
+{
+	const char *tabs = /* the added line is indented with two tabs */
+	    "diff --git a/file.txt b/file.txt\n"
+	    "index 8fecc09..1d43a92 100644\n"
+	    "--- a/file.txt\n"
+	    "+++ b/file.txt\n"
+	    "@@ -1 +1 @@\n"
+	    "-old text\n"
+	    "+		new text\n";
+	const char *spaces = /* same change, indented with eight spaces instead */
+	    "diff --git a/file.txt b/file.txt\n"
+	    "index 8fecc09..1d43a92 100644\n"
+	    "--- a/file.txt\n"
+	    "+++ b/file.txt\n"
+	    "@@ -1 +1 @@\n"
+	    "-old text\n"
+	    "+        new text\n";
+	const char *id = "11efdd13c30f7a1056eac2ae2fb952da475e2c23"; /* both variants must produce this ID */
+
+	verify_patch_id(tabs, id);
+	verify_patch_id(spaces, id);
+}
diff --git a/tests/patch/patch_common.h b/tests/patch/patch_common.h
index 6ec5546..a20ebd6 100644
--- a/tests/patch/patch_common.h
+++ b/tests/patch/patch_common.h
@@ -253,7 +253,71 @@
 	"@@ -9,0 +10 @@ below it!\n" \
 	"+insert at end\n"
 
+/* One file, one hunk, preceded by commit metadata (commit/Author/Date and subject) */
+#define PATCH_SIMPLE_COMMIT \
+	"commit 15e119375018fba121cf58e02a9f17fe22df0df8\n" \
+	"Author: Edward Thomson <ethomson@edwardthomson.com>\n" \
+	"Date:   Wed Jun 14 13:31:20 2017 +0200\n" \
+	"\n" \
+	"    CHANGELOG: document git_filter_init and GIT_FILTER_INIT\n" \
+	"\n" \
+	"diff --git a/CHANGELOG.md b/CHANGELOG.md\n" \
+	"index 1b9e0c90a..24ecba426 100644\n" \
+	"--- a/CHANGELOG.md\n" \
+	"+++ b/CHANGELOG.md\n" \
+	"@@ -96,6 +96,9 @@ v0.26\n" \
+	" * `git_transport_smart_proxy_options()' enables you to get the proxy options for\n" \
+	"   smart transports.\n" \
+	"\n" \
+	"+* The `GIT_FILTER_INIT` macro and the `git_filter_init` function are provided\n" \
+	"+  to initialize a `git_filter` structure.\n" \
+	"+\n" \
+	" ### Breaking API changes\n" \
+	"\n" \
+	" * `clone_checkout_strategy` has been removed from\n"
+
+/* One file changed in two separate hunks */
+#define PATCH_MULTIPLE_HUNKS \
+	"diff --git a/x b/x\n" \
+	"index 0719398..fa0350c 100644\n" \
+	"--- a/x\n" \
+	"+++ b/x\n" \
+	"@@ -1,5 +1,4 @@\n" \
+	" 1\n" \
+	"-2\n" \
+	" 3\n" \
+	" 4\n" \
+	" 5\n" \
+	"@@ -7,3 +6,4 @@\n" \
+	" 7\n" \
+	" 8\n" \
+	" 9\n" \
+	"+10\n"
+
+/* Two files ("x" and "y") with one hunk each */
+#define PATCH_MULTIPLE_FILES \
+	"diff --git a/x b/x\n" \
+	"index 8a1218a..7059ba5 100644\n" \
+	"--- a/x\n" \
+	"+++ b/x\n" \
+	"@@ -1,5 +1,4 @@\n" \
+	" 1\n" \
+	" 2\n" \
+	"-3\n" \
+	" 4\n" \
+	" 5\n" \
+	"diff --git a/y b/y\n" \
+	"index e006065..9405325 100644\n" \
+	"--- a/y\n" \
+	"+++ b/y\n" \
+	"@@ -1,4 +1,5 @@\n" \
+	" a\n" \
+	" b\n" \
+	"+c\n" \
+	" d\n" \
+	" e\n"
+
 /* An insertion at the beginning and end of file (and the resultant patch) */
 
 #define FILE_PREPEND_AND_APPEND \
 	"first and\n" \