Commit 9d83a2b08724211e564bffca740cd5fdc93d890e

Stan Hu 2018-02-22T22:55:50

Sanitize the hunk header to ensure it contains UTF-8 valid data The diff driver truncates the hunk header text to 80 bytes, which can truncate 4-byte Unicode characters and introduce garbage characters in the diff output. This change sanitizes the hunk header before it is displayed. This mirrors the test in git: https://github.com/git/git/blob/master/t/t4025-hunk-header.sh Closes https://github.com/libgit2/rugged/issues/716

diff --git a/src/diff_xdiff.c b/src/diff_xdiff.c
index 701eb1b..6907e52 100644
--- a/src/diff_xdiff.c
+++ b/src/diff_xdiff.c
@@ -6,6 +6,7 @@
  */
 
 #include "diff_xdiff.h"
+#include "util.h"
 
 #include "git2/errors.h"
 #include "diff.h"
@@ -115,6 +116,7 @@ static int git_xdiff_cb(void *priv, mmbuffer_t *bufs, int len)
 	const git_diff_delta *delta = patch->base.delta;
 	git_patch_generated_output *output = &info->xo->output;
 	git_diff_line line;
+	size_t buffer_len;
 
 	if (len == 1) {
 		output->error = git_xdiff_parse_hunk(&info->hunk, bufs[0].ptr);
@@ -124,6 +126,16 @@ static int git_xdiff_cb(void *priv, mmbuffer_t *bufs, int len)
 		info->hunk.header_len = bufs[0].size;
 		if (info->hunk.header_len >= sizeof(info->hunk.header))
 			info->hunk.header_len = sizeof(info->hunk.header) - 1;
+
+		/* Sanitize the hunk header in case there is invalid Unicode */
+		buffer_len = git__utf8_valid_buf_length((const uint8_t *) bufs[0].ptr, info->hunk.header_len);
+		/* Sanitizing the hunk header may delete the newline, so add it back again if there is room */
+		if (buffer_len < info->hunk.header_len) {
+			bufs[0].ptr[buffer_len] = '\n';
+			buffer_len += 1;
+			info->hunk.header_len = buffer_len;
+		}
+
 		memcpy(info->hunk.header, bufs[0].ptr, info->hunk.header_len);
 		info->hunk.header[info->hunk.header_len] = '\0';
 
diff --git a/src/util.c b/src/util.c
index 2955b7c..bf778a9 100644
--- a/src/util.c
+++ b/src/util.c
@@ -806,6 +806,22 @@ double git_time_monotonic(void)
 	return git__timer();
 }
 
+size_t git__utf8_valid_buf_length(const uint8_t *str, size_t str_len)
+{
+	size_t offset = 0;
+
+	while (offset < str_len) {
+		int length = git__utf8_charlen(str + offset, str_len - offset);
+
+		if (length < 0)
+			break;
+
+		offset += length;
+	}
+
+	return offset;
+}
+
 #ifdef GIT_WIN32
 int git__getenv(git_buf *out, const char *name)
 {
diff --git a/src/util.h b/src/util.h
index f6d19cf..67ae4ef 100644
--- a/src/util.h
+++ b/src/util.h
@@ -454,6 +454,16 @@ extern size_t git__unescape(char *str);
 extern int git__utf8_iterate(const uint8_t *str, int str_len, int32_t *dst);
 
 /*
+ * Iterate through an UTF-8 string and stops after finding any invalid UTF-8
+ * codepoints.
+ *
+ * @param str string to scan
+ * @param str_len size of the string
+ * @return length in bytes of the string that contains valid data
+ */
+extern size_t git__utf8_valid_buf_length(const uint8_t *str, size_t str_len);
+
+/*
  * Safely zero-out memory, making sure that the compiler
  * doesn't optimize away the operation.
  */
diff --git a/tests/diff/patch.c b/tests/diff/patch.c
index 1184d19..4c83628 100644
--- a/tests/diff/patch.c
+++ b/tests/diff/patch.c
@@ -25,6 +25,12 @@ void test_diff_patch__cleanup(void)
 
 #define EXPECTED_HUNK "@@ -1,2 +0,0 @@\n"
 
+#define UTF8_HUNK_HEADER "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\n"
+
+#define UTF8_TRUNCATED_A_HUNK_HEADER "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\n"
+
+#define UTF8_TRUNCATED_L_HUNK_HEADER "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\n"
+
 static int check_removal_cb(
 	const git_diff_delta *delta,
 	const git_diff_hunk *hunk,
@@ -610,3 +616,89 @@ void test_diff_patch__line_counts_with_eofnl(void)
 
 	git_buf_free(&content);
 }
+
+void test_diff_patch__can_strip_bad_utf8(void)
+{
+	const char *a = "A " UTF8_HUNK_HEADER
+		"  B\n"
+		"  C\n"
+		"  D\n"
+		"  E\n"
+		"  F\n"
+		"  G\n"
+		"  H\n"
+		"  I\n"
+		"  J\n"
+		"  K\n"
+		"L  " UTF8_HUNK_HEADER
+		"  M\n"
+		"  N\n"
+		"  O\n"
+		"  P\n"
+		"  Q\n"
+		"  R\n"
+		"  S\n"
+		"  T\n"
+		"  U\n"
+		"  V\n";
+
+	const char *b = "A " UTF8_HUNK_HEADER
+		"  B\n"
+		"  C\n"
+		"  D\n"
+		"  E modified\n"
+		"  F\n"
+		"  G\n"
+		"  H\n"
+		"  I\n"
+		"  J\n"
+		"  K\n"
+		"L  " UTF8_HUNK_HEADER
+		"  M\n"
+		"  N\n"
+		"  O\n"
+		"  P modified\n"
+		"  Q\n"
+		"  R\n"
+		"  S\n"
+		"  T\n"
+		"  U\n"
+		"  V\n";
+
+	const char *expected = "diff --git a/file b/file\n"
+		"index d0647c4..7827ce5 100644\n"
+		"--- a/file\n"
+		"+++ b/file\n"
+		"@@ -2,7 +2,7 @@ A " UTF8_TRUNCATED_A_HUNK_HEADER
+		"   B\n"
+		"   C\n"
+		"   D\n"
+		"-  E\n"
+		"+  E modified\n"
+		"   F\n"
+		"   G\n"
+		"   H\n"
+		"@@ -13,7 +13,7 @@ L  " UTF8_TRUNCATED_L_HUNK_HEADER
+		"   M\n"
+		"   N\n"
+		"   O\n"
+		"-  P\n"
+		"+  P modified\n"
+		"   Q\n"
+		"   R\n"
+		"   S\n";
+
+	git_diff_options opts;
+	git_patch *patch;
+	git_buf buf = GIT_BUF_INIT;
+
+	cl_git_pass(git_diff_init_options(&opts, GIT_DIFF_OPTIONS_VERSION));
+
+	cl_git_pass(git_patch_from_buffers(&patch, a, strlen(a), NULL, b, strlen(b), NULL, &opts));
+	cl_git_pass(git_patch_to_buf(&buf, patch));
+
+	cl_assert_equal_s(expected, buf.ptr);
+
+	git_patch_free(patch);
+	git_buf_free(&buf);
+}