Sanitize the hunk header to ensure it contains UTF-8 valid data The diff driver truncates the hunk header text to 80 bytes, which can truncate 4-byte Unicode characters and introduce garbage characters in the diff output. This change sanitizes the hunk header before it is displayed. This mirrors the test in git: https://github.com/git/git/blob/master/t/t4025-hunk-header.sh Closes https://github.com/libgit2/rugged/issues/716
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192
diff --git a/src/diff_xdiff.c b/src/diff_xdiff.c
index 701eb1b..6907e52 100644
--- a/src/diff_xdiff.c
+++ b/src/diff_xdiff.c
@@ -6,6 +6,7 @@
*/
#include "diff_xdiff.h"
+#include "util.h"
#include "git2/errors.h"
#include "diff.h"
@@ -115,6 +116,7 @@ static int git_xdiff_cb(void *priv, mmbuffer_t *bufs, int len)
const git_diff_delta *delta = patch->base.delta;
git_patch_generated_output *output = &info->xo->output;
git_diff_line line;
+ size_t buffer_len;
if (len == 1) {
output->error = git_xdiff_parse_hunk(&info->hunk, bufs[0].ptr);
@@ -124,6 +126,16 @@ static int git_xdiff_cb(void *priv, mmbuffer_t *bufs, int len)
info->hunk.header_len = bufs[0].size;
if (info->hunk.header_len >= sizeof(info->hunk.header))
info->hunk.header_len = sizeof(info->hunk.header) - 1;
+
+ /* Sanitize the hunk header in case there is invalid Unicode */
+ buffer_len = git__utf8_valid_buf_length((const uint8_t *) bufs[0].ptr, info->hunk.header_len);
+ /* Sanitizing the hunk header may delete the newline, so add it back again if there is room */
+ if (buffer_len < info->hunk.header_len) {
+ bufs[0].ptr[buffer_len] = '\n';
+ buffer_len += 1;
+ info->hunk.header_len = buffer_len;
+ }
+
memcpy(info->hunk.header, bufs[0].ptr, info->hunk.header_len);
info->hunk.header[info->hunk.header_len] = '\0';
diff --git a/src/util.c b/src/util.c
index 2955b7c..bf778a9 100644
--- a/src/util.c
+++ b/src/util.c
@@ -806,6 +806,22 @@ double git_time_monotonic(void)
return git__timer();
}
+size_t git__utf8_valid_buf_length(const uint8_t *str, size_t str_len)
+{
+ size_t offset = 0;
+
+ while (offset < str_len) {
+ int length = git__utf8_charlen(str + offset, str_len - offset);
+
+ if (length < 0)
+ break;
+
+ offset += length;
+ }
+
+ return offset;
+}
+
#ifdef GIT_WIN32
int git__getenv(git_buf *out, const char *name)
{
diff --git a/src/util.h b/src/util.h
index f6d19cf..67ae4ef 100644
--- a/src/util.h
+++ b/src/util.h
@@ -454,6 +454,16 @@ extern size_t git__unescape(char *str);
extern int git__utf8_iterate(const uint8_t *str, int str_len, int32_t *dst);
/*
+ * Iterate through an UTF-8 string and stops after finding any invalid UTF-8
+ * codepoints.
+ *
+ * @param str string to scan
+ * @param str_len size of the string
+ * @return length in bytes of the string that contains valid data
+ */
+extern size_t git__utf8_valid_buf_length(const uint8_t *str, size_t str_len);
+
+/*
* Safely zero-out memory, making sure that the compiler
* doesn't optimize away the operation.
*/
diff --git a/tests/diff/patch.c b/tests/diff/patch.c
index 1184d19..4c83628 100644
--- a/tests/diff/patch.c
+++ b/tests/diff/patch.c
@@ -25,6 +25,12 @@ void test_diff_patch__cleanup(void)
#define EXPECTED_HUNK "@@ -1,2 +0,0 @@\n"
+#define UTF8_HUNK_HEADER "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\n"
+
+#define UTF8_TRUNCATED_A_HUNK_HEADER "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\n"
+
+#define UTF8_TRUNCATED_L_HUNK_HEADER "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\n"
+
static int check_removal_cb(
const git_diff_delta *delta,
const git_diff_hunk *hunk,
@@ -610,3 +616,89 @@ void test_diff_patch__line_counts_with_eofnl(void)
git_buf_free(&content);
}
+
+void test_diff_patch__can_strip_bad_utf8(void)
+{
+ const char *a = "A " UTF8_HUNK_HEADER
+ " B\n"
+ " C\n"
+ " D\n"
+ " E\n"
+ " F\n"
+ " G\n"
+ " H\n"
+ " I\n"
+ " J\n"
+ " K\n"
+ "L " UTF8_HUNK_HEADER
+ " M\n"
+ " N\n"
+ " O\n"
+ " P\n"
+ " Q\n"
+ " R\n"
+ " S\n"
+ " T\n"
+ " U\n"
+ " V\n";
+
+ const char *b = "A " UTF8_HUNK_HEADER
+ " B\n"
+ " C\n"
+ " D\n"
+ " E modified\n"
+ " F\n"
+ " G\n"
+ " H\n"
+ " I\n"
+ " J\n"
+ " K\n"
+ "L " UTF8_HUNK_HEADER
+ " M\n"
+ " N\n"
+ " O\n"
+ " P modified\n"
+ " Q\n"
+ " R\n"
+ " S\n"
+ " T\n"
+ " U\n"
+ " V\n";
+
+ const char *expected = "diff --git a/file b/file\n"
+ "index d0647c4..7827ce5 100644\n"
+ "--- a/file\n"
+ "+++ b/file\n"
+ "@@ -2,7 +2,7 @@ A " UTF8_TRUNCATED_A_HUNK_HEADER
+ " B\n"
+ " C\n"
+ " D\n"
+ "- E\n"
+ "+ E modified\n"
+ " F\n"
+ " G\n"
+ " H\n"
+ "@@ -13,7 +13,7 @@ L " UTF8_TRUNCATED_L_HUNK_HEADER
+ " M\n"
+ " N\n"
+ " O\n"
+ "- P\n"
+ "+ P modified\n"
+ " Q\n"
+ " R\n"
+ " S\n";
+
+ git_diff_options opts;
+ git_patch *patch;
+ git_buf buf = GIT_BUF_INIT;
+
+ cl_git_pass(git_diff_init_options(&opts, GIT_DIFF_OPTIONS_VERSION));
+
+ cl_git_pass(git_patch_from_buffers(&patch, a, strlen(a), NULL, b, strlen(b), NULL, &opts));
+ cl_git_pass(git_patch_to_buf(&buf, patch));
+
+ cl_assert_equal_s(expected, buf.ptr);
+
+ git_patch_free(patch);
+ git_buf_free(&buf);
+}