Fix diff binary file detection In the process of adding tests for the max file size threshold (which treats files over a certain size as binary) there seem to be a number of problems in the new code with detecting binaries. This should fix those up, as well as add a test for the file size threshold stuff. Also, this un-deprecates `GIT_DIFF_LINE_ADD_EOFNL`, since I finally found a legitimate situation where it would be returned.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337
diff --git a/include/git2/diff.h b/include/git2/diff.h
index 4b4591a..05825c5 100644
--- a/include/git2/diff.h
+++ b/include/git2/diff.h
@@ -79,13 +79,28 @@ typedef struct {
*/
typedef struct git_diff_list git_diff_list;
+/**
+ * Flags that can be set for the file on side of a diff.
+ *
+ * Most of the flags are just for internal consumption by libgit2,
+ * but some of them may be interesting to external users. They are:
+ *
+ * - VALID_OID - the `oid` value is computed and correct
+ * - FREE_PATH - the `path` string is separated allocated memory
+ * - BINARY - this file should be considered binary data
+ * - NOT_BINARY - this file should be considered text data
+ * - FREE_DATA - the internal file data is kept in allocated memory
+ * - UNMAP_DATA - the internal file data is kept in mmap'ed memory
+ * - NO_DATA - this side of the diff should not be loaded
+ */
enum {
GIT_DIFF_FILE_VALID_OID = (1 << 0),
GIT_DIFF_FILE_FREE_PATH = (1 << 1),
GIT_DIFF_FILE_BINARY = (1 << 2),
GIT_DIFF_FILE_NOT_BINARY = (1 << 3),
GIT_DIFF_FILE_FREE_DATA = (1 << 4),
- GIT_DIFF_FILE_UNMAP_DATA = (1 << 5)
+ GIT_DIFF_FILE_UNMAP_DATA = (1 << 5),
+ GIT_DIFF_FILE_NO_DATA = (1 << 6),
};
/**
@@ -176,7 +191,7 @@ enum {
GIT_DIFF_LINE_CONTEXT = ' ',
GIT_DIFF_LINE_ADDITION = '+',
GIT_DIFF_LINE_DELETION = '-',
- GIT_DIFF_LINE_ADD_EOFNL = '\n', /**< DEPRECATED - will not be returned */
+ GIT_DIFF_LINE_ADD_EOFNL = '\n', /**< Removed line w/o LF & added one with */
GIT_DIFF_LINE_DEL_EOFNL = '\0', /**< LF was removed at end of file */
/* The following values will only be sent to a `git_diff_data_fn` when
diff --git a/src/diff_output.c b/src/diff_output.c
index dbef7dd..ea40c33 100644
--- a/src/diff_output.c
+++ b/src/diff_output.c
@@ -172,9 +172,13 @@ static void update_delta_is_binary(git_diff_delta *delta)
if ((delta->old_file.flags & GIT_DIFF_FILE_BINARY) != 0 ||
(delta->new_file.flags & GIT_DIFF_FILE_BINARY) != 0)
delta->binary = 1;
- else if ((delta->old_file.flags & GIT_DIFF_FILE_NOT_BINARY) != 0 &&
- (delta->new_file.flags & GIT_DIFF_FILE_NOT_BINARY) != 0)
+
+#define NOT_BINARY_FLAGS (GIT_DIFF_FILE_NOT_BINARY|GIT_DIFF_FILE_NO_DATA)
+
+ else if ((delta->old_file.flags & NOT_BINARY_FLAGS) != 0 &&
+ (delta->new_file.flags & NOT_BINARY_FLAGS) != 0)
delta->binary = 0;
+
/* otherwise leave delta->binary value untouched */
}
@@ -507,7 +511,7 @@ static int diff_delta_load(diff_delta_context *ctxt)
{
int error = 0;
git_diff_delta *delta = ctxt->delta;
- bool load_old = false, load_new = false, check_if_unmodified = false;
+ bool check_if_unmodified = false;
if (ctxt->loaded || !ctxt->delta)
return 0;
@@ -527,22 +531,33 @@ static int diff_delta_load(diff_delta_context *ctxt)
goto cleanup;
switch (delta->status) {
- case GIT_DELTA_ADDED: load_new = true; break;
- case GIT_DELTA_DELETED: load_old = true; break;
- case GIT_DELTA_MODIFIED: load_new = load_old = true; break;
- default: break;
+ case GIT_DELTA_ADDED:
+ delta->old_file.flags |= GIT_DIFF_FILE_NO_DATA;
+ break;
+ case GIT_DELTA_DELETED:
+ delta->new_file.flags |= GIT_DIFF_FILE_NO_DATA;
+ break;
+ case GIT_DELTA_MODIFIED:
+ break;
+ default:
+ delta->new_file.flags |= GIT_DIFF_FILE_NO_DATA;
+ delta->old_file.flags |= GIT_DIFF_FILE_NO_DATA;
+ break;
}
+#define CHECK_UNMODIFIED (GIT_DIFF_FILE_NO_DATA | GIT_DIFF_FILE_VALID_OID)
+
check_if_unmodified =
- (load_old && (delta->old_file.flags & GIT_DIFF_FILE_VALID_OID) == 0) ||
- (load_new && (delta->new_file.flags & GIT_DIFF_FILE_VALID_OID) == 0);
+ (delta->old_file.flags & CHECK_UNMODIFIED) == 0 &&
+ (delta->new_file.flags & CHECK_UNMODIFIED) == 0;
/* Always try to load workdir content first, since it may need to be
* filtered (and hence use 2x memory) and we want to minimize the max
* memory footprint during diff.
*/
- if (load_old && ctxt->old_src == GIT_ITERATOR_WORKDIR) {
+ if ((delta->old_file.flags & GIT_DIFF_FILE_NO_DATA) == 0 &&
+ ctxt->old_src == GIT_ITERATOR_WORKDIR) {
if ((error = get_workdir_content(
ctxt, &delta->old_file, &ctxt->old_data)) < 0)
goto cleanup;
@@ -550,7 +565,8 @@ static int diff_delta_load(diff_delta_context *ctxt)
goto cleanup;
}
- if (load_new && ctxt->new_src == GIT_ITERATOR_WORKDIR) {
+ if ((delta->new_file.flags & GIT_DIFF_FILE_NO_DATA) == 0 &&
+ ctxt->new_src == GIT_ITERATOR_WORKDIR) {
if ((error = get_workdir_content(
ctxt, &delta->new_file, &ctxt->new_data)) < 0)
goto cleanup;
@@ -558,7 +574,8 @@ static int diff_delta_load(diff_delta_context *ctxt)
goto cleanup;
}
- if (load_old && ctxt->old_src != GIT_ITERATOR_WORKDIR) {
+ if ((delta->old_file.flags & GIT_DIFF_FILE_NO_DATA) == 0 &&
+ ctxt->old_src != GIT_ITERATOR_WORKDIR) {
if ((error = get_blob_content(
ctxt, &delta->old_file, &ctxt->old_data, &ctxt->old_blob)) < 0)
goto cleanup;
@@ -566,7 +583,8 @@ static int diff_delta_load(diff_delta_context *ctxt)
goto cleanup;
}
- if (load_new && ctxt->new_src != GIT_ITERATOR_WORKDIR) {
+ if ((delta->new_file.flags & GIT_DIFF_FILE_NO_DATA) == 0 &&
+ ctxt->new_src != GIT_ITERATOR_WORKDIR) {
if ((error = get_blob_content(
ctxt, &delta->new_file, &ctxt->new_data, &ctxt->new_blob)) < 0)
goto cleanup;
@@ -588,6 +606,10 @@ static int diff_delta_load(diff_delta_context *ctxt)
}
cleanup:
+ /* last change to update binary flag */
+ if (delta->binary == -1)
+ update_delta_is_binary(delta);
+
ctxt->loaded = !error;
/* flag if we would want to diff the contents of these files */
@@ -629,9 +651,10 @@ static int diff_delta_cb(void *priv, mmbuffer_t *bufs, int len)
}
if (len == 3 && !ctxt->cb_error) {
- /* This should only happen if we are adding a line that does not
- * have a newline at the end and the old code did. In that case,
- * we have a ADD with a DEL_EOFNL as a pair.
+ /* If we have a '+' and a third buf, then we have added a line
+ * without a newline and the old code had one, so DEL_EOFNL.
+ * If we have a '-' and a third buf, then we have removed a line
+ * with out a newline but added a blank line, so ADD_EOFNL.
*/
char origin =
(*bufs[0].ptr == '+') ? GIT_DIFF_LINE_DEL_EOFNL :
@@ -1036,6 +1059,7 @@ static void set_data_from_blob(
} else {
map->data = "";
file->size = map->len = 0;
+ file->flags |= GIT_DIFF_FILE_NO_DATA;
}
}
diff --git a/src/fileops.c b/src/fileops.c
index d4def1a..cbe3d47 100644
--- a/src/fileops.c
+++ b/src/fileops.c
@@ -115,40 +115,26 @@ mode_t git_futils_canonical_mode(mode_t raw_mode)
return 0;
}
-#define MAX_READ_STALLS 10
-
int git_futils_readbuffer_fd(git_buf *buf, git_file fd, size_t len)
{
- int stalls = MAX_READ_STALLS;
+ ssize_t read_size;
git_buf_clear(buf);
if (git_buf_grow(buf, len + 1) < 0)
return -1;
- buf->ptr[len] = '\0';
-
- while (len > 0) {
- ssize_t read_size = p_read(fd, buf->ptr + buf->size, len);
-
- if (read_size < 0) {
- giterr_set(GITERR_OS, "Failed to read descriptor");
- return -1;
- }
-
- if (read_size == 0) {
- stalls--;
+ /* p_read loops internally to read len bytes */
+ read_size = p_read(fd, buf->ptr, len);
- if (!stalls) {
- giterr_set(GITERR_OS, "Too many stalls reading descriptor");
- return -1;
- }
- }
-
- len -= read_size;
- buf->size += read_size;
+ if (read_size < 0) {
+ giterr_set(GITERR_OS, "Failed to read descriptor");
+ return -1;
}
+ buf->ptr[read_size] = '\0';
+ buf->size = read_size;
+
return 0;
}
diff --git a/tests-clar/diff/diff_helpers.c b/tests-clar/diff/diff_helpers.c
index ef59b68..767b343 100644
--- a/tests-clar/diff/diff_helpers.c
+++ b/tests-clar/diff/diff_helpers.c
@@ -89,7 +89,8 @@ int diff_line_fn(
e->line_adds++;
break;
case GIT_DIFF_LINE_ADD_EOFNL:
- assert(0);
+ /* technically not a line add, but we'll count it as such */
+ e->line_adds++;
break;
case GIT_DIFF_LINE_DELETION:
e->line_dels++;
diff --git a/tests-clar/diff/diffiter.c b/tests-clar/diff/diffiter.c
index 56c2547..23071e4 100644
--- a/tests-clar/diff/diffiter.c
+++ b/tests-clar/diff/diffiter.c
@@ -114,3 +114,88 @@ void test_diff_diffiter__iterate_files_and_hunks(void)
git_diff_iterator_free(iter);
git_diff_list_free(diff);
}
+
+void test_diff_diffiter__max_size_threshold(void)
+{
+ git_repository *repo = cl_git_sandbox_init("status");
+ git_diff_options opts = {0};
+ git_diff_list *diff = NULL;
+ git_diff_iterator *iter;
+ git_diff_delta *delta;
+ int error, file_count = 0, binary_count = 0, hunk_count = 0;
+
+ opts.context_lines = 3;
+ opts.interhunk_lines = 1;
+ opts.flags |= GIT_DIFF_INCLUDE_IGNORED | GIT_DIFF_INCLUDE_UNTRACKED;
+
+ cl_git_pass(git_diff_workdir_to_index(repo, &opts, &diff));
+ cl_git_pass(git_diff_iterator_new(&iter, diff));
+
+ while ((error = git_diff_iterator_next_file(&delta, iter)) != GIT_ITEROVER) {
+ cl_assert_equal_i(0, error);
+ cl_assert(delta);
+
+ file_count++;
+
+ hunk_count += git_diff_iterator_num_hunks_in_file(iter);
+
+ assert(delta->binary == 0 || delta->binary == 1);
+
+ binary_count += delta->binary;
+ }
+
+ cl_assert_equal_i(GIT_ITEROVER, error);
+ cl_assert(delta == NULL);
+
+ cl_assert_equal_i(13, file_count);
+ cl_assert_equal_i(0, binary_count);
+ cl_assert_equal_i(8, hunk_count);
+
+ git_diff_iterator_free(iter);
+ git_diff_list_free(diff);
+
+ /* try again with low file size threshold */
+
+ file_count = 0;
+ binary_count = 0;
+ hunk_count = 0;
+
+ opts.context_lines = 3;
+ opts.interhunk_lines = 1;
+ opts.flags |= GIT_DIFF_INCLUDE_IGNORED | GIT_DIFF_INCLUDE_UNTRACKED;
+ opts.max_size = 50; /* treat anything over 50 bytes as binary! */
+
+ cl_git_pass(git_diff_workdir_to_index(repo, &opts, &diff));
+ cl_git_pass(git_diff_iterator_new(&iter, diff));
+
+ while ((error = git_diff_iterator_next_file(&delta, iter)) != GIT_ITEROVER) {
+ cl_assert_equal_i(0, error);
+ cl_assert(delta);
+
+ file_count++;
+
+ hunk_count += git_diff_iterator_num_hunks_in_file(iter);
+
+ assert(delta->binary == 0 || delta->binary == 1);
+
+ binary_count += delta->binary;
+ }
+
+ cl_assert_equal_i(GIT_ITEROVER, error);
+ cl_assert(delta == NULL);
+
+ cl_assert_equal_i(13, file_count);
+
+ /* Three files are over the 50 byte threshold:
+ * - staged_changes_file_deleted
+ * - staged_changes_modified_file
+ * - staged_new_file_modified_file
+ */
+ cl_assert_equal_i(3, binary_count);
+
+ cl_assert_equal_i(5, hunk_count);
+
+ git_diff_iterator_free(iter);
+ git_diff_list_free(diff);
+
+}