Commit 1f35e89dbf6e0be8952cc4324a45fd600be5ca05

Russell Belfer 2012-09-11T12:03:33

Fix diff binary file detection In the process of adding tests for the max file size threshold (which treats files over a certain size as binary) there seem to be a number of problems in the new code with detecting binaries. This should fix those up, as well as add a test for the file size threshold stuff. Also, this un-deprecates `GIT_DIFF_LINE_ADD_EOFNL`, since I finally found a legitimate situation where it would be returned.

diff --git a/include/git2/diff.h b/include/git2/diff.h
index 4b4591a..05825c5 100644
--- a/include/git2/diff.h
+++ b/include/git2/diff.h
@@ -79,13 +79,28 @@ typedef struct {
  */
 typedef struct git_diff_list git_diff_list;
 
+/**
+ * Flags that can be set for the file on side of a diff.
+ *
+ * Most of the flags are just for internal consumption by libgit2,
+ * but some of them may be interesting to external users.  They are:
+ *
+ * - VALID_OID  - the `oid` value is computed and correct
+ * - FREE_PATH  - the `path` string is separated allocated memory
+ * - BINARY     - this file should be considered binary data
+ * - NOT_BINARY - this file should be considered text data
+ * - FREE_DATA  - the internal file data is kept in allocated memory
+ * - UNMAP_DATA - the internal file data is kept in mmap'ed memory
+ * - NO_DATA    - this side of the diff should not be loaded
+ */
 enum {
 	GIT_DIFF_FILE_VALID_OID  = (1 << 0),
 	GIT_DIFF_FILE_FREE_PATH  = (1 << 1),
 	GIT_DIFF_FILE_BINARY     = (1 << 2),
 	GIT_DIFF_FILE_NOT_BINARY = (1 << 3),
 	GIT_DIFF_FILE_FREE_DATA  = (1 << 4),
-	GIT_DIFF_FILE_UNMAP_DATA = (1 << 5)
+	GIT_DIFF_FILE_UNMAP_DATA = (1 << 5),
+	GIT_DIFF_FILE_NO_DATA    = (1 << 6),
 };
 
 /**
@@ -176,7 +191,7 @@ enum {
 	GIT_DIFF_LINE_CONTEXT   = ' ',
 	GIT_DIFF_LINE_ADDITION  = '+',
 	GIT_DIFF_LINE_DELETION  = '-',
-	GIT_DIFF_LINE_ADD_EOFNL = '\n', /**< DEPRECATED - will not be returned */
+	GIT_DIFF_LINE_ADD_EOFNL = '\n', /**< Removed line w/o LF & added one with */
 	GIT_DIFF_LINE_DEL_EOFNL = '\0', /**< LF was removed at end of file */
 
 	/* The following values will only be sent to a `git_diff_data_fn` when
diff --git a/src/diff_output.c b/src/diff_output.c
index dbef7dd..ea40c33 100644
--- a/src/diff_output.c
+++ b/src/diff_output.c
@@ -172,9 +172,13 @@ static void update_delta_is_binary(git_diff_delta *delta)
 	if ((delta->old_file.flags & GIT_DIFF_FILE_BINARY) != 0 ||
 		(delta->new_file.flags & GIT_DIFF_FILE_BINARY) != 0)
 		delta->binary = 1;
-	else if ((delta->old_file.flags & GIT_DIFF_FILE_NOT_BINARY) != 0 &&
-			 (delta->new_file.flags & GIT_DIFF_FILE_NOT_BINARY) != 0)
+
+#define NOT_BINARY_FLAGS (GIT_DIFF_FILE_NOT_BINARY|GIT_DIFF_FILE_NO_DATA)
+
+	else if ((delta->old_file.flags & NOT_BINARY_FLAGS) != 0 &&
+			 (delta->new_file.flags & NOT_BINARY_FLAGS) != 0)
 		delta->binary = 0;
+
 	/* otherwise leave delta->binary value untouched */
 }
 
@@ -507,7 +511,7 @@ static int diff_delta_load(diff_delta_context *ctxt)
 {
 	int error = 0;
 	git_diff_delta *delta = ctxt->delta;
-	bool load_old = false, load_new = false, check_if_unmodified = false;
+	bool check_if_unmodified = false;
 
 	if (ctxt->loaded || !ctxt->delta)
 		return 0;
@@ -527,22 +531,33 @@ static int diff_delta_load(diff_delta_context *ctxt)
 		goto cleanup;
 
 	switch (delta->status) {
-	case GIT_DELTA_ADDED:    load_new = true; break;
-	case GIT_DELTA_DELETED:  load_old = true; break;
-	case GIT_DELTA_MODIFIED: load_new = load_old = true; break;
-	default: break;
+	case GIT_DELTA_ADDED:
+		delta->old_file.flags |= GIT_DIFF_FILE_NO_DATA;
+		break;
+	case GIT_DELTA_DELETED:
+		delta->new_file.flags |= GIT_DIFF_FILE_NO_DATA;
+		break;
+	case GIT_DELTA_MODIFIED:
+		break;
+	default:
+		delta->new_file.flags |= GIT_DIFF_FILE_NO_DATA;
+		delta->old_file.flags |= GIT_DIFF_FILE_NO_DATA;
+		break;
 	}
 
+#define CHECK_UNMODIFIED (GIT_DIFF_FILE_NO_DATA | GIT_DIFF_FILE_VALID_OID)
+
 	check_if_unmodified =
-		(load_old && (delta->old_file.flags & GIT_DIFF_FILE_VALID_OID) == 0) ||
-		(load_new && (delta->new_file.flags & GIT_DIFF_FILE_VALID_OID) == 0);
+		(delta->old_file.flags & CHECK_UNMODIFIED) == 0 &&
+		(delta->new_file.flags & CHECK_UNMODIFIED) == 0;
 
 	/* Always try to load workdir content first, since it may need to be
 	 * filtered (and hence use 2x memory) and we want to minimize the max
 	 * memory footprint during diff.
 	 */
 
-	if (load_old && ctxt->old_src == GIT_ITERATOR_WORKDIR) {
+	if ((delta->old_file.flags & GIT_DIFF_FILE_NO_DATA) == 0 &&
+		ctxt->old_src == GIT_ITERATOR_WORKDIR) {
 		if ((error = get_workdir_content(
 				ctxt, &delta->old_file, &ctxt->old_data)) < 0)
 			goto cleanup;
@@ -550,7 +565,8 @@ static int diff_delta_load(diff_delta_context *ctxt)
 			goto cleanup;
 	}
 
-	if (load_new && ctxt->new_src == GIT_ITERATOR_WORKDIR) {
+	if ((delta->new_file.flags & GIT_DIFF_FILE_NO_DATA) == 0 &&
+		ctxt->new_src == GIT_ITERATOR_WORKDIR) {
 		if ((error = get_workdir_content(
 				ctxt, &delta->new_file, &ctxt->new_data)) < 0)
 			goto cleanup;
@@ -558,7 +574,8 @@ static int diff_delta_load(diff_delta_context *ctxt)
 			goto cleanup;
 	}
 
-	if (load_old && ctxt->old_src != GIT_ITERATOR_WORKDIR) {
+	if ((delta->old_file.flags & GIT_DIFF_FILE_NO_DATA) == 0 &&
+		ctxt->old_src != GIT_ITERATOR_WORKDIR) {
 		if ((error = get_blob_content(
 				ctxt, &delta->old_file, &ctxt->old_data, &ctxt->old_blob)) < 0)
 			goto cleanup;
@@ -566,7 +583,8 @@ static int diff_delta_load(diff_delta_context *ctxt)
 			goto cleanup;
 	}
 
-	if (load_new && ctxt->new_src != GIT_ITERATOR_WORKDIR) {
+	if ((delta->new_file.flags & GIT_DIFF_FILE_NO_DATA) == 0 &&
+		ctxt->new_src != GIT_ITERATOR_WORKDIR) {
 		if ((error = get_blob_content(
 				ctxt, &delta->new_file, &ctxt->new_data, &ctxt->new_blob)) < 0)
 			goto cleanup;
@@ -588,6 +606,10 @@ static int diff_delta_load(diff_delta_context *ctxt)
 	}
 
 cleanup:
+	/* last change to update binary flag */
+	if (delta->binary == -1)
+		update_delta_is_binary(delta);
+
 	ctxt->loaded = !error;
 
 	/* flag if we would want to diff the contents of these files */
@@ -629,9 +651,10 @@ static int diff_delta_cb(void *priv, mmbuffer_t *bufs, int len)
 	}
 
 	if (len == 3 && !ctxt->cb_error) {
-		/* This should only happen if we are adding a line that does not
-		 * have a newline at the end and the old code did.  In that case,
-		 * we have a ADD with a DEL_EOFNL as a pair.
+		/* If we have a '+' and a third buf, then we have added a line
+		 * without a newline and the old code had one, so DEL_EOFNL.
+		 * If we have a '-' and a third buf, then we have removed a line
+		 * with out a newline but added a blank line, so ADD_EOFNL.
 		 */
 		char origin =
 			(*bufs[0].ptr == '+') ? GIT_DIFF_LINE_DEL_EOFNL :
@@ -1036,6 +1059,7 @@ static void set_data_from_blob(
 	} else {
 		map->data = "";
 		file->size = map->len = 0;
+		file->flags |= GIT_DIFF_FILE_NO_DATA;
 	}
 }
 
diff --git a/src/fileops.c b/src/fileops.c
index d4def1a..cbe3d47 100644
--- a/src/fileops.c
+++ b/src/fileops.c
@@ -115,40 +115,26 @@ mode_t git_futils_canonical_mode(mode_t raw_mode)
 		return 0;
 }
 
-#define MAX_READ_STALLS 10
-
 int git_futils_readbuffer_fd(git_buf *buf, git_file fd, size_t len)
 {
-	int stalls = MAX_READ_STALLS;
+	ssize_t read_size;
 
 	git_buf_clear(buf);
 
 	if (git_buf_grow(buf, len + 1) < 0)
 		return -1;
 
-	buf->ptr[len] = '\0';
-
-	while (len > 0) {
-		ssize_t read_size = p_read(fd, buf->ptr + buf->size, len);
-
-		if (read_size < 0) {
-			giterr_set(GITERR_OS, "Failed to read descriptor");
-			return -1;
-		}
-
-		if (read_size == 0) {
-			stalls--;
+	/* p_read loops internally to read len bytes */
+	read_size = p_read(fd, buf->ptr, len);
 
-			if (!stalls) {
-				giterr_set(GITERR_OS, "Too many stalls reading descriptor");
-				return -1;
-			}
-		}
-
-		len -= read_size;
-		buf->size += read_size;
+	if (read_size < 0) {
+		giterr_set(GITERR_OS, "Failed to read descriptor");
+		return -1;
 	}
 
+	buf->ptr[read_size] = '\0';
+	buf->size = read_size;
+
 	return 0;
 }
 
diff --git a/tests-clar/diff/diff_helpers.c b/tests-clar/diff/diff_helpers.c
index ef59b68..767b343 100644
--- a/tests-clar/diff/diff_helpers.c
+++ b/tests-clar/diff/diff_helpers.c
@@ -89,7 +89,8 @@ int diff_line_fn(
 		e->line_adds++;
 		break;
 	case GIT_DIFF_LINE_ADD_EOFNL:
-		assert(0);
+		/* technically not a line add, but we'll count it as such */
+		e->line_adds++;
 		break;
 	case GIT_DIFF_LINE_DELETION:
 		e->line_dels++;
diff --git a/tests-clar/diff/diffiter.c b/tests-clar/diff/diffiter.c
index 56c2547..23071e4 100644
--- a/tests-clar/diff/diffiter.c
+++ b/tests-clar/diff/diffiter.c
@@ -114,3 +114,88 @@ void test_diff_diffiter__iterate_files_and_hunks(void)
 	git_diff_iterator_free(iter);
 	git_diff_list_free(diff);
 }
+
+void test_diff_diffiter__max_size_threshold(void)
+{
+	git_repository *repo = cl_git_sandbox_init("status");
+	git_diff_options opts = {0};
+	git_diff_list *diff = NULL;
+	git_diff_iterator *iter;
+	git_diff_delta *delta;
+	int error, file_count = 0, binary_count = 0, hunk_count = 0;
+
+	opts.context_lines = 3;
+	opts.interhunk_lines = 1;
+	opts.flags |= GIT_DIFF_INCLUDE_IGNORED | GIT_DIFF_INCLUDE_UNTRACKED;
+
+	cl_git_pass(git_diff_workdir_to_index(repo, &opts, &diff));
+	cl_git_pass(git_diff_iterator_new(&iter, diff));
+
+	while ((error = git_diff_iterator_next_file(&delta, iter)) != GIT_ITEROVER) {
+		cl_assert_equal_i(0, error);
+		cl_assert(delta);
+
+		file_count++;
+
+		hunk_count += git_diff_iterator_num_hunks_in_file(iter);
+
+		assert(delta->binary == 0 || delta->binary == 1);
+
+		binary_count += delta->binary;
+	}
+
+	cl_assert_equal_i(GIT_ITEROVER, error);
+	cl_assert(delta == NULL);
+
+	cl_assert_equal_i(13, file_count);
+	cl_assert_equal_i(0, binary_count);
+	cl_assert_equal_i(8, hunk_count);
+
+	git_diff_iterator_free(iter);
+	git_diff_list_free(diff);
+
+	/* try again with low file size threshold */
+
+	file_count = 0;
+	binary_count = 0;
+	hunk_count = 0;
+
+	opts.context_lines = 3;
+	opts.interhunk_lines = 1;
+	opts.flags |= GIT_DIFF_INCLUDE_IGNORED | GIT_DIFF_INCLUDE_UNTRACKED;
+	opts.max_size = 50; /* treat anything over 50 bytes as binary! */
+
+	cl_git_pass(git_diff_workdir_to_index(repo, &opts, &diff));
+	cl_git_pass(git_diff_iterator_new(&iter, diff));
+
+	while ((error = git_diff_iterator_next_file(&delta, iter)) != GIT_ITEROVER) {
+		cl_assert_equal_i(0, error);
+		cl_assert(delta);
+
+		file_count++;
+
+		hunk_count += git_diff_iterator_num_hunks_in_file(iter);
+
+		assert(delta->binary == 0 || delta->binary == 1);
+
+		binary_count += delta->binary;
+	}
+
+	cl_assert_equal_i(GIT_ITEROVER, error);
+	cl_assert(delta == NULL);
+
+	cl_assert_equal_i(13, file_count);
+
+	/* Three files are over the 50 byte threshold:
+	 * - staged_changes_file_deleted
+	 * - staged_changes_modified_file
+	 * - staged_new_file_modified_file
+	 */
+	cl_assert_equal_i(3, binary_count);
+
+	cl_assert_equal_i(5, hunk_count);
+
+	git_diff_iterator_free(iter);
+	git_diff_list_free(diff);
+
+}