Match binary file check of core git in diff Core git just looks for NUL bytes in files when deciding about is-binary inside diff (although it uses a better algorithm in checkout, when deciding if CRLF conversion should be done). Libgit2 was using the better algorithm in both places, but that is causing some confusion. For now, this makes diff just look for NUL bytes to decide if a file is binary by content in diff.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84
diff --git a/src/buf_text.c b/src/buf_text.c
index a7122dc..0104a90 100644
--- a/src/buf_text.c
+++ b/src/buf_text.c
@@ -109,6 +109,11 @@ bool git_buf_text_is_binary(const git_buf *buf)
return ((printable >> 7) < nonprintable);
}
+bool git_buf_text_contains_nul(const git_buf *buf)
+{
+ return (strnlen(buf->ptr, buf->size) != buf->size);
+}
+
int git_buf_text_detect_bom(git_bom_t *bom, const git_buf *buf, size_t offset)
{
const char *ptr;
diff --git a/src/buf_text.h b/src/buf_text.h
index ae5e6ca..458ee33 100644
--- a/src/buf_text.h
+++ b/src/buf_text.h
@@ -71,6 +71,14 @@ extern int git_buf_text_common_prefix(git_buf *buf, const git_strarray *strs);
extern bool git_buf_text_is_binary(const git_buf *buf);
/**
+ * Check quickly if buffer contains a NUL byte
+ *
+ * @param buf Buffer to check
+ * @return true if buffer contains a NUL byte
+ */
+extern bool git_buf_text_contains_nul(const git_buf *buf);
+
+/**
* Check if a buffer begins with a UTF BOM
*
* @param bom Set to the type of BOM detected or GIT_BOM_NONE
diff --git a/src/diff_output.c b/src/diff_output.c
index f98665d..dcff788 100644
--- a/src/diff_output.c
+++ b/src/diff_output.c
@@ -142,7 +142,12 @@ static int diff_delta_is_binary_by_content(
GIT_UNUSED(ctxt);
if ((file->flags & KNOWN_BINARY_FLAGS) == 0) {
- if (git_buf_text_is_binary(&search))
+ /* TODO: provide encoding / binary detection callbacks that can
+ * be UTF-8 aware, etc. For now, instead of trying to be smart,
+ * let's just use the simple NUL-byte detection that core git uses.
+ */
+ /* previously was: if (git_buf_text_is_binary(&search)) */
+ if (git_buf_text_contains_nul(&search))
file->flags |= GIT_DIFF_FILE_BINARY;
else
file->flags |= GIT_DIFF_FILE_NOT_BINARY;
diff --git a/tests-clar/core/buffer.c b/tests-clar/core/buffer.c
index 40fc4c5..5d9b785 100644
--- a/tests-clar/core/buffer.c
+++ b/tests-clar/core/buffer.c
@@ -704,3 +704,26 @@ void test_core_buffer__base64(void)
git_buf_free(&buf);
}
+
+void test_core_buffer__classify_with_utf8(void)
+{
+ char *data0 = "Simple text\n";
+ size_t data0len = 12;
+ char *data1 = "Is that UTF-8 data I seeā¦\nYep!\n";
+ size_t data1len = 31;
+ char *data2 = "Internal NUL!!!\000\n\nI see you!\n";
+ size_t data2len = 29;
+ git_buf b;
+
+ b.ptr = data0; b.size = b.asize = data0len;
+ cl_assert(!git_buf_text_is_binary(&b));
+ cl_assert(!git_buf_text_contains_nul(&b));
+
+ b.ptr = data1; b.size = b.asize = data1len;
+ cl_assert(git_buf_text_is_binary(&b));
+ cl_assert(!git_buf_text_contains_nul(&b));
+
+ b.ptr = data2; b.size = b.asize = data2len;
+ cl_assert(git_buf_text_is_binary(&b));
+ cl_assert(git_buf_text_contains_nul(&b));
+}