Commit c0b01b7572232bd6c0cd848e9b92ed8a8d678bcf

Edward Thomson 2013-08-19T18:46:26

Skip UTF-8 BOM in binary detection. When a git_buf contains a UTF-8 BOM, the three bytes comprising that BOM are treated as unprintable characters. For a small git_buf, the three BOM characters overwhelm the printable characters, so the buffer is misdetected as binary. This is problematic when trying to check out a small file, as the CR/LF filtering will not be applied to it.

diff --git a/src/buf_text.c b/src/buf_text.c
index 472339d..ecf592b 100644
--- a/src/buf_text.c
+++ b/src/buf_text.c
@@ -170,8 +170,14 @@ int git_buf_text_common_prefix(git_buf *buf, const git_strarray *strings)
 bool git_buf_text_is_binary(const git_buf *buf)
 {
 	const char *scan = buf->ptr, *end = buf->ptr + buf->size;
+	git_bom_t bom;
 	int printable = 0, nonprintable = 0;
 
+	scan += git_buf_text_detect_bom(&bom, buf, 0);
+
+	if (bom > GIT_BOM_UTF8)
+		return 1;
+
 	while (scan < end) {
 		unsigned char c = *scan++;
 
diff --git a/tests-clar/core/buffer.c b/tests-clar/core/buffer.c
index 9d9628c..8a0b671 100644
--- a/tests-clar/core/buffer.c
+++ b/tests-clar/core/buffer.c
@@ -718,6 +718,8 @@ void test_core_buffer__classify_with_utf8(void)
 	size_t data1len = 31;
 	char *data2 = "Internal NUL!!!\000\n\nI see you!\n";
 	size_t data2len = 29;
+	char *data3 = "\xef\xbb\xbfThis is UTF-8 with a BOM.\n";
+	size_t data3len = 20;
 	git_buf b;
 
 	b.ptr = data0; b.size = b.asize = data0len;
@@ -731,6 +733,10 @@ void test_core_buffer__classify_with_utf8(void)
 	b.ptr = data2; b.size = b.asize = data2len;
 	cl_assert(git_buf_text_is_binary(&b));
 	cl_assert(git_buf_text_contains_nul(&b));
+
+	b.ptr = data3; b.size = b.asize = data3len;
+	cl_assert(!git_buf_text_is_binary(&b));
+	cl_assert(!git_buf_text_contains_nul(&b));
 }
 
 #define SIMILARITY_TEST_DATA_1 \