Merge pull request #2704
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 96bd9a1..e1c02f9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,9 @@ v0.22 + 1
 
 ### Changes or improvements
 
+* Updated binary identification in CRLF filtering to avoid false positives in
+  UTF-8 files.
+
 * Rename and copy detection is enabled for small files.
 
 ### API additions
diff --git a/src/buf_text.c b/src/buf_text.c
index cead599..cb3661e 100644
--- a/src/buf_text.c
+++ b/src/buf_text.c
@@ -191,7 +191,10 @@ bool git_buf_text_is_binary(const git_buf *buf)
 	while (scan < end) {
 		unsigned char c = *scan++;
 
-		if (c > 0x1F && c < 0x7F)
+		/* Printable characters are SPACE (0x20) and above, excluding DEL
+		 * (0x7F), plus the control characters BS, ESC and FF.
+		 */
+		if ((c > 0x1F && c != 127) || c == '\b' || c == '\033' || c == '\014')
 			printable++;
 		else if (c == '\0')
 			return true;
diff --git a/tests/checkout/crlf.c b/tests/checkout/crlf.c
index 496f83d..b6d4e94 100644
--- a/tests/checkout/crlf.c
+++ b/tests/checkout/crlf.c
@@ -106,6 +106,31 @@ void test_checkout_crlf__all_crlf_autocrlf_true(void)
 	check_file_contents("./crlf/all-crlf", ALL_CRLF_TEXT_RAW);
 }
 
+void test_checkout_crlf__detect_crlf_autocrlf_true_utf8(void)
+{
+	git_checkout_options opts = GIT_CHECKOUT_OPTIONS_INIT;
+	opts.checkout_strategy = GIT_CHECKOUT_SAFE_CREATE;
+
+	cl_repo_set_bool(g_repo, "core.autocrlf", true);
+
+	git_repository_set_head(g_repo, "refs/heads/utf8", NULL, NULL);
+	git_checkout_head(g_repo, &opts);
+
+	if (GIT_EOL_NATIVE == GIT_EOL_LF)
+	{
+		check_file_contents("./crlf/few-utf8-chars-lf.txt", FEW_UTF8_LF_RAW);
+		check_file_contents("./crlf/many-utf8-chars-lf.txt", MANY_UTF8_LF_RAW);
+	}
+	else
+	{
+		check_file_contents("./crlf/few-utf8-chars-lf.txt", FEW_UTF8_CRLF_RAW);
+		check_file_contents("./crlf/many-utf8-chars-lf.txt", MANY_UTF8_CRLF_RAW);
+	}
+
+	check_file_contents("./crlf/few-utf8-chars-crlf.txt", FEW_UTF8_CRLF_RAW);
+	check_file_contents("./crlf/many-utf8-chars-crlf.txt", MANY_UTF8_CRLF_RAW);
+}
+
 void test_checkout_crlf__autocrlf_true_index_size_is_filtered_size(void)
 {
 	git_index *index;
diff --git a/tests/core/buffer.c b/tests/core/buffer.c
index 87dec46..d28aa21 100644
--- a/tests/core/buffer.c
+++ b/tests/core/buffer.c
@@ -830,7 +830,7 @@ void test_core_buffer__classify_with_utf8(void)
 	cl_assert(!git_buf_text_contains_nul(&b));
 
 	b.ptr = data1; b.size = b.asize = data1len;
-	cl_assert(git_buf_text_is_binary(&b));
+	cl_assert(!git_buf_text_is_binary(&b));
 	cl_assert(!git_buf_text_contains_nul(&b));
 
 	b.ptr = data2; b.size = b.asize = data2len;
diff --git a/tests/filter/crlf.h b/tests/filter/crlf.h
index 9cb98ad..786edfc 100644
--- a/tests/filter/crlf.h
+++ b/tests/filter/crlf.h
@@ -22,4 +22,9 @@
 #define MORE_CRLF_TEXT_AS_LF	"crlf\ncrlf\nlf\ncrlf\ncrlf\n"
 #define MORE_LF_TEXT_AS_LF		"lf\nlf\ncrlf\nlf\nlf\n"
 
+#define FEW_UTF8_CRLF_RAW		"\xe2\x9a\xbdThe rest is ASCII01.\r\nThe rest is ASCII02.\r\nThe rest is ASCII03.\r\nThe rest is ASCII04.\r\nThe rest is ASCII05.\r\nThe rest is ASCII06.\r\nThe rest is ASCII07.\r\nThe rest is ASCII08.\r\nThe rest is ASCII09.\r\nThe rest is ASCII10.\r\nThe rest is ASCII11.\r\nThe rest is ASCII12.\r\nThe rest is ASCII13.\r\nThe rest is ASCII14.\r\nThe rest is ASCII15.\r\nThe rest is ASCII16.\r\nThe rest is ASCII17.\r\nThe rest is ASCII18.\r\nThe rest is ASCII19.\r\nThe rest is ASCII20.\r\nThe rest is ASCII21.\r\nThe rest is ASCII22.\r\n"
+#define FEW_UTF8_LF_RAW			"\xe2\x9a\xbdThe rest is ASCII01.\nThe rest is ASCII02.\nThe rest is ASCII03.\nThe rest is ASCII04.\nThe rest is ASCII05.\nThe rest is ASCII06.\nThe rest is ASCII07.\nThe rest is ASCII08.\nThe rest is ASCII09.\nThe rest is ASCII10.\nThe rest is ASCII11.\nThe rest is ASCII12.\nThe rest is ASCII13.\nThe rest is ASCII14.\nThe rest is ASCII15.\nThe rest is ASCII16.\nThe rest is ASCII17.\nThe rest is ASCII18.\nThe rest is ASCII19.\nThe rest is ASCII20.\nThe rest is ASCII21.\nThe rest is ASCII22.\n"
+#define MANY_UTF8_CRLF_RAW		"Lets sing!\r\n\xe2\x99\xab\xe2\x99\xaa\xe2\x99\xac\xe2\x99\xa9\r\nEat food\r\n\xf0\x9f\x8d\x85\xf0\x9f\x8d\x95\r\n"
+#define MANY_UTF8_LF_RAW		"Lets sing!\n\xe2\x99\xab\xe2\x99\xaa\xe2\x99\xac\xe2\x99\xa9\nEat food\n\xf0\x9f\x8d\x85\xf0\x9f\x8d\x95\n"
+
 #endif
diff --git a/tests/resources/crlf/.gitted/objects/0e/052888828a954ca17e5882638e3c6a083e75c0 b/tests/resources/crlf/.gitted/objects/0e/052888828a954ca17e5882638e3c6a083e75c0
new file mode 100644
index 0000000..746143f
Binary files /dev/null and b/tests/resources/crlf/.gitted/objects/0e/052888828a954ca17e5882638e3c6a083e75c0 differ
diff --git a/tests/resources/crlf/.gitted/objects/9a/6c3533fef19abd6eec8e61206b5c51982b80d9 b/tests/resources/crlf/.gitted/objects/9a/6c3533fef19abd6eec8e61206b5c51982b80d9
new file mode 100644
index 0000000..78fc8ae
Binary files /dev/null and b/tests/resources/crlf/.gitted/objects/9a/6c3533fef19abd6eec8e61206b5c51982b80d9 differ
diff --git a/tests/resources/crlf/.gitted/objects/a2/34455d62297f1856c4603686150c59fcb0aafe b/tests/resources/crlf/.gitted/objects/a2/34455d62297f1856c4603686150c59fcb0aafe
new file mode 100644
index 0000000..7d204f4
Binary files /dev/null and b/tests/resources/crlf/.gitted/objects/a2/34455d62297f1856c4603686150c59fcb0aafe differ
diff --git a/tests/resources/crlf/.gitted/objects/c3/e11722855ff260bd27418988ac1467c4e9e73a b/tests/resources/crlf/.gitted/objects/c3/e11722855ff260bd27418988ac1467c4e9e73a
new file mode 100644
index 0000000..5f96dc7
Binary files /dev/null and b/tests/resources/crlf/.gitted/objects/c3/e11722855ff260bd27418988ac1467c4e9e73a differ
diff --git a/tests/resources/crlf/.gitted/objects/cd/574f5a2baa4c79504f8837b730fa0b11defe99 b/tests/resources/crlf/.gitted/objects/cd/574f5a2baa4c79504f8837b730fa0b11defe99
new file mode 100644
index 0000000..e8d0202
Binary files /dev/null and b/tests/resources/crlf/.gitted/objects/cd/574f5a2baa4c79504f8837b730fa0b11defe99 differ
diff --git a/tests/resources/crlf/.gitted/objects/f4/d25b796d86387205a5498175d66e91d1e5006a b/tests/resources/crlf/.gitted/objects/f4/d25b796d86387205a5498175d66e91d1e5006a
new file mode 100644
index 0000000..792b165
Binary files /dev/null and b/tests/resources/crlf/.gitted/objects/f4/d25b796d86387205a5498175d66e91d1e5006a differ
diff --git a/tests/resources/crlf/.gitted/refs/heads/utf8 b/tests/resources/crlf/.gitted/refs/heads/utf8
index f8e6cf5..30ab61c 100644
--- a/tests/resources/crlf/.gitted/refs/heads/utf8
+++ b/tests/resources/crlf/.gitted/refs/heads/utf8
@@ -1 +1 @@
-2b55b4b94f655c857635b6a9005c056aa7de3532
+a234455d62297f1856c4603686150c59fcb0aafe