Optimize away git_text_gather_stats in diff. GProf shows `git_text_gather_stats` as the most expensive call in large diffs. The function calculates a lot of information that is not actually used and does not do so in an optimal order. This introduces a tuned `git_buf_is_binary` function that executes the same algorithm in a fraction of the time.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70
diff --git a/src/buffer.c b/src/buffer.c
index ef95839..29aaf3f 100644
--- a/src/buffer.c
+++ b/src/buffer.c
@@ -445,3 +445,21 @@ int git_buf_common_prefix(git_buf *buf, const git_strarray *strings)
return 0;
}
+
+bool git_buf_is_binary(const git_buf *buf)
+{ /* Heuristic scan of the first buf->size bytes at buf->ptr; returns true when the content should be treated as binary. */
+ int i, printable = 0, nonprintable = 0; /* running byte counts; NOTE(review): i is int while buf->size may be an unsigned size type -- confirm */
+
+ for (i = 0; i < buf->size; i++) {
+ unsigned char c = buf->ptr[i]; /* read as unsigned so the range checks below see 0..255 */
+ if (c > 0x1F && c < 0x7f) /* 0x20..0x7E, i.e. printable ASCII */
+ printable++;
+ else if (c == '\0') /* a single NUL byte decides "binary" immediately, skipping the rest of the scan */
+ return true;
+ else if (!git__isspace(c)) /* control bytes, 0x7F and bytes >= 0x80 reach here; counted unless git__isspace() accepts them */
+ nonprintable++;
+ }
+
+ return ((printable >> 7) < nonprintable); /* binary when nonprintable exceeds printable / 128 (integer division) */
+}
+
diff --git a/src/buffer.h b/src/buffer.h
index af760f9..090b435 100644
--- a/src/buffer.h
+++ b/src/buffer.h
@@ -125,4 +125,7 @@ int git_buf_cmp(const git_buf *a, const git_buf *b);
/* Fill buf with the common prefix of a array of strings */
int git_buf_common_prefix(git_buf *buf, const git_strarray *strings);
+/* Check if buffer looks like it contains binary data */
+bool git_buf_is_binary(const git_buf *buf);
+
#endif
diff --git a/src/diff_output.c b/src/diff_output.c
index 9c8e079..4ad736e 100644
--- a/src/diff_output.c
+++ b/src/diff_output.c
@@ -174,15 +174,12 @@ static int file_is_binary_by_content(
git_map *new_data)
{
git_buf search;
- git_text_stats stats;
if ((delta->old_file.flags & BINARY_DIFF_FLAGS) == 0) {
search.ptr = old_data->data;
search.size = min(old_data->len, 4000);
- git_text_gather_stats(&stats, &search);
-
- if (git_text_is_binary(&stats))
+ if (git_buf_is_binary(&search))
delta->old_file.flags |= GIT_DIFF_FILE_BINARY;
else
delta->old_file.flags |= GIT_DIFF_FILE_NOT_BINARY;
@@ -192,9 +189,7 @@ static int file_is_binary_by_content(
search.ptr = new_data->data;
search.size = min(new_data->len, 4000);
- git_text_gather_stats(&stats, &search);
-
- if (git_text_is_binary(&stats))
+ if (git_buf_is_binary(&search))
delta->new_file.flags |= GIT_DIFF_FILE_BINARY;
else
delta->new_file.flags |= GIT_DIFF_FILE_NOT_BINARY;