Some similarity metric adjustments. This makes the text similarity metric treat \r as equivalent to \n and makes it skip whitespace immediately following a line terminator, so line indentation will have less effect on the difference measurement (and so \r\n will be treated as just a single line terminator). This also separates the text and binary hash calculators into two separate functions instead of having more if statements inside the loop. This should make it easier to have more differentiated heuristics in the future if we so wish.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136
diff --git a/src/buf_text.c b/src/buf_text.c
index ab583f8..49ec16a 100644
--- a/src/buf_text.c
+++ b/src/buf_text.c
@@ -232,7 +232,7 @@ struct git_buf_text_hashsig {
unsigned int pairs : 1;
};
-static int similarity_advance(git_buf_text_hashsig *sig, uint32_t hash)
+static int similarity_record_hash(git_buf_text_hashsig *sig, uint32_t hash)
{
if (sig->size >= sig->asize) {
size_t new_asize = sig->asize + 512;
@@ -248,31 +248,67 @@ static int similarity_advance(git_buf_text_hashsig *sig, uint32_t hash)
return 0;
}
-static int similarity_add_hashes(
+/*
+ * Fold `len` bytes starting at `ptr` into the running hash for text input.
+ *
+ * A '\r' or '\n', or a run that has already reached SIMILARITY_MAXRUN
+ * characters, ends the current run: the hash is recorded in `sig`, the
+ * running state is reset, and any whitespace directly after that point is
+ * skipped. The character that ends a run is not folded into any hash.
+ *
+ * The running hash and run length are read from, and written back to,
+ * `*hash_start` and `*hashlen_start`; both pointers must be non-NULL.
+ *
+ * Returns 0 on success, or the negative value reported by
+ * similarity_record_hash() if recording a hash fails.
+ */
+static int similarity_add_hashes_text(
git_buf_text_hashsig *sig,
uint32_t *hash_start,
size_t *hashlen_start,
const char *ptr,
size_t len)
{
+ /* must stay initialized: input with no terminator records no hash */
int error = 0;
const char *scan = ptr, *scan_end = ptr + len;
- char term = (sig->format == SIMILARITY_FORMAT_TEXT) ? '\n' : '\0';
- uint32_t hash = hash_start ? *hash_start : SIMILARITY_HASH_START;
- size_t hashlen = hashlen_start ? *hashlen_start : 0;
+ uint32_t hash = *hash_start;
+ size_t hashlen = *hashlen_start;
+
+ while (scan < scan_end) {
+ char ch = *scan++;
+
+ if (ch == '\r' || ch == '\n' || hashlen >= SIMILARITY_MAXRUN) {
+ if ((error = similarity_record_hash(sig, hash)) < 0)
+ break;
+
+ hash = SIMILARITY_HASH_START;
+ hashlen = 0;
+
+ /* skip all whitespace immediately after line ending */
+ while (scan < scan_end && git__isspace(*scan))
+ scan++;
+ } else {
+ hash = SIMILARITY_HASH_UPDATE(hash, ch);
+ hashlen++;
+ }
+ }
+
+ /* hand the partial run back so the next chunk can continue it */
+ *hash_start = hash;
+ *hashlen_start = hashlen;
+
+ return error;
+}
+
+/*
+ * Fold `len` bytes starting at `ptr` into the running hash for binary input.
+ *
+ * A NUL byte, or a run that has already reached SIMILARITY_MAXRUN bytes,
+ * ends the current run: the hash is recorded in `sig`, the running state is
+ * reset, and any NUL bytes directly after that point are skipped.
+ *
+ * The running hash and run length are read from, and written back to,
+ * `*hash_start` and `*hashlen_start`; both pointers must be non-NULL.
+ *
+ * Returns 0 on success, or the negative value reported by
+ * similarity_record_hash() if recording a hash fails.
+ */
+static int similarity_add_hashes_binary(
+ git_buf_text_hashsig *sig,
+ uint32_t *hash_start,
+ size_t *hashlen_start,
+ const char *ptr,
+ size_t len)
+{
+ /* must be initialized: input with no terminator records no hash */
+ int error = 0;
+ const char *scan = ptr, *scan_end = ptr + len;
+ uint32_t hash = *hash_start;
+ size_t hashlen = *hashlen_start;
while (scan < scan_end) {
char ch = *scan++;
- if (ch == term || hashlen >= SIMILARITY_MAXRUN) {
- if ((error = similarity_advance(sig, hash)) < 0)
+ if (!ch || hashlen >= SIMILARITY_MAXRUN) {
+ if ((error = similarity_record_hash(sig, hash)) < 0)
break;
hash = SIMILARITY_HASH_START;
hashlen = 0;
/* skip run of terminators */
- while (scan < scan_end && *scan == term)
+ while (scan < scan_end && !*scan)
scan++;
} else {
hash = SIMILARITY_HASH_UPDATE(hash, ch);
@@ -280,6 +316,28 @@ static int similarity_add_hashes(
}
}
+ *hash_start = hash;
+ *hashlen_start = hashlen;
+
+ return error;
+}
+
+static int similarity_add_hashes(
+ git_buf_text_hashsig *sig,
+ uint32_t *hash_start,
+ size_t *hashlen_start,
+ const char *ptr,
+ size_t len)
+{
+ int error = 0;
+ uint32_t hash = hash_start ? *hash_start : SIMILARITY_HASH_START;
+ size_t hashlen = hashlen_start ? *hashlen_start : 0;
+
+ if (sig->format == SIMILARITY_FORMAT_TEXT)
+ error = similarity_add_hashes_text(sig, &hash, &hashlen, ptr, len);
+ else
+ error = similarity_add_hashes_binary(sig, &hash, &hashlen, ptr, len);
+
if (hash_start)
*hash_start = hash;
if (hashlen_start)
@@ -287,7 +345,7 @@ static int similarity_add_hashes(
/* if we're not saving intermediate state, add final hash as needed */
if (!error && !hash_start && hashlen > 0)
- error = similarity_advance(sig, hash);
+ error = similarity_record_hash(sig, hash);
return error;
}
@@ -436,7 +494,7 @@ int git_buf_text_hashsig_create_fromfile(
p_close(fd);
if (!error && hashlen > 0)
- error = similarity_advance(sig, hash);
+ error = similarity_record_hash(sig, hash);
if (!error)
error = similarity_finalize_hashes(sig, generate_pairs);