Commit ad6f2153ed06405477704f4f0eefe08ab0c0d807

Edward Thomson 2019-05-21T12:50:46

utf8: use size_t for length of buffer

`git__utf8_charlen` now takes a `size_t` as the buffer length, since that argument holds the full length of the buffer at the current position. It now returns `-1` in all cases where the UTF-8 codepoints are invalid, since callers only care about the length of a valid sequence of codepoints, or whether the current position is not valid UTF-8.

diff --git a/src/util.c b/src/util.c
index 508dce5..48cbc13 100644
--- a/src/util.c
+++ b/src/util.c
@@ -802,23 +802,23 @@ static const int8_t utf8proc_utf8class[256] = {
 	4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0
 };
 
-int git__utf8_charlen(const uint8_t *str, int str_len)
+int git__utf8_charlen(const uint8_t *str, size_t str_len)
 {
-	int length, i;
+	size_t length, i;
 
 	length = utf8proc_utf8class[str[0]];
 	if (!length)
 		return -1;
 
-	if (str_len >= 0 && length > str_len)
-		return -str_len;
+	if (str_len > 0 && length > str_len)
+		return -1;
 
 	for (i = 1; i < length; i++) {
 		if ((str[i] & 0xC0) != 0x80)
-			return -i;
+			return -1;
 	}
 
-	return length;
+	return (int)length;
 }
 
 int git__utf8_iterate(const uint8_t *str, int str_len, int32_t *dst)