Commit 1d95b59b4dbd8eda3f83f8af2a4ae07c7cdfc245

Edward Thomson 2021-04-14T15:47:27

utf8: refactor utf8 functions. Move the utf8 functions into a proper `git_utf8` namespace instead of leaving them in the namespaceless `git__` function group. Update them to take out-params first and to use `char *` instead of `uint8_t *`, matching our API, which treats strings as `char *` (even if they truly contain `uchar`s inside).

diff --git a/src/common.h b/src/common.h
index bda3edb..9123fb5 100644
--- a/src/common.h
+++ b/src/common.h
@@ -83,6 +83,7 @@
 #include "thread.h"
 #include "integer.h"
 #include "assert_safe.h"
+#include "utf8.h"
 
 /*
  * Include the declarations for deprecated functions; this ensures
diff --git a/src/diff_xdiff.c b/src/diff_xdiff.c
index c4668fa..8622623 100644
--- a/src/diff_xdiff.c
+++ b/src/diff_xdiff.c
@@ -6,7 +6,6 @@
  */
 
 #include "diff_xdiff.h"
-#include "util.h"
 
 #include "git2/errors.h"
 #include "diff.h"
@@ -128,7 +127,7 @@ static int git_xdiff_cb(void *priv, mmbuffer_t *bufs, int len)
 			info->hunk.header_len = sizeof(info->hunk.header) - 1;
 
 		/* Sanitize the hunk header in case there is invalid Unicode */
-		buffer_len = git__utf8_valid_buf_length((const uint8_t *) bufs[0].ptr, info->hunk.header_len);
+		buffer_len = git_utf8_valid_buf_length(bufs[0].ptr, info->hunk.header_len);
 		/* Sanitizing the hunk header may delete the newline, so add it back again if there is room */
 		if (buffer_len < info->hunk.header_len) {
 			bufs[0].ptr[buffer_len] = '\n';
diff --git a/src/path.c b/src/path.c
index dde3efb..8ebb581 100644
--- a/src/path.c
+++ b/src/path.c
@@ -1562,8 +1562,8 @@ GIT_INLINE(bool) verify_dospath(
 static int32_t next_hfs_char(const char **in, size_t *len)
 {
 	while (*len) {
-		int32_t codepoint;
-		int cp_len = git__utf8_iterate((const uint8_t *)(*in), (int)(*len), &codepoint);
+		uint32_t codepoint;
+		int cp_len = git_utf8_iterate(&codepoint, *in, *len);
 		if (cp_len < 0)
 			return -1;
 
@@ -1595,7 +1595,7 @@ static int32_t next_hfs_char(const char **in, size_t *len)
 		 * the ASCII range, which is perfectly fine, because the
 		 * git folder name can only be composed of ascii characters
 		 */
-		return git__tolower(codepoint);
+		return git__tolower((int)codepoint);
 	}
 	return 0; /* NULL byte -- end of string */
 }
diff --git a/src/utf8.c b/src/utf8.c
new file mode 100644
index 0000000..1a37da6
--- /dev/null
+++ b/src/utf8.c
@@ -0,0 +1,132 @@
+/*
+ * Copyright (C) the libgit2 contributors. All rights reserved.
+ *
+ * This file is part of libgit2, distributed under the GNU GPL v2 with
+ * a Linking Exception. For full terms see the included COPYING file.
+ */
+
+#include "utf8.h"
+
+#include "common.h"
+
+/*
+ * git_utf8_iterate is taken from the utf8proc project,
+ * http://www.public-software-group.org/utf8proc
+ *
+ * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+static const uint8_t utf8proc_utf8class[256] = { /* sequence length in bytes, indexed by lead byte; 0 = cannot start a sequence */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00-0x0F: one byte */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10-0x1F */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20-0x2F */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30-0x3F */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40-0x4F */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x50-0x5F */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60-0x6F */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x70-0x7F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80-0x8F: continuation bytes, not valid as a lead */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90-0x9F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0-0xAF */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xB0-0xBF */
+	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xC0-0xCF: two bytes */
+	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xD0-0xDF */
+	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xE0-0xEF: three bytes */
+	4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 /* 0xF0-0xF7: four bytes; 0xF8-0xFF: rejected */
+};
+
+static int utf8_charlen(const uint8_t *str, size_t str_len)
+{
+	uint8_t length;
+	size_t i;
+	/* Byte length of the sequence starting at str, or -1 if malformed; the lead byte picks the length. */
+	length = utf8proc_utf8class[str[0]];
+	if (!length)
+		return -1;
+	/* Reject sequences cut short by the buffer. NOTE(review): str[0] is read even when str_len == 0 -- confirm callers never pass 0. */
+	if (str_len > 0 && length > str_len)
+		return -1;
+	/* Every byte after the lead must have the form 10xxxxxx. */
+	for (i = 1; i < length; i++) {
+		if ((str[i] & 0xC0) != 0x80)
+			return -1;
+	}
+	/* Only the byte structure was checked; the decoded value is not validated here. */
+	return (int)length;
+}
+
+int git_utf8_iterate(uint32_t *out, const char *_str, size_t str_len)
+{
+	const uint8_t *str = (const uint8_t *)_str;
+	uint32_t uc = 0;
+	int length;
+	/* Decode the codepoint at the start of _str into *out; returns its byte length, or -1 on error. */
+	*out = 0;
+	/* From here on, every failure path returns with *out still 0. */
+	if ((length = utf8_charlen(str, str_len)) < 0)
+		return -1;
+	/* Mask off the length-marker bits and assemble the payload; uc = -1 wraps to 0xFFFFFFFF and marks a rejected value. */
+	switch (length) {
+		case 1:
+			uc = str[0];
+			break;
+		case 2:
+			uc = ((str[0] & 0x1F) <<  6) + (str[1] & 0x3F);
+			if (uc < 0x80) uc = -1; /* overlong: value fits in one byte */
+			break;
+		case 3:
+			uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) <<  6)
+				+ (str[2] & 0x3F);
+			if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000) ||
+					(uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1; /* overlong, surrogate range, or U+FDD0..U+FDEF */
+			break;
+		case 4:
+			uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12)
+				+ ((str[2] & 0x3F) <<  6) + (str[3] & 0x3F);
+			if (uc < 0x10000 || uc >= 0x110000) uc = -1; /* overlong, or above U+10FFFF */
+			break;
+		default:
+			return -1;
+	}
+	/* Rejects values whose low 16 bits are 0xFFFE or 0xFFFF; this also catches the 0xFFFFFFFF marker set above. */
+	if ((uc & 0xFFFF) >= 0xFFFE)
+		return -1;
+	/* Success: store the codepoint and report how many bytes it occupied. */
+	*out = uc;
+	return length;
+}
+
+size_t git_utf8_valid_buf_length(const char *_str, size_t str_len)
+{
+	const uint8_t *str = (const uint8_t *)_str;
+	size_t offset = 0;
+	/* Walk the buffer one sequence at a time; offset counts the bytes accepted so far. */
+	while (offset < str_len) {
+		int length = utf8_charlen(str + offset, str_len - offset);
+		/* Stop at the first malformed or truncated sequence. */
+		if (length < 0)
+			break;
+		/* Only sequence structure is checked (via utf8_charlen); decoded values are not validated. */
+		offset += length;
+	}
+	/* Number of leading bytes made up of well-formed sequences. */
+	return offset;
+}
diff --git a/src/utf8.h b/src/utf8.h
new file mode 100644
index 0000000..71c8f3b
--- /dev/null
+++ b/src/utf8.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) the libgit2 contributors. All rights reserved.
+ *
+ * This file is part of libgit2, distributed under the GNU GPL v2 with
+ * a Linking Exception. For full terms see the included COPYING file.
+ */
+#ifndef INCLUDE_utf8_h__
+#define INCLUDE_utf8_h__
+
+#include "common.h"
+
+/**
+ * Iterate through a UTF-8 string, yielding one codepoint at a time.
+ *
+ * @param out pointer where to store the current codepoint; set to 0 on failure
+ * @param str current position in the string
+ * @param str_len size left in the string
+ * @return length in bytes of the read codepoint; -1 if the codepoint was invalid
+ */
+extern int git_utf8_iterate(uint32_t *out, const char *str, size_t str_len);
+
+/**
+ * Scan a UTF-8 string and stop at the first malformed or truncated
+ * byte sequence.
+ *
+ * @param str string to scan
+ * @param str_len size of the string
+ * @return length in bytes of the string that contains valid data
+ */
+extern size_t git_utf8_valid_buf_length(const char *str, size_t str_len);
+
+#endif
diff --git a/src/util.c b/src/util.c
index af825e4..c7af296 100644
--- a/src/util.c
+++ b/src/util.c
@@ -734,123 +734,6 @@ void git__qsort_r(
 #endif
 }
 
-/*
- * git__utf8_iterate is taken from the utf8proc project,
- * http://www.public-software-group.org/utf8proc
- *
- * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the ""Software""),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-static const int8_t utf8proc_utf8class[256] = {
-	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-	4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0
-};
-
-static int util_utf8_charlen(const uint8_t *str, size_t str_len)
-{
-	size_t length, i;
-
-	length = utf8proc_utf8class[str[0]];
-	if (!length)
-		return -1;
-
-	if (str_len > 0 && length > str_len)
-		return -1;
-
-	for (i = 1; i < length; i++) {
-		if ((str[i] & 0xC0) != 0x80)
-			return -1;
-	}
-
-	return (int)length;
-}
-
-int git__utf8_iterate(const uint8_t *str, int str_len, int32_t *dst)
-{
-	int length;
-	int32_t uc = -1;
-
-	*dst = -1;
-	length = util_utf8_charlen(str, str_len);
-	if (length < 0)
-		return -1;
-
-	switch (length) {
-		case 1:
-			uc = str[0];
-			break;
-		case 2:
-			uc = ((str[0] & 0x1F) <<  6) + (str[1] & 0x3F);
-			if (uc < 0x80) uc = -1;
-			break;
-		case 3:
-			uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) <<  6)
-				+ (str[2] & 0x3F);
-			if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000) ||
-					(uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1;
-			break;
-		case 4:
-			uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12)
-				+ ((str[2] & 0x3F) <<  6) + (str[3] & 0x3F);
-			if (uc < 0x10000 || uc >= 0x110000) uc = -1;
-			break;
-	}
-
-	if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE))
-		return -1;
-
-	*dst = uc;
-	return length;
-}
-
-size_t git__utf8_valid_buf_length(const uint8_t *str, size_t str_len)
-{
-	size_t offset = 0;
-
-	while (offset < str_len) {
-		int length = util_utf8_charlen(str + offset, str_len - offset);
-
-		if (length < 0)
-			break;
-
-		offset += length;
-	}
-
-	return offset;
-}
-
 #ifdef GIT_WIN32
 int git__getenv(git_buf *out, const char *name)
 {
diff --git a/src/util.h b/src/util.h
index 5076df2..dabd4c9 100644
--- a/src/util.h
+++ b/src/util.h
@@ -317,27 +317,6 @@ extern int git__date_rfc2822_fmt(char *out, size_t len, const git_time *date);
 extern size_t git__unescape(char *str);
 
 /*
- * Iterate through an UTF-8 string, yielding one
- * codepoint at a time.
- *
- * @param str current position in the string
- * @param str_len size left in the string; -1 if the string is NULL-terminated
- * @param dst pointer where to store the current codepoint
- * @return length in bytes of the read codepoint; -1 if the codepoint was invalid
- */
-extern int git__utf8_iterate(const uint8_t *str, int str_len, int32_t *dst);
-
-/*
- * Iterate through an UTF-8 string and stops after finding any invalid UTF-8
- * codepoints.
- *
- * @param str string to scan
- * @param str_len size of the string
- * @return length in bytes of the string that contains valid data
- */
-extern size_t git__utf8_valid_buf_length(const uint8_t *str, size_t str_len);
-
-/*
  * Safely zero-out memory, making sure that the compiler
  * doesn't optimize away the operation.
  */