utf8: introduce git_utf8_char_length Introduce a function to determine the number of Unicode characters in a given UTF-8 string.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
diff --git a/src/utf8.c b/src/utf8.c
index 1a37da6..77065cb 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -114,6 +114,24 @@ int git_utf8_iterate(uint32_t *out, const char *_str, size_t str_len)
return length;
}
+size_t git_utf8_char_length(const char *_str, size_t str_len) /* count the characters in the first str_len bytes of _str */
+{
+ const uint8_t *str = (const uint8_t *)_str; /* walk the input as unsigned bytes */
+ size_t offset = 0, count = 0; /* offset: bytes consumed so far; count: characters seen so far */
+
+ while (offset < str_len) {
+ int length = utf8_charlen(str + offset, str_len - offset); /* utf8_charlen is defined outside this hunk; presumably the byte length of the next character -- confirm */
+
+ if (length < 0) /* a negative result is treated as "no valid character starts here" */
+ length = 1; /* consume just this one byte so the scan always moves forward */
+
+ offset += length; /* step past the bytes just consumed */
+ count++; /* every step counts as one character, valid or not */
+ }
+
+ return count;
+}
+
size_t git_utf8_valid_buf_length(const char *_str, size_t str_len)
{
const uint8_t *str = (const uint8_t *)_str;
diff --git a/src/utf8.h b/src/utf8.h
index 71c8f3b..dff91b2 100644
--- a/src/utf8.h
+++ b/src/utf8.h
@@ -20,6 +20,26 @@
extern int git_utf8_iterate(uint32_t *out, const char *str, size_t str_len);
/**
+ * Returns the number of characters in the given string.
+ *
+ * Invalid input is counted rather than rejected: every byte that is
+ * not part of a valid UTF-8 sequence is counted as one character of
+ * its own.
+ *
+ * In other words:
+ * 0x24 (U+0024 "$") has length 1
+ * 0xc2 0xa2 (U+00A2 "¢") has length 1
+ * 0x24 0xc2 0xa2 (U+0024 U+00A2 "$¢") has length 2
+ * 0xf0 0x90 0x8d 0x88 (U+10348 "𐍈") has length 1
+ * 0x24 0xc0 0xc1 0x34 (U+0024 <invalid> <invalid> U+0034 "4") has length 4
+ *
+ * @param str string to scan
+ * @param str_len size of the string
+ * @return length in characters of the string
+ */
+extern size_t git_utf8_char_length(const char *str, size_t str_len);
+
+/**
* Iterate through an UTF-8 string and stops after finding any invalid UTF-8
* codepoints.
*
diff --git a/tests/core/utf8.c b/tests/core/utf8.c
new file mode 100644
index 0000000..021828e
--- /dev/null
+++ b/tests/core/utf8.c
@@ -0,0 +1,19 @@
+#include "clar_libgit2.h"
+
+void test_core_utf8__char_length(void) /* checks git_utf8_char_length on valid, truncated and invalid input */
+{
+ cl_assert_equal_i(0, git_utf8_char_length("", 0)); /* empty input has no characters */
+ cl_assert_equal_i(1, git_utf8_char_length("$", 1)); /* one single-byte character */
+ cl_assert_equal_i(5, git_utf8_char_length("abcde", 5)); /* five single-byte characters */
+ cl_assert_equal_i(1, git_utf8_char_length("\xc2\xa2", 2)); /* one two-byte character */
+ cl_assert_equal_i(2, git_utf8_char_length("\x24\xc2\xa2", 3)); /* one single-byte plus one two-byte character */
+ cl_assert_equal_i(1, git_utf8_char_length("\xf0\x90\x8d\x88", 4)); /* one four-byte character */
+
+ /* a lead byte that is not followed by its continuation byte is counted as a single character */
+ cl_assert_equal_i(2, git_utf8_char_length("\x24\xc2", 2)); /* input ends right after the 0xc2 lead byte */
+ cl_assert_equal_i(3, git_utf8_char_length("\x24\xc2\xc2\xa2", 4)); /* first 0xc2 stands alone; the second pairs with 0xa2 */
+
+ /* invalid characters are counted as single characters */
+ cl_assert_equal_i(4, git_utf8_char_length("\x24\xc0\xc0\x34", 4)); /* each 0xc0 byte is counted on its own */
+ cl_assert_equal_i(4, git_utf8_char_length("\x24\xf5\xfd\xc2", 4)); /* each of the three bytes after 0x24 is counted on its own */
+}