Commit 8e35527de25ac156f3600e2ce49b0c3483c258c4

Vicent Marti 2014-12-16T13:03:02

path: Use UTF8 iteration for HFS chars

diff --git a/src/path.c b/src/path.c
index 724d9ed..b9c9729 100644
--- a/src/path.c
+++ b/src/path.c
@@ -1282,93 +1282,57 @@ GIT_INLINE(bool) verify_dospath(
 		component[last] != ':');
 }
 
-GIT_INLINE(bool) verify_dotgit_hfs(const char *component, size_t len)
+/* Return the next codepoint of *in that is not in the ignore list below, with ASCII
+ * letters lowercased; 0 once *len is used up, -1 on invalid UTF-8. Advances *in / *len. */
+static int32_t next_hfs_char(const char **in, size_t *len)
 {
-	const unsigned char *c;
-	int git = 0, ign = 0;
-	unsigned char one, two;
-
-	while (len) {
-		switch (*(c = (const unsigned char *)component++)) {
-		case '.':
-			if (ign || git++ != 0)
-				return true;
-			break;
-		case 'g':
-		case 'G':
-			if (ign || git++ != 1)
-				return true;
-			break;
-		case 'i':
-		case 'I':
-			if (ign || git++ != 2)
-				return true;
-			break;
-		case 't':
-		case 'T':
-			if (ign || git++ != 3)
-				return true;
-			break;
-
-		case 0xe2:
-		case 0xef:
-			if (ign++ != 0)
-				return true;
-			one = *c;
-			break;
-
-		case 0x80:
-		case 0x81:
-			if (ign++ != 1 || one != 0xe2)
-				return true;
-			two = *c;
-			break;
-
-		case 0xbb:
-			if (ign++ != 1 || one != 0xef)
-				return true;
-			two = *c;
-			break;
-
-		case 0x8c:
-		case 0x8d:
-		case 0x8e:
-		case 0x8f:
-			if (ign != 2 || two != 0x80)
-				return true;
-			ign = 0;
-			break;
-
-		case 0xaa:
-		case 0xab:
-		case 0xac:
-		case 0xad:
-		case 0xae:
-			if (ign != 2 || (two != 0x80 && two != 0x81))
-				return true;
-			ign = 0;
-			break;
-
-		case 0xaf:
-			if (ign != 2 || two != 0x81)
-				return true;
-			ign = 0;
-			break;
-
-		case 0xbf:
-			if (ign != 2 || two != 0xbb)
-				return true;
-			ign = 0;
-			break;
+	while (*len) {
+		int32_t codepoint;
+		int cp_len = git__utf8_iterate((const uint8_t *)(*in), (int)(*len), &codepoint);
+		if (cp_len < 0)
+			return -1;
 
-		default:
-			return true;
+		(*in) += cp_len;
+		(*len) -= cp_len;
+
+		/* these code points are ignored completely */
+		switch (codepoint) {
+		case 0x200c: /* ZERO WIDTH NON-JOINER */
+		case 0x200d: /* ZERO WIDTH JOINER */
+		case 0x200e: /* LEFT-TO-RIGHT MARK */
+		case 0x200f: /* RIGHT-TO-LEFT MARK */
+		case 0x202a: /* LEFT-TO-RIGHT EMBEDDING */
+		case 0x202b: /* RIGHT-TO-LEFT EMBEDDING */
+		case 0x202c: /* POP DIRECTIONAL FORMATTING */
+		case 0x202d: /* LEFT-TO-RIGHT OVERRIDE */
+		case 0x202e: /* RIGHT-TO-LEFT OVERRIDE */
+		case 0x206a: /* INHIBIT SYMMETRIC SWAPPING */
+		case 0x206b: /* ACTIVATE SYMMETRIC SWAPPING */
+		case 0x206c: /* INHIBIT ARABIC FORM SHAPING */
+		case 0x206d: /* ACTIVATE ARABIC FORM SHAPING */
+		case 0x206e: /* NATIONAL DIGIT SHAPES */
+		case 0x206f: /* NOMINAL DIGIT SHAPES */
+		case 0xfeff: /* ZERO WIDTH NO-BREAK SPACE */
+			continue;
 		}
 
-		len--;
+		/* fold by hand, ASCII only: tolower() is undefined for arguments that
+		 * do not fit in an unsigned char, and ".git" is pure ASCII anyway */
+		return (codepoint >= 'A' && codepoint <= 'Z') ? codepoint + ('a' - 'A') : codepoint;
 	}
+	return 0; /* input exhausted -- reported to the caller as codepoint 0 */
+}
+
+static bool verify_dotgit_hfs(const char *path, size_t len)
+{
+	/* true unless the characters yielded by next_hfs_char() are exactly ".git" */
+	bool is_dotgit = next_hfs_char(&path, &len) == '.' &&
+		next_hfs_char(&path, &len) == 'g' &&
+		next_hfs_char(&path, &len) == 'i' &&
+		next_hfs_char(&path, &len) == 't' &&
+		next_hfs_char(&path, &len) == 0;
 
-	return (ign || git != 4);
+	return !is_dotgit;
 }
 
 GIT_INLINE(bool) verify_char(unsigned char c, unsigned int flags)
diff --git a/src/util.c b/src/util.c
index 6b0efbe..7ee3e2f 100644
--- a/src/util.c
+++ b/src/util.c
@@ -664,3 +664,79 @@ void git__insertsort_r(
 	if (freeswap)
 		git__free(swapel);
 }
+
+static const int8_t utf8proc_utf8class[256] = { /* first byte -> sequence length in bytes; 0 = cannot start a sequence */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00-0x0f */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x70-0x7f: last single-byte row */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80-0x8f */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xb0-0xbf: 0x80-0xbf never start a sequence */
+	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xc0-0xcf */
+	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xd0-0xdf */
+	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xe0-0xef */
+	4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 /* 0xf0-0xf7 start 4 bytes; 0xf8-0xff are rejected */
+};
+
+/* Byte length of the sequence starting at `str`; 0 if str_len is 0, negative if malformed or cut short. */
+int git__utf8_charlen(const uint8_t *str, int str_len)
+{
+	int length, i;
+
+	if (str_len == 0) /* empty input: there is no str[0] to classify */
+		return 0;
+	length = utf8proc_utf8class[str[0]];
+	if (!length)
+		return -1;
+	if (str_len >= 0 && length > str_len)
+		return -str_len;
+	/* with a negative str_len there is no bound; a NUL byte fails the check below */
+	for (i = 1; i < length; i++) {
+		if ((str[i] & 0xC0) != 0x80)
+			return -i;
+	}
+	return length;
+}
+
+/* Decode the codepoint at `str` into `*dst`; returns its length in bytes, or -1 if invalid. */
+int git__utf8_iterate(const uint8_t *str, int str_len, int32_t *dst)
+{
+	int32_t cp;
+	int nbytes = git__utf8_charlen(str, str_len);
+
+	*dst = -1; /* what the caller sees on every failure path */
+	if (nbytes < 0)
+		return -1;
+
+	if (nbytes == 1) {
+		cp = str[0];
+	} else if (nbytes == 2) {
+		cp = ((str[0] & 0x1F) << 6) + (str[1] & 0x3F);
+		if (cp < 0x80) /* value would have fit in a single byte */
+			return -1;
+	} else if (nbytes == 3) {
+		cp = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) << 6) + (str[2] & 0x3F);
+		if (cp < 0x800 || (cp >= 0xD800 && cp < 0xE000) ||
+		    (cp >= 0xFDD0 && cp < 0xFDF0))
+			return -1;
+	} else if (nbytes == 4) {
+		cp = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12) +
+			((str[2] & 0x3F) << 6) + (str[3] & 0x3F);
+		if (cp < 0x10000 || cp >= 0x110000)
+			return -1;
+	} else {
+		return -1; /* no usable length was reported */
+	}
+
+	/* values whose low 16 bits are 0xFFFE or 0xFFFF are refused as well */
+	if ((cp & 0xFFFF) >= 0xFFFE)
+		return -1;
+
+	*dst = cp;
+	return nbytes;
+}
diff --git a/src/util.h b/src/util.h
index 17cc089..7cfc0d6 100644
--- a/src/util.h
+++ b/src/util.h
@@ -368,6 +368,17 @@ extern int git__date_rfc2822_fmt(char *out, size_t len, const git_time *date);
 extern size_t git__unescape(char *str);
 
 /*
+ * Decode the codepoint at the start of a UTF-8 string, so that
+ * callers can walk the string one codepoint at a time.
+ *
+ * @param str current position in the string
+ * @param str_len size left in the string; -1 if the string is NUL-terminated
+ * @param dst pointer where to store the current codepoint; set to -1 on failure
+ * @return length in bytes of the read codepoint; -1 if the codepoint was invalid
+ */
+extern int git__utf8_iterate(const uint8_t *str, int str_len, int32_t *dst);
+
+/*
  * Safely zero-out memory, making sure that the compiler
  * doesn't optimize away the operation.
  */