Commit 83e8a6b36acc67f2702cbbc7d4e334c7f7737719

Patrick Steinhardt 2018-10-18T16:08:46

util: provide `git__memmem` function Unfortunately, neither the `memmem` nor the `strnstr` functions are part of any C standard but are merely extensions of C that are implemented by e.g. glibc. Thus, there is no standardized way to search for a string in a block of memory with a limited size, and using `strstr` is to be considered unsafe in case where the buffer has not been sanitized. In fact, there are some uses of `strstr` in exactly that unsafe way in our codebase. Provide a new function `git__memmem` that implements the `memmem` semantics. That is in a given haystack of `n` bytes, search for the occurrence of a byte sequence of `m` bytes and return a pointer to the first occurrence. The implementation chosen is the "Not So Naive" algorithm from [1]. It was chosen as the implementation is comparably simple while still being reasonably efficient in most cases. Preprocessing happens in constant time and space, searching has a time complexity of O(n*m) with a slightly sub-linear average case. [1]: http://www-igm.univ-mlv.fr/~lecroq/string/

diff --git a/src/util.c b/src/util.c
index 79b362f..d06df26 100644
--- a/src/util.c
+++ b/src/util.c
@@ -355,6 +355,47 @@ size_t git__linenlen(const char *buffer, size_t buffer_len)
 	return nl ? (size_t)(nl - buffer) + 1 : buffer_len;
 }
 
+/*
+ * Adapted Not So Naive algorithm from http://www-igm.univ-mlv.fr/~lecroq/string/
+ */
+const void * git__memmem(const void *haystack, size_t haystacklen,
+			 const void *needle, size_t needlelen)
+{
+	const char *h, *n;
+	size_t j, k, l;
+
+	if (needlelen > haystacklen || !haystacklen || !needlelen)
+		return NULL;
+
+	h = (const char *) haystack,
+	n = (const char *) needle;
+
+	if (needlelen == 1)
+		return memchr(haystack, *n, haystacklen);
+
+	if (n[0] == n[1]) {
+		k = 2;
+		l = 1;
+	} else {
+		k = 1;
+		l = 2;
+	}
+
+	j = 0;
+	while (j <= haystacklen - needlelen) {
+		if (n[1] != h[j + 1]) {
+			j += k;
+		} else {
+			if (memcmp(n + 2, h + j + 2, needlelen - 2) == 0 &&
+			    n[0] == h[j])
+				return h + j;
+			j += l;
+		}
+	}
+
+	return NULL;
+}
+
 void git__hexdump(const char *buffer, size_t len)
 {
 	static const size_t LINE_WIDTH = 16;
diff --git a/src/util.h b/src/util.h
index b6f5b75..9847266 100644
--- a/src/util.h
+++ b/src/util.h
@@ -113,6 +113,9 @@ GIT_INLINE(const void *) git__memrchr(const void *s, int c, size_t n)
 	return NULL;
 }
 
+extern const void * git__memmem(const void *haystack, size_t haystacklen,
+				const void *needle, size_t needlelen);
+
 typedef int (*git__tsort_cmp)(const void *a, const void *b);
 
 extern void git__tsort(void **dst, size_t size, git__tsort_cmp cmp);
diff --git a/tests/core/memmem.c b/tests/core/memmem.c
new file mode 100644
index 0000000..fd9986d
--- /dev/null
+++ b/tests/core/memmem.c
@@ -0,0 +1,46 @@
+#include "clar_libgit2.h"
+
+static void assert_found(const char *haystack, const char *needle, size_t expected_pos)
+{
+	cl_assert_equal_p(git__memmem(haystack, haystack ? strlen(haystack) : 0,
+				      needle, needle ? strlen(needle) : 0),
+			  haystack + expected_pos);
+}
+
+static void assert_absent(const char *haystack, const char *needle)
+{
+	cl_assert_equal_p(git__memmem(haystack, haystack ? strlen(haystack) : 0,
+				      needle, needle ? strlen(needle) : 0),
+			  NULL);
+}
+
+void test_core_memmem__found(void)
+{
+	assert_found("a", "a", 0);
+	assert_found("ab", "a", 0);
+	assert_found("ba", "a", 1);
+	assert_found("aa", "a", 0);
+	assert_found("aab", "aa", 0);
+	assert_found("baa", "aa", 1);
+	assert_found("dabc", "abc", 1);
+	assert_found("abababc", "abc", 4);
+}
+
+void test_core_memmem__absent(void)
+{
+	assert_absent("a", "b");
+	assert_absent("a", "aa");
+	assert_absent("ba", "ab");
+	assert_absent("ba", "ab");
+	assert_absent("abc", "abcd");
+	assert_absent("abcabcabc", "bcac");
+}
+
+void test_core_memmem__edgecases(void)
+{
+	assert_absent(NULL, NULL);
+	assert_absent("a", NULL);
+	assert_absent(NULL, "a");
+	assert_absent("", "a");
+	assert_absent("a", "");
+}