Commit 36fc5497810f60cacdfab249c84583d25032a150

Pierre-Olivier Latour 2014-12-02T05:11:12

Added GIT_HASHSIG_ALLOW_SMALL_FILES to allow computing signatures for small files The implementation of the hashsig API disallows computing a signature on small files containing only a few lines. This new flag disables this behavior. git_diff_find_similar() sets this flag by default which means that rename / copy detection of small files will now work. This in turn affects the behavior of the git_status and git_blame APIs which will now detect rename of small files assuming the right options are passed.

diff --git a/include/git2/sys/hashsig.h b/include/git2/sys/hashsig.h
index 2bc32f3..09c19ae 100644
--- a/include/git2/sys/hashsig.h
+++ b/include/git2/sys/hashsig.h
@@ -12,33 +12,52 @@
 GIT_BEGIN_DECL
 
 /**
- * Similarity signature of line hashes for a buffer
+ * Similarity signature of arbitrary text content based on line hashes
  */
 typedef struct git_hashsig git_hashsig;
 
 /**
- * Options for hashsig calculation
+ * Options for hashsig computation
+ *
+ * The options GIT_HASHSIG_NORMAL, GIT_HASHSIG_IGNORE_WHITESPACE,
+ * GIT_HASHSIG_SMART_WHITESPACE are exclusive and should not be combined.
  */
 typedef enum {
-	GIT_HASHSIG_NORMAL = 0, /* use all data */
-	GIT_HASHSIG_IGNORE_WHITESPACE = 1, /* ignore whitespace */
-	GIT_HASHSIG_SMART_WHITESPACE = 2, /* ignore \r and all space after \n */
+	/**
+	 * Use all data
+	 */
+	GIT_HASHSIG_NORMAL = 0,
+
+	/**
+	 * Ignore whitespace
+	 */
+	GIT_HASHSIG_IGNORE_WHITESPACE = (1 << 0),
+
+	/**
+	 * Ignore \r and all space after \n
+	 */
+	GIT_HASHSIG_SMART_WHITESPACE = (1 << 1),
+
+	/**
+	 * Allow hashing of small files
+	 */
+	GIT_HASHSIG_ALLOW_SMALL_FILES = (1 << 2)
 } git_hashsig_option_t;
 
 /**
- * Build a similarity signature for a buffer
- *
- * If you have passed a whitespace-ignoring buffer, then the whitespace
- * will be removed from the buffer while it is being processed, modifying
- * the buffer in place.  Sorry about that!
+ * Compute a similarity signature for a text buffer
  *
- * This will return an error if the buffer doesn't contain enough data to
- * compute a valid signature.
+ * If you have passed the option GIT_HASHSIG_IGNORE_WHITESPACE, then the
+ * whitespace will be removed from the buffer while it is being processed,
+ * modifying the buffer in place. Sorry about that!
  *
- * @param out The array of hashed runs representing the file content
- * @param buf The contents of the file to hash
- * @param buflen The length of the data at `buf`
- * @param generate_pairwise_hashes Should pairwise runs be hashed
+ * @param out The computed similarity signature.
+ * @param buf The input buffer.
+ * @param buflen The input buffer size.
+ * @param opts The signature computation options (see above).
+ * @return 0 on success, GIT_EBUFS if the buffer doesn't contain enough data to
+ * compute a valid signature (unless GIT_HASHSIG_ALLOW_SMALL_FILES is set), or
+ * error code.
  */
 GIT_EXTERN(int) git_hashsig_create(
 	git_hashsig **out,
@@ -47,13 +66,17 @@ GIT_EXTERN(int) git_hashsig_create(
 	git_hashsig_option_t opts);
 
 /**
- * Build a similarity signature from a file
+ * Compute a similarity signature for a text file
  *
  * This walks through the file, only loading a maximum of 4K of file data at
- * a time.  Otherwise, it acts just like `git_hashsig_create`.
+ * a time. Otherwise, it acts just like `git_hashsig_create`.
  *
- * This will return an error if the file doesn't contain enough data to
- * compute a valid signature.
+ * @param out The computed similarity signature.
+ * @param path The path to the input file.
+ * @param opts The signature computation options (see above).
+ * @return 0 on success, GIT_EBUFS if the buffer doesn't contain enough data to
+ * compute a valid signature (unless GIT_HASHSIG_ALLOW_SMALL_FILES is set), or
+ * error code.
  */
 GIT_EXTERN(int) git_hashsig_create_fromfile(
 	git_hashsig **out,
@@ -62,13 +85,17 @@ GIT_EXTERN(int) git_hashsig_create_fromfile(
 
 /**
  * Release memory for a content similarity signature
+ *
+ * @param sig The similarity signature to free.
  */
 GIT_EXTERN(void) git_hashsig_free(git_hashsig *sig);
 
 /**
- * Measure similarity between two files
+ * Measure similarity score between two similarity signatures
  *
- * @return <0 for error, [0 to 100] as similarity score
+ * @param a The first similarity signature to compare.
+ * @param b The second similarity signature to compare.
+ * @return [0 to 100] on success as the similarity score, or error code.
  */
 GIT_EXTERN(int) git_hashsig_compare(
 	const git_hashsig *a,
diff --git a/src/diff_tform.c b/src/diff_tform.c
index d576317..9133a9b 100644
--- a/src/diff_tform.c
+++ b/src/diff_tform.c
@@ -219,34 +219,18 @@ int git_diff_find_similar__hashsig_for_file(
 	void **out, const git_diff_file *f, const char *path, void *p)
 {
 	git_hashsig_option_t opt = (git_hashsig_option_t)(intptr_t)p;
-	int error = 0;
 
 	GIT_UNUSED(f);
-	error = git_hashsig_create_fromfile((git_hashsig **)out, path, opt);
-
-	if (error == GIT_EBUFS) {
-		error = 0;
-		giterr_clear();
-	}
-
-	return error;
+	return git_hashsig_create_fromfile((git_hashsig **)out, path, opt);
 }
 
 int git_diff_find_similar__hashsig_for_buf(
 	void **out, const git_diff_file *f, const char *buf, size_t len, void *p)
 {
 	git_hashsig_option_t opt = (git_hashsig_option_t)(intptr_t)p;
-	int error = 0;
 
 	GIT_UNUSED(f);
-	error = git_hashsig_create((git_hashsig **)out, buf, len, opt);
-
-	if (error == GIT_EBUFS) {
-		error = 0;
-		giterr_clear();
-	}
-
-	return error;
+	return git_hashsig_create((git_hashsig **)out, buf, len, opt);
 }
 
 void git_diff_find_similar__hashsig_free(void *sig, void *payload)
@@ -258,8 +242,14 @@ void git_diff_find_similar__hashsig_free(void *sig, void *payload)
 int git_diff_find_similar__calc_similarity(
 	int *score, void *siga, void *sigb, void *payload)
 {
+	int error;
+
 	GIT_UNUSED(payload);
-	*score = git_hashsig_compare(siga, sigb);
+	error = git_hashsig_compare(siga, sigb);
+	if (error < 0)
+		return error;
+
+	*score = error;
 	return 0;
 }
 
@@ -273,6 +263,7 @@ static int normalize_find_opts(
 	const git_diff_find_options *given)
 {
 	git_config *cfg = NULL;
+	git_hashsig_option_t hashsig_opts;
 
 	GITERR_CHECK_VERSION(given, GIT_DIFF_FIND_OPTIONS_VERSION, "git_diff_find_options");
 
@@ -354,11 +345,13 @@ static int normalize_find_opts(
 		opts->metric->similarity = git_diff_find_similar__calc_similarity;
 
 		if (opts->flags & GIT_DIFF_FIND_IGNORE_WHITESPACE)
-			opts->metric->payload = (void *)GIT_HASHSIG_IGNORE_WHITESPACE;
+			hashsig_opts = GIT_HASHSIG_IGNORE_WHITESPACE;
 		else if (opts->flags & GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE)
-			opts->metric->payload = (void *)GIT_HASHSIG_NORMAL;
+			hashsig_opts = GIT_HASHSIG_NORMAL;
 		else
-			opts->metric->payload = (void *)GIT_HASHSIG_SMART_WHITESPACE;
+			hashsig_opts = GIT_HASHSIG_SMART_WHITESPACE;
+		hashsig_opts |= GIT_HASHSIG_ALLOW_SMALL_FILES;
+		opts->metric->payload = (void *)hashsig_opts;
 	}
 
 	return 0;
diff --git a/src/hashsig.c b/src/hashsig.c
index a6d5f20..0ddfed9 100644
--- a/src/hashsig.c
+++ b/src/hashsig.c
@@ -35,7 +35,6 @@ struct git_hashsig {
 	hashsig_heap mins;
 	hashsig_heap maxs;
 	git_hashsig_option_t opt;
-	int considered;
 };
 
 #define HEAP_LCHILD_OF(I) (((I)<<1)+1)
@@ -135,25 +134,23 @@ static void hashsig_in_progress_init(
 {
 	int i;
 
-	switch (sig->opt) {
-	case GIT_HASHSIG_IGNORE_WHITESPACE:
+	/* no more than one can be set */
+	assert(!(sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) ||
+		   !(sig->opt & GIT_HASHSIG_SMART_WHITESPACE));
+
+	if (sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) {
 		for (i = 0; i < 256; ++i)
 			prog->ignore_ch[i] = git__isspace_nonlf(i);
 		prog->use_ignores = 1;
-		break;
-	case GIT_HASHSIG_SMART_WHITESPACE:
+	} else if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE) {
 		for (i = 0; i < 256; ++i)
 			prog->ignore_ch[i] = git__isspace(i);
 		prog->use_ignores = 1;
-		break;
-	default:
+	} else {
 		memset(prog, 0, sizeof(*prog));
-		break;
 	}
 }
 
-#define HASHSIG_IN_PROGRESS_INIT { 1 }
-
 static int hashsig_add_hashes(
 	git_hashsig *sig,
 	const uint8_t *data,
@@ -174,12 +171,13 @@ static int hashsig_add_hashes(
 			if (use_ignores)
 				for (; scan < end && git__isspace_nonlf(ch); ch = *scan)
 					++scan;
-			else if (sig->opt != GIT_HASHSIG_NORMAL)
+			else if (sig->opt &
+					 (GIT_HASHSIG_IGNORE_WHITESPACE | GIT_HASHSIG_SMART_WHITESPACE))
 				for (; scan < end && ch == '\r'; ch = *scan)
 					++scan;
 
 			/* peek at next character to decide what to do next */
-			if (sig->opt == GIT_HASHSIG_SMART_WHITESPACE)
+			if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE)
 				use_ignores = (ch == '\n');
 
 			if (scan >= end)
@@ -198,8 +196,6 @@ static int hashsig_add_hashes(
 			hashsig_heap_insert(&sig->mins, (hashsig_t)state);
 			hashsig_heap_insert(&sig->maxs, (hashsig_t)state);
 
-			sig->considered++;
-
 			while (scan < end && (*scan == '\n' || !*scan))
 				++scan;
 		}
@@ -212,7 +208,8 @@ static int hashsig_add_hashes(
 
 static int hashsig_finalize_hashes(git_hashsig *sig)
 {
-	if (sig->mins.size < HASHSIG_HEAP_MIN_SIZE) {
+	if (sig->mins.size < HASHSIG_HEAP_MIN_SIZE &&
+		!(sig->opt & GIT_HASHSIG_ALLOW_SMALL_FILES)) {
 		giterr_set(GITERR_INVALID,
 			"File too small for similarity signature calculation");
 		return GIT_EBUFS;
diff --git a/tests/clar_libgit2.c b/tests/clar_libgit2.c
index 10f37ad..a8a8ba6 100644
--- a/tests/clar_libgit2.c
+++ b/tests/clar_libgit2.c
@@ -53,6 +53,11 @@ void cl_git_rewritefile(const char *path, const char *content)
 	cl_git_write2file(path, content, 0, O_WRONLY | O_CREAT | O_TRUNC, 0644);
 }
 
+void cl_git_rmfile(const char *filename)
+{
+	cl_must_pass(p_unlink(filename));
+}
+
 #ifdef GIT_WIN32
 
 #include "win32/utf-conv.h"
diff --git a/tests/clar_libgit2.h b/tests/clar_libgit2.h
index f515542..e1d62c8 100644
--- a/tests/clar_libgit2.h
+++ b/tests/clar_libgit2.h
@@ -112,6 +112,7 @@ void cl_git_append2file(const char *filename, const char *new_content);
 void cl_git_rewritefile(const char *filename, const char *new_content);
 void cl_git_write2file(const char *path, const char *data,
 	size_t datalen, int flags, unsigned int mode);
+void cl_git_rmfile(const char *filename);
 
 bool cl_toggle_filemode(const char *filename);
 bool cl_is_chmod_supported(void);
diff --git a/tests/diff/rename.c b/tests/diff/rename.c
index 4bc3eb5..28e0bf1 100644
--- a/tests/diff/rename.c
+++ b/tests/diff/rename.c
@@ -381,37 +381,53 @@ void test_diff_rename__not_exact_match(void)
 	git_tree_free(new_tree);
 }
 
-void test_diff_rename__handles_small_files(void)
+void test_diff_rename__test_small_files(void)
 {
-	const char *tree_sha = "2bc7f351d20b53f1c72c16c4b036e491c478c49a";
 	git_index *index;
-	git_tree *tree;
+	git_reference *head_reference;
+	git_commit *head_commit;
+	git_tree *head_tree;
+	git_tree *commit_tree;
+	git_signature *signature;
 	git_diff *diff;
-	git_diff_options diffopts = GIT_DIFF_OPTIONS_INIT;
-	git_diff_find_options opts = GIT_DIFF_FIND_OPTIONS_INIT;
+	git_oid oid;
+	const git_diff_delta *delta;
+	git_diff_options diff_options = GIT_DIFF_OPTIONS_INIT;
+	git_diff_find_options find_options = GIT_DIFF_FIND_OPTIONS_INIT;
 
 	cl_git_pass(git_repository_index(&index, g_repo));
 
-	tree = resolve_commit_oid_to_tree(g_repo, tree_sha);
+	cl_git_mkfile("renames/small.txt", "Hello World!\n");
+	cl_git_pass(git_index_add_bypath(index, "small.txt"));
 
-	cl_git_rewritefile("renames/songof7cities.txt", "single line\n");
-	cl_git_pass(git_index_add_bypath(index, "songof7cities.txt"));
+	cl_git_pass(git_repository_head(&head_reference, g_repo));
+	cl_git_pass(git_reference_peel((git_object**)&head_commit, head_reference, GIT_OBJ_COMMIT));
+	cl_git_pass(git_commit_tree(&head_tree, head_commit));
+	cl_git_pass(git_index_write_tree(&oid, index));
+	cl_git_pass(git_tree_lookup(&commit_tree, g_repo, &oid));
+	cl_git_pass(git_signature_new(&signature, "Rename", "rename@example.com", 1404157834, 0));
+	cl_git_pass(git_commit_create(&oid, g_repo, "HEAD", signature, signature, NULL, "Test commit", commit_tree, 1, (const git_commit**)&head_commit));
 
-	cl_git_rewritefile("renames/untimely.txt", "untimely\n");
-	cl_git_pass(git_index_add_bypath(index, "untimely.txt"));
+	cl_git_mkfile("renames/copy.txt", "Hello World!\n");
+	cl_git_rmfile("renames/small.txt");
 
-	/* Tests that we can invoke find_similar on small files
-	 * and that the GIT_EBUFS (too small) error code is not
-	 * propagated to the caller.
-	 */
-	cl_git_pass(git_diff_tree_to_index(&diff, g_repo, tree, index, &diffopts));
+	diff_options.flags = GIT_DIFF_INCLUDE_UNTRACKED;
+	cl_git_pass(git_diff_tree_to_workdir(&diff, g_repo, commit_tree, &diff_options));
+	find_options.flags = GIT_DIFF_FIND_RENAMES | GIT_DIFF_FIND_FOR_UNTRACKED;
+	cl_git_pass(git_diff_find_similar(diff, &find_options));
 
-	opts.flags = GIT_DIFF_FIND_RENAMES | GIT_DIFF_FIND_COPIES |
-		GIT_DIFF_FIND_AND_BREAK_REWRITES;
-	cl_git_pass(git_diff_find_similar(diff, &opts));
+	cl_assert_equal_i(git_diff_num_deltas(diff), 1);
+	delta = git_diff_get_delta(diff, 0);
+	cl_assert_equal_i(delta->status, GIT_DELTA_RENAMED);
+	cl_assert_equal_s(delta->old_file.path, "small.txt");
+	cl_assert_equal_s(delta->new_file.path, "copy.txt");
 
 	git_diff_free(diff);
-	git_tree_free(tree);
+	git_signature_free(signature);
+	git_tree_free(commit_tree);
+	git_tree_free(head_tree);
+	git_commit_free(head_commit);
+	git_reference_free(head_reference);
 	git_index_free(index);
 }