Commit 28a0741f1ae6f5e1261c8e73854dda69e7a61067

Patrick Steinhardt 2017-04-10T09:30:08

odb: verify object hashes The upstream git.git project verifies objects when looking them up from disk. This avoids scenarios where objects have somehow become corrupt on disk, e.g. due to hardware failures or bit flips. While our mantra is usually to follow upstream behavior, we do not do so in this case, as we never check hashes of objects we have just read from disk. To fix this, we create a new error class `GIT_EMISMATCH` which denotes that we have looked up an object with a hashsum mismatch. `odb_read_1` will then, after having read the object from its backend, hash the object and compare the resulting hash to the expected hash. If hashes do not match, it will return an error. This obviously introduces another computation of checksums and could potentially impact performance. Note though that we usually perform I/O operations directly before doing this computation, and as such the actual overhead should be drowned out by I/O. Running our test suite seems to confirm this guess. On a Linux system with best-of-five timings, we had 21.592s with the check enabled and 21.590s with the check disabled. Note though that our test suite mostly contains very small blobs only. It is expected that repositories with bigger blobs may notice an increased hit by this check. In addition to a new test, we also had to change the odb::backend::nonrefreshing test suite, which now triggers a hashsum mismatch when looking up the commit "deadbeef...". This is expected, as the fake backend allocated inside of the test will return an empty object for the OID "deadbeef...", which will obviously not hash back to "deadbeef..." again. We can simply adjust the hash to equal the hash of the empty object here to fix this test.

diff --git a/include/git2/errors.h b/include/git2/errors.h
index 71bff0f..6f55802 100644
--- a/include/git2/errors.h
+++ b/include/git2/errors.h
@@ -54,6 +54,7 @@ typedef enum {
 	GIT_PASSTHROUGH     = -30,	/**< Internal only */
 	GIT_ITEROVER        = -31,	/**< Signals end of iteration with iterator */
 	GIT_RETRY           = -32,	/**< Internal only */
+	GIT_EMISMATCH       = -33,	/**< Hashsum mismatch in object */
 } git_error_code;
 
 /**
diff --git a/src/odb.c b/src/odb.c
index cf321f5..0efa234 100644
--- a/src/odb.c
+++ b/src/odb.c
@@ -998,7 +998,9 @@ static int odb_read_1(git_odb_object **out, git_odb *db, const git_oid *id,
 	size_t i;
 	git_rawobj raw;
 	git_odb_object *object;
+	git_oid hashed;
 	bool found = false;
+	int error;
 
 	if (!only_refreshed && odb_read_hardcoded(&raw, id) == 0)
 		found = true;
@@ -1011,7 +1013,7 @@ static int odb_read_1(git_odb_object **out, git_odb *db, const git_oid *id,
 			continue;
 
 		if (b->read != NULL) {
-			int error = b->read(&raw.data, &raw.len, &raw.type, b, id);
+			error = b->read(&raw.data, &raw.len, &raw.type, b, id);
 			if (error == GIT_PASSTHROUGH || error == GIT_ENOTFOUND)
 				continue;
 
@@ -1025,12 +1027,24 @@ static int odb_read_1(git_odb_object **out, git_odb *db, const git_oid *id,
 	if (!found)
 		return GIT_ENOTFOUND;
 
+	if ((error = git_odb_hash(&hashed, raw.data, raw.len, raw.type)) < 0)
+		goto out;
+
+	if (!git_oid_equal(id, &hashed)) {
+		error = git_odb__error_mismatch(id, &hashed);
+		goto out;
+	}
+
 	giterr_clear();
 	if ((object = odb_object__alloc(id, &raw)) == NULL)
-		return -1;
+		goto out;
 
 	*out = git_cache_store_raw(odb_cache(db), object);
-	return 0;
+
+out:
+	if (error)
+		git__free(raw.data);
+	return error;
 }
 
 int git_odb_read(git_odb_object **out, git_odb *db, const git_oid *id)
@@ -1411,6 +1425,19 @@ int git_odb_refresh(struct git_odb *db)
 	return 0;
 }
 
+int git_odb__error_mismatch(const git_oid *expected, const git_oid *actual)
+{
+	char expected_oid[GIT_OID_HEXSZ + 1], actual_oid[GIT_OID_HEXSZ + 1];
+
+	git_oid_tostr(expected_oid, sizeof(expected_oid), expected);
+	git_oid_tostr(actual_oid, sizeof(actual_oid), actual);
+
+	giterr_set(GITERR_ODB, "object hash mismatch - expected %s but got %s",
+		expected_oid, actual_oid);
+
+	return GIT_EMISMATCH;
+}
+
 int git_odb__error_notfound(
 	const char *message, const git_oid *oid, size_t oid_len)
 {
diff --git a/src/odb.h b/src/odb.h
index 4f548bb..78c7b03 100644
--- a/src/odb.h
+++ b/src/odb.h
@@ -96,6 +96,12 @@ int git_odb__hashfd_filtered(
  */
 int git_odb__hashlink(git_oid *out, const char *path);
 
+/**
+ * Generate a GIT_EMISMATCH error for the ODB.
+ */
+int git_odb__error_mismatch(
+	const git_oid *expected, const git_oid *actual);
+
 /*
  * Generate a GIT_ENOTFOUND error for the ODB.
  */
diff --git a/tests/object/lookup.c b/tests/object/lookup.c
index 0116ee4..ed12f91 100644
--- a/tests/object/lookup.c
+++ b/tests/object/lookup.c
@@ -92,3 +92,25 @@ void test_object_lookup__lookup_corrupt_object_returns_error(void)
 	git_buf_free(&contents);
 }
 
+void test_object_lookup__lookup_object_with_wrong_hash_returns_error(void)
+{
+	const char *oldloose = "objects/8e/73b769e97678d684b809b163bebdae2911720f",
+	      *newloose = "objects/8e/73b769e97678d684b809b163bebdae2911720e",
+	      *commit = "8e73b769e97678d684b809b163bebdae2911720e";
+	git_buf oldpath = GIT_BUF_INIT, newpath = GIT_BUF_INIT;
+	git_object *object;
+	git_oid oid;
+
+	cl_git_pass(git_oid_fromstr(&oid, commit));
+
+	/* Copy object to another location with wrong hash */
+	cl_git_pass(git_buf_joinpath(&oldpath, git_repository_path(g_repo), oldloose));
+	cl_git_pass(git_buf_joinpath(&newpath, git_repository_path(g_repo), newloose));
+	cl_git_pass(git_futils_cp(oldpath.ptr, newpath.ptr, 0644));
+
+	/* Verify that lookup fails due to a hashsum mismatch */
+	cl_git_fail_with(GIT_EMISMATCH, git_object_lookup(&object, g_repo, &oid, GIT_OBJ_COMMIT));
+
+	git_buf_free(&oldpath);
+	git_buf_free(&newpath);
+}
diff --git a/tests/odb/backend/nonrefreshing.c b/tests/odb/backend/nonrefreshing.c
index 4c23628..f12ac74 100644
--- a/tests/odb/backend/nonrefreshing.c
+++ b/tests/odb/backend/nonrefreshing.c
@@ -18,6 +18,7 @@ static fake_backend *_fake;
 static git_oid _oid;
 
 #define HASH "deadbeefdeadbeefdeadbeefdeadbeefdeadbeef"
+#define EMPTY_HASH "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"
 
 static int fake_backend__exists(git_odb_backend *backend, const git_oid *oid)
 {
@@ -225,7 +226,7 @@ void test_odb_backend_nonrefreshing__read_is_invoked_once_on_success(void)
 {
 	git_object *obj;
 
-	setup_repository_and_backend(GIT_OK, HASH);
+	setup_repository_and_backend(GIT_OK, EMPTY_HASH);
 
 	cl_git_pass(git_object_lookup(&obj, _repo, &_oid, GIT_OBJ_ANY));