Commit 0a5c6028898e637544962c2c6b1ef8eeeb9c1d38

Carlos Martín Nieto 2015-11-04T10:30:48

blob: introduce creating a blob by writing into a stream. The pair of `git_blob_create_fromstream()` and `git_blob_create_fromstream_commit()` is meant to replace `git_blob_create_fromchunks()` by providing a way for a user to write a new blob when they want filtering or they do not know the size. This approach allows the caller to retain control over when to add data to this buffer and is a more natural fit into higher-level languages' own stream abstractions instead of having to handle IO wait in the callback. The in-memory buffer size of 2MB is chosen somewhat arbitrarily to be a round multiple of usual page sizes and a value where most blobs seem likely to be either way below or way over that size. It's also a round number of pages. This implementation re-uses the helper we have from `_fromchunks()` so we end up writing everything to disk, but hopefully more efficiently than with a default filebuf. A later optimisation can be to avoid writing the in-memory contents to disk, with some extra complexity.

diff --git a/include/git2/blob.h b/include/git2/blob.h
index 9a57c37..f451593 100644
--- a/include/git2/blob.h
+++ b/include/git2/blob.h
@@ -192,6 +192,49 @@ GIT_EXTERN(int) git_blob_create_fromchunks(
 	void *payload);
 
 /**
+ * Create a stream to write a new blob into the object db
+ *
+ * This function may need to buffer the data on disk and will in
+ * general not be the right choice if you know the size of the data
+ * to write. If you have data in memory, use
+ * `git_blob_create_frombuffer()`. If you do not, but know the size of
+ * the contents (and don't want/need to perform filtering), use
+ * `git_odb_open_wstream()`.
+ *
+ * Don't close this stream yourself but pass it to
+ * `git_blob_create_fromstream_commit()` to commit the write to the
+ * object db and get the object id.
+ *
+ * If the `hintpath` parameter is filled, it will be used to determine
+ * what git filters should be applied to the object before it is written
+ * to the object database.
+ *
+ * @param out the stream into which to write; only set on success
+ * @param repo Repository where the blob will be written.
+ *        This repository can be bare or not.
+ * @param hintpath If not NULL, will be used to select data filters
+ *        to apply onto the content of the blob; the string is copied.
+ * @return 0 or error code
+ */
+GIT_EXTERN(int) git_blob_create_fromstream(
+	git_writestream **out,
+	git_repository *repo,
+	const char *hintpath);
+
+/**
+ * Close the stream and write the blob to the object db
+ *
+ * The stream will be closed and freed, whether or not the write succeeds.
+ *
+ * @param out the id of the new blob
+ * @param stream the stream to close; it must not be used after this call
+ * @return 0 or an error code
+ */
+GIT_EXTERN(int) git_blob_create_fromstream_commit(
+	git_oid *out,
+	git_writestream *stream);
+
+/**
  * Write an in-memory buffer to the ODB as a blob
  *
  * @param id return the id of the written blob
diff --git a/src/blob.c b/src/blob.c
index ad0f4ac..a1ef247 100644
--- a/src/blob.c
+++ b/src/blob.c
@@ -334,6 +334,98 @@ cleanup:
 	return error;
 }
 
+typedef struct {
+	git_writestream parent; /* must stay first: the callbacks cast a git_writestream pointer to this type */
+	git_filebuf fbuf; /* receives everything written to the stream */
+	git_repository *repo; /* repository handed to the blob creation helper on commit */
+	char *hintpath; /* owned copy of the caller's hint path; left unset when none was given */
+} blob_writestream;
+
+/* `close` callback: hands the filebuf to git_filebuf_cleanup() and reports success. */
+static int blob_writestream_close(git_writestream *_stream)
+{
+	blob_writestream *self = (blob_writestream *) _stream;
+	git_filebuf_cleanup(&self->fbuf);
+	return 0;
+}
+
+/* `free` callback: cleans up the filebuf, then frees the hint path and the stream itself. */
+static void blob_writestream_free(git_writestream *_stream)
+{
+	blob_writestream *self = (blob_writestream *) _stream;
+	git_filebuf_cleanup(&self->fbuf);
+	git__free(self->hintpath);
+	git__free(self);
+}
+
+/* `write` callback: forwards `len` bytes of `buffer` to git_filebuf_write() on the stream's filebuf. */
+static int blob_writestream_write(git_writestream *_stream, const char *buffer, size_t len)
+{
+	git_filebuf *dest = &((blob_writestream *) _stream)->fbuf;
+	return git_filebuf_write(dest, buffer, len);
+}
+
+/* Set up a blob write stream backed by a temporary filebuf; see blob.h for the contract. */
+int git_blob_create_fromstream(git_writestream **out, git_repository *repo, const char *hintpath)
+{
+	int error;
+	git_buf path = GIT_BUF_INIT;
+	blob_writestream *stream;
+
+	assert(out && repo);
+	stream = git__calloc(1, sizeof(blob_writestream));
+	GITERR_CHECK_ALLOC(stream);
+
+	/* Copy the hint; on failure leave through cleanup so `stream` is not leaked. */
+	if (hintpath && (stream->hintpath = git__strdup(hintpath)) == NULL) {
+		error = -1;
+		goto cleanup;
+	}
+
+	stream->repo = repo;
+	stream->parent.write = blob_writestream_write;
+	stream->parent.close = blob_writestream_close;
+	stream->parent.free  = blob_writestream_free;
+
+	if ((error = git_buf_joinpath(&path,
+				      git_repository_path(repo), GIT_OBJECTS_DIR "streamed")) < 0)
+		goto cleanup;
+
+	if ((error = git_filebuf_open_withsize(&stream->fbuf, git_buf_cstr(&path), GIT_FILEBUF_TEMPORARY,
+					       0666, 2 * 1024 * 1024)) < 0)
+		goto cleanup;
+
+	*out = (git_writestream *) stream;
+
+cleanup:
+	if (error < 0)
+		blob_writestream_free((git_writestream *) stream);
+	git_buf_free(&path);
+	return error;
+}
+
+/* Flush the stream, create the blob from the file it wrote, then free the stream. */
+int git_blob_create_fromstream_commit(git_oid *out, git_writestream *_stream)
+{
+	blob_writestream *self = (blob_writestream *) _stream;
+	int error;
+
+	/*
+	 * Everything goes through the on-disk file for now so the existing
+	 * path-based helper can be reused; skipping the disk round-trip
+	 * would be more efficient.
+	 */
+	error = git_filebuf_flush(&self->fbuf);
+	/* Only create the blob when the flush did not fail. */
+	if (error >= 0)
+		error = git_blob__create_from_paths(out, NULL, self->repo, self->fbuf.path_lock,
+						    self->hintpath, 0, !!self->hintpath);
+
+	/* The stream is released on both the success and the error path. */
+	blob_writestream_free(_stream);
+	return error;
+}
+
 int git_blob_is_binary(const git_blob *blob)
 {
 	git_buf content = GIT_BUF_INIT;
diff --git a/tests/object/blob/fromstream.c b/tests/object/blob/fromstream.c
new file mode 100644
index 0000000..10f2d8b
--- /dev/null
+++ b/tests/object/blob/fromstream.c
@@ -0,0 +1,103 @@
+#include "clar_libgit2.h"
+#include "buffer.h"
+#include "posix.h"
+#include "path.h"
+#include "fileops.h"
+
+static git_repository *repo; /* sandbox repository, assigned in the initialize function */
+static char textual_content[] = "libgit2\n\r\n\0"; /* the tests write strlen() bytes, so the explicit \0 is never sent */
+
+void test_object_blob_fromstream__initialize(void)
+{
+	repo = cl_git_sandbox_init("testrepo.git"); /* used by every test in this file */
+}
+
+void test_object_blob_fromstream__cleanup(void)
+{
+	cl_git_sandbox_cleanup(); /* counterpart of the cl_git_sandbox_init() call in initialize */
+}
+
+/* Chunk source: decrements the counter behind `payload`; once it hits zero
+ * returns 0, otherwise copies textual_content into `content` and returns its
+ * length. NOTE(review): nothing in this file references this callback. */
+static int text_chunked_source_cb(char *content, size_t max_length, void *payload)
+{
+	int *remaining = (int *)payload;
+
+	GIT_UNUSED(max_length);
+
+	if (--(*remaining) == 0)
+		return 0;
+
+	strcpy(content, textual_content);
+	return (int)strlen(textual_content);
+}
+
+void test_object_blob_fromstream__multiple_write(void)
+{
+	git_oid expected_id, id;
+	git_object *blob;
+	git_writestream *stream;
+	int i, howmany = 6;
+
+	cl_git_pass(git_oid_fromstr(&expected_id, "321cbdf08803c744082332332838df6bd160f8f9"));
+
+	/* The blob must not exist before it is streamed in. */
+	cl_git_fail_with(GIT_ENOTFOUND,
+			 git_object_lookup(&blob, repo, &expected_id, GIT_OBJ_ANY));
+
+	cl_git_pass(git_blob_create_fromstream(&stream, repo, NULL));
+	for (i = 0; i < howmany; i++)
+		cl_git_pass(stream->write(stream, textual_content, strlen(textual_content)));
+
+	/* Committing closes and frees the stream and yields the id of the new blob. */
+	cl_git_pass(git_blob_create_fromstream_commit(&id, stream));
+	cl_assert_equal_oid(&expected_id, &id);
+
+	cl_git_pass(git_object_lookup(&blob, repo, &expected_id, GIT_OBJ_BLOB));
+	git_object_free(blob);
+}
+
+#define GITATTR "* text=auto\n" \
+	"*.txt text\n" \
+	"*.data binary\n"
+
+/* Write GITATTR to "info/attributes" below the repository path. */
+static void write_attributes(git_repository *repo)
+{
+	git_buf attr_path = GIT_BUF_INIT;
+
+	cl_git_pass(git_buf_joinpath(&attr_path, git_repository_path(repo), "info"));
+	cl_git_pass(git_buf_joinpath(&attr_path, git_buf_cstr(&attr_path), "attributes"));
+	cl_git_pass(git_futils_mkpath2file(git_buf_cstr(&attr_path), 0777));
+	cl_git_rewritefile(git_buf_cstr(&attr_path), GITATTR);
+
+	git_buf_free(&attr_path);
+}
+
+/* Stream textual_content six times under `fake_name` and expect the blob id `expected_sha`. */
+static void assert_named_chunked_blob(const char *expected_sha, const char *fake_name)
+{
+	git_oid expected_id, id;
+	git_writestream *stream;
+	int i, howmany = 6;
+
+	cl_git_pass(git_oid_fromstr(&expected_id, expected_sha));
+
+	/* fake_name is passed as the hintpath that selects the filters to apply. */
+	cl_git_pass(git_blob_create_fromstream(&stream, repo, fake_name));
+	for (i = 0; i < howmany; i++)
+		cl_git_pass(stream->write(stream, textual_content, strlen(textual_content)));
+
+	cl_git_pass(git_blob_create_fromstream_commit(&id, stream));
+	cl_assert_equal_oid(&expected_id, &id);
+}
+
+void test_object_blob_fromstream__creating_a_blob_from_chunks_honors_the_attributes_directives(void)
+{
+	write_attributes(repo);
+	/* "*.data binary" expects the same id as the unhinted write; both text matches expect another id. */
+	assert_named_chunked_blob("321cbdf08803c744082332332838df6bd160f8f9", "dummy.data");
+	assert_named_chunked_blob("e9671e138a780833cb689753570fd10a55be84fb", "dummy.txt");
+	assert_named_chunked_blob("e9671e138a780833cb689753570fd10a55be84fb", "dummy.dunno");
+}