Commit d582f26ced34c492facade402975ceabe4c97adc

Stefan Sperling 2020-03-18T16:13:48

write large objects to disk when resolving deltas; raise in-mem delta threshold

diff --git a/lib/fetch.c b/lib/fetch.c
index f5838d2..7b43b9f 100644
--- a/lib/fetch.c
+++ b/lib/fetch.c
@@ -62,6 +62,10 @@
 #include "got_lib_object_cache.h"
 #include "got_lib_repository.h"
 
+#ifndef nitems
+#define nitems(_a)	(sizeof((_a)) / sizeof((_a)[0])) /* element count */
+#endif
+
 #define GOT_PROTOMAX	64
 #define GOT_HOSTMAX	256
 #define GOT_PATHMAX	512
@@ -351,7 +355,7 @@ got_fetch_pack(struct got_object_id **pack_hash, struct got_pathlist_head *refs,
 {
 	int imsg_fetchfds[2], imsg_idxfds[2];
 	int packfd = -1, npackfd = -1, idxfd = -1, nidxfd = -1, nfetchfd = -1;
-	int tmpfd = -1;
+	int tmpfds[3], i;
 	int fetchstatus, idxstatus, done = 0;
 	const struct got_error *err;
 	struct imsgbuf fetchibuf, idxibuf;
@@ -365,6 +369,8 @@ got_fetch_pack(struct got_object_id **pack_hash, struct got_pathlist_head *refs,
 	char *path;
 
 	*pack_hash = NULL;
+	for (i = 0; i < nitems(tmpfds); i++)
+		tmpfds[i] = -1;
 
 	TAILQ_INIT(&have_refs);
 
@@ -397,10 +403,12 @@ got_fetch_pack(struct got_object_id **pack_hash, struct got_pathlist_head *refs,
 		goto done;
 	}
 
-	tmpfd = got_opentempfd();
-	if (tmpfd == -1) {
-		err = got_error_from_errno("got_opentempfd");
-		goto done;
+	for (i = 0; i < nitems(tmpfds); i++) {
+		tmpfds[i] = got_opentempfd();
+		if (tmpfds[i] == -1) {
+			err = got_error_from_errno("got_opentempfd");
+			goto done;
+		}
 	}
 
 	if (socketpair(AF_UNIX, SOCK_STREAM, PF_UNSPEC, imsg_fetchfds) == -1) {
@@ -518,10 +526,12 @@ got_fetch_pack(struct got_object_id **pack_hash, struct got_pathlist_head *refs,
 	if (err != NULL)
 		goto done;
 	nidxfd = -1;
-	err = got_privsep_send_tmpfd(&idxibuf, tmpfd);
-	if (err != NULL)
-		goto done;
-	tmpfd = -1;
+	for (i = 0; i < nitems(tmpfds); i++) {
+		err = got_privsep_send_tmpfd(&idxibuf, tmpfds[i]);
+		if (err != NULL)
+			goto done;
+		tmpfds[i] = -1;
+	}
 	done = 0;
 	while (!done) {
 		int nobj_total, nobj_indexed, nobj_loose, nobj_resolved;
@@ -582,8 +592,10 @@ done:
 		err = got_error_from_errno("close");
 	if (idxfd != -1 && close(idxfd) == -1 && err == NULL)
 		err = got_error_from_errno("close");
-	if (tmpfd != -1 && close(tmpfd) == -1 && err == NULL)
-		err = got_error_from_errno("close");
+	for (i = 0; i < nitems(tmpfds); i++) {
+		if (tmpfds[i] != -1 && close(tmpfds[i]) == -1 && err == NULL)
+			err = got_error_from_errno("close");
+	}
 	free(tmppackpath);
 	free(tmpidxpath);
 	free(idxpath);
diff --git a/lib/got_lib_delta.h b/lib/got_lib_delta.h
index b04e243..4a95b0e 100644
--- a/lib/got_lib_delta.h
+++ b/lib/got_lib_delta.h
@@ -44,7 +44,7 @@ const struct got_error *got_delta_apply(FILE *, const uint8_t *, size_t,
  * The amount of result data we may keep in RAM while applying deltas.
  * Data larger than this is written to disk during delta application (slow).
  */
-#define GOT_DELTA_RESULT_SIZE_CACHED_MAX	(4 * 1024 * 1024) /* bytes */
+#define GOT_DELTA_RESULT_SIZE_CACHED_MAX	(8 * 1024 * 1024) /* bytes */
 
 /*
  * Definitions for delta data streams.
diff --git a/lib/got_lib_pack.h b/lib/got_lib_pack.h
index f95085e..3329614 100644
--- a/lib/got_lib_pack.h
+++ b/lib/got_lib_pack.h
@@ -177,6 +177,8 @@ const struct got_error *got_packidx_match_id_str_prefix(
 
 const struct got_error *got_packfile_open_object(struct got_object **,
     struct got_pack *, struct got_packidx *, int, struct got_object_id *);
+const struct got_error *got_pack_get_delta_chain_max_size(uint64_t *,
+    struct got_delta_chain *, struct got_pack *);
 const struct got_error *got_pack_get_max_delta_object_size(uint64_t *,
     struct got_object *, struct got_pack *);
 const struct got_error *got_pack_dump_delta_chain_to_file(size_t *,
diff --git a/lib/pack.c b/lib/pack.c
index 9dc1a83..6aab7a5 100644
--- a/lib/pack.c
+++ b/lib/pack.c
@@ -966,8 +966,8 @@ got_packfile_open_object(struct got_object **obj, struct got_pack *pack,
 	return err;
 }
 
-static const struct got_error *
-get_delta_chain_max_size(uint64_t *max_size, struct got_delta_chain *deltas,
+const struct got_error *
+got_pack_get_delta_chain_max_size(uint64_t *max_size, struct got_delta_chain *deltas,
     struct got_pack *pack)
 {
 	struct got_delta *delta;
@@ -1026,7 +1026,7 @@ got_pack_get_max_delta_object_size(uint64_t *size, struct got_object *obj,
 	if ((obj->flags & GOT_OBJ_FLAG_DELTIFIED) == 0)
 		return got_error(GOT_ERR_OBJ_TYPE);
 
-	return get_delta_chain_max_size(size, &obj->deltas, pack);
+	return got_pack_get_delta_chain_max_size(size, &obj->deltas, pack);
 }
 
 const struct got_error *
@@ -1047,7 +1047,7 @@ got_pack_dump_delta_chain_to_file(size_t *result_size,
 		return got_error(GOT_ERR_BAD_DELTA_CHAIN);
 
 	/* We process small enough files entirely in memory for speed. */
-	err = get_delta_chain_max_size(&max_size, deltas, pack);
+	err = got_pack_get_delta_chain_max_size(&max_size, deltas, pack);
 	if (err)
 		return err;
 	if (max_size < GOT_DELTA_RESULT_SIZE_CACHED_MAX) {
@@ -1215,7 +1215,7 @@ got_pack_dump_delta_chain_to_mem(uint8_t **outbuf, size_t *outlen,
 	if (SIMPLEQ_EMPTY(&deltas->entries))
 		return got_error(GOT_ERR_BAD_DELTA_CHAIN);
 
-	err = get_delta_chain_max_size(&max_size, deltas, pack);
+	err = got_pack_get_delta_chain_max_size(&max_size, deltas, pack);
 	if (err)
 		return err;
 	accum_buf = malloc(max_size);
diff --git a/libexec/got-index-pack/got-index-pack.c b/libexec/got-index-pack/got-index-pack.c
index cbeb792..e2ac91e 100644
--- a/libexec/got-index-pack/got-index-pack.c
+++ b/libexec/got-index-pack/got-index-pack.c
@@ -52,6 +52,10 @@
 #include "got_lib_pack.h"
 #include "got_lib_delta_cache.h"
 
+#ifndef nitems
+#define nitems(_a)	(sizeof((_a)) / sizeof((_a)[0])) /* element count */
+#endif
+
 struct got_indexed_object {
 	struct got_object_id id;
 
@@ -145,13 +149,13 @@ read_crc(uint32_t *crc, int fd, size_t len)
 }
 
 static const struct got_error *
-read_file_sha1(SHA1_CTX *ctx, FILE *f)
+read_file_sha1(SHA1_CTX *ctx, FILE *f, size_t len)
 {
 	uint8_t buf[8192];
-	size_t r;
+	size_t n, r;
 
-	for (;;) {
-		r = fread(buf, 1, sizeof(buf), f);
+	for (n = len; n > 0; n -= r) {
+		r = fread(buf, 1, n > sizeof(buf) ? sizeof(buf) : n, f);
 		if (r == 0) {
 			if (feof(f))
 				return NULL;
@@ -170,7 +174,7 @@ read_packed_object(struct got_pack *pack, struct got_indexed_object *obj,
 	const struct got_error *err = NULL;
 	SHA1_CTX ctx;
 	uint8_t *data = NULL;
-	size_t datalen;
+	size_t datalen = 0;
 	ssize_t n;
 	char *header;
 	size_t headerlen;
@@ -238,7 +242,7 @@ read_packed_object(struct got_pack *pack, struct got_indexed_object *obj,
 		headerlen = strlen(header) + 1;
 		SHA1Update(&ctx, header, headerlen);
 		if (obj->size > GOT_DELTA_RESULT_SIZE_CACHED_MAX) {
-			err = read_file_sha1(&ctx, tmpfile);
+			err = read_file_sha1(&ctx, tmpfile, datalen);
 			if (err)
 				break;
 		} else
@@ -349,16 +353,18 @@ hwrite(int fd, void *buf, int len, SHA1_CTX *ctx)
 
 static const struct got_error *
 resolve_deltified_object(struct got_pack *pack, struct got_packidx *packidx,
-    struct got_indexed_object *obj)
+    struct got_indexed_object *obj, FILE *tmpfile, FILE *delta_base_file,
+    FILE *delta_accum_file)
 {
 	const struct got_error *err = NULL;
 	struct got_delta_chain deltas;
 	struct got_delta *delta;
 	uint8_t *buf = NULL;
-	size_t len;
+	size_t len = 0;
 	SHA1_CTX ctx;
 	char *header = NULL;
 	size_t headerlen;
+	uint64_t max_size;
 	int base_obj_type;
 	const char *obj_label;
 
@@ -371,12 +377,23 @@ resolve_deltified_object(struct got_pack *pack, struct got_packidx *packidx,
 	if (err)
 		goto done;
 
-	/* XXX TODO reading large objects into memory is bad! */
-	err = got_pack_dump_delta_chain_to_mem(&buf, &len, &deltas, pack);
+	err = got_pack_get_delta_chain_max_size(&max_size, &deltas, pack);
+	if (err)
+		goto done;
+	if (max_size > GOT_DELTA_RESULT_SIZE_CACHED_MAX) {
+		rewind(tmpfile);
+		rewind(delta_base_file);
+		rewind(delta_accum_file);
+		err = got_pack_dump_delta_chain_to_file(&len, &deltas,
+		    pack, tmpfile, delta_base_file, delta_accum_file);
+		if (err)
+			goto done;
+	} else {
+		err = got_pack_dump_delta_chain_to_mem(&buf, &len,
+		    &deltas, pack);
+	}
 	if (err)
 		goto done;
-
-	SHA1Init(&ctx);
 
 	err = got_delta_chain_get_base_type(&base_obj_type, &deltas);
 	if (err)
@@ -389,8 +406,14 @@ resolve_deltified_object(struct got_pack *pack, struct got_packidx *packidx,
 		goto done;
 	}
 	headerlen = strlen(header) + 1;
+	SHA1Init(&ctx);
 	SHA1Update(&ctx, header, headerlen);
-	SHA1Update(&ctx, buf, len);
+	if (max_size > GOT_DELTA_RESULT_SIZE_CACHED_MAX) {
+		err = read_file_sha1(&ctx, tmpfile, len);
+		if (err)
+			goto done;
+	} else
+		SHA1Update(&ctx, buf, len);
 	SHA1Final(obj->id.sha1, &ctx);
 done:
 	free(buf);
@@ -550,7 +573,8 @@ update_packidx(struct got_packidx *packidx, int nobj,
 
 static const struct got_error *
 index_pack(struct got_pack *pack, int idxfd, FILE *tmpfile,
-    uint8_t *pack_hash, struct imsgbuf *ibuf)
+    FILE *delta_base_file, FILE *delta_accum_file, uint8_t *pack_hash,
+    struct imsgbuf *ibuf)
 {
 	const struct got_error *err;
 	struct got_packfile_hdr hdr;
@@ -748,7 +772,8 @@ index_pack(struct got_pack *pack, int idxfd, FILE *tmpfile,
 				goto done;
 			}
 
-			err = resolve_deltified_object(pack, &packidx, obj);
+			err = resolve_deltified_object(pack, &packidx, obj,
+			    tmpfile, delta_base_file, delta_accum_file);
 			if (err) {
 				if (err->code != GOT_ERR_NO_OBJ)
 					goto done;
@@ -862,8 +887,8 @@ main(int argc, char **argv)
 	const struct got_error *err = NULL, *close_err;
 	struct imsgbuf ibuf;
 	struct imsg imsg;
-	int idxfd = -1, tmpfd = -1;
-	FILE *tmpfile = NULL;
+	int idxfd = -1, tmpfd = -1, i;
+	FILE *tmpfiles[3];
 	struct got_pack pack;
 	uint8_t pack_hash[SHA1_DIGEST_LENGTH];
 	off_t packfile_size;
@@ -873,6 +898,9 @@ main(int argc, char **argv)
 		sleep(1);
 #endif
 
+	for (i = 0; i < nitems(tmpfiles); i++)
+		tmpfiles[i] = NULL;
+
 	memset(&pack, 0, sizeof(pack));
 	pack.fd = -1;
 	pack.delta_cache = got_delta_cache_alloc(500,
@@ -922,26 +950,28 @@ main(int argc, char **argv)
 	}
 	idxfd = imsg.fd;
 
-	err = got_privsep_recv_imsg(&imsg, &ibuf, 0);
-	if (err)
-		goto done;
-	if (imsg.hdr.type == GOT_IMSG_STOP)
-		goto done;
-	if (imsg.hdr.type != GOT_IMSG_TMPFD) {
-		err = got_error(GOT_ERR_PRIVSEP_MSG);
-		goto done;
-	}
-	if (imsg.hdr.len - IMSG_HEADER_SIZE != 0) {
-		err = got_error(GOT_ERR_PRIVSEP_LEN);
-		goto done;
-	}
-	tmpfd = imsg.fd;
-	tmpfile = fdopen(tmpfd, "w+");
-	if (tmpfile == NULL) {
-		err = got_error_from_errno("fdopen");
-		goto done;
+	for (i = 0; i < nitems(tmpfiles); i++) {
+		err = got_privsep_recv_imsg(&imsg, &ibuf, 0);
+		if (err)
+			goto done;
+		if (imsg.hdr.type == GOT_IMSG_STOP)
+			goto done;
+		if (imsg.hdr.type != GOT_IMSG_TMPFD) {
+			err = got_error(GOT_ERR_PRIVSEP_MSG);
+			goto done;
+		}
+		if (imsg.hdr.len - IMSG_HEADER_SIZE != 0) {
+			err = got_error(GOT_ERR_PRIVSEP_LEN);
+			goto done;
+		}
+		tmpfd = imsg.fd;
+		tmpfiles[i] = fdopen(tmpfd, "w+");
+		if (tmpfiles[i] == NULL) {
+			err = got_error_from_errno("fdopen");
+			goto done;
+		}
+		tmpfd = -1;
 	}
-	tmpfd = -1;
 
 	if (lseek(pack.fd, 0, SEEK_END) == -1) {
 		err = got_error_from_errno("lseek");
@@ -965,7 +995,8 @@ main(int argc, char **argv)
 	if (pack.map == MAP_FAILED)
 		pack.map = NULL; /* fall back to read(2) */
 #endif
-	err = index_pack(&pack, idxfd, tmpfile, pack_hash, &ibuf);
+	err = index_pack(&pack, idxfd, tmpfiles[0], tmpfiles[1], tmpfiles[2],
+	    pack_hash, &ibuf);
 done:
 	close_err = got_pack_close(&pack);
 	if (close_err && err == NULL)
@@ -974,6 +1005,11 @@ done:
 		err = got_error_from_errno("close");
 	if (tmpfd != -1 && close(tmpfd) == -1 && err == NULL)
 		err = got_error_from_errno("close");
+	for (i = 0; i < nitems(tmpfiles); i++) { /* keep only first error */
+		if (tmpfiles[i] != NULL && fclose(tmpfiles[i]) == EOF &&
+		    err == NULL)
+			err = got_error_from_errno("fclose");
+	}
 
 	if (err == NULL)
 		err = got_privsep_send_index_pack_done(&ibuf);