Commit 1e87a3c39e7647b4dba91b8b1c77da25d677294a

Stefan Sperling 2020-03-18T16:13:42

avoid re-reading the entirety of an object's data to calculate the CRC

diff --git a/lib/got_lib_inflate.h b/lib/got_lib_inflate.h
index d8d1ad0..e97951c 100644
--- a/lib/got_lib_inflate.h
+++ b/lib/got_lib_inflate.h
@@ -23,12 +23,13 @@ struct got_inflate_buf {
 	int flags;
 #define GOT_INFLATE_F_HAVE_MORE		0x01
 #define GOT_INFLATE_F_OWN_OUTBUF	0x02
+	uint32_t *input_crc;
 };
 
 #define GOT_INFLATE_BUFSIZE		32768
 
 const struct got_error *got_inflate_init(struct got_inflate_buf *, uint8_t *,
-    size_t);
+    size_t, uint32_t *);
 const struct got_error *got_inflate_read(struct got_inflate_buf *, FILE *,
     size_t *, size_t *);
 const struct got_error *got_inflate_read_fd(struct got_inflate_buf *, int,
@@ -39,7 +40,7 @@ void got_inflate_end(struct got_inflate_buf *);
 const struct got_error *got_inflate_to_mem(uint8_t **, size_t *, size_t *,
     FILE *);
 const struct got_error *got_inflate_to_mem_fd(uint8_t **, size_t *, size_t *,
-    int);
+    uint32_t *, int);
 const struct got_error *got_inflate_to_mem_mmap(uint8_t **, size_t *, uint8_t *,
     size_t, size_t);
 const struct got_error *got_inflate_to_file(size_t *, FILE *, FILE *);
diff --git a/lib/inflate.c b/lib/inflate.c
index d093fd1..75d3b40 100644
--- a/lib/inflate.c
+++ b/lib/inflate.c
@@ -35,7 +35,8 @@
 #endif
 
 const struct got_error *
-got_inflate_init(struct got_inflate_buf *zb, uint8_t *outbuf, size_t bufsize)
+got_inflate_init(struct got_inflate_buf *zb, uint8_t *outbuf, size_t bufsize,
+    uint32_t *input_crc)
 {
 	const struct got_error *err = NULL;
 	int zerr;
@@ -74,6 +75,7 @@ got_inflate_init(struct got_inflate_buf *zb, uint8_t *outbuf, size_t bufsize)
 	} else
 		zb->outbuf = outbuf;
 
+	zb->input_crc = input_crc;
 done:
 	if (err)
 		got_inflate_end(zb);
@@ -96,6 +98,9 @@ got_inflate_read(struct got_inflate_buf *zb, FILE *f, size_t *outlenp,
 	if (consumed)
 		*consumed = 0;
 	do {
+		char *crc_in = NULL;
+		size_t crc_avail = 0;
+
 		if (z->avail_in == 0) {
 			size_t n = fread(zb->inbuf, 1, zb->inlen, f);
 			if (n == 0) {
@@ -108,7 +113,15 @@ got_inflate_read(struct got_inflate_buf *zb, FILE *f, size_t *outlenp,
 			z->next_in = zb->inbuf;
 			z->avail_in = n;
 		}
+		if (zb->input_crc) {
+			crc_in = z->next_in;
+			crc_avail = z->avail_in;
+		}
 		ret = inflate(z, Z_SYNC_FLUSH);
+		if (zb->input_crc) {
+			*zb->input_crc = crc32(*zb->input_crc,
+			    crc_in, crc_avail - z->avail_in);
+		}
 	} while (ret == Z_OK && z->avail_out > 0);
 
 	if (ret == Z_OK || ret == Z_BUF_ERROR) {
@@ -141,6 +154,9 @@ got_inflate_read_fd(struct got_inflate_buf *zb, int fd, size_t *outlenp,
 	if (consumed)
 		*consumed = 0;
 	do {
+		char *crc_in = NULL;
+		size_t crc_avail = 0;
+
 		if (z->avail_in == 0) {
 			ssize_t n = read(fd, zb->inbuf, zb->inlen);
 			if (n < 0)
@@ -153,7 +169,15 @@ got_inflate_read_fd(struct got_inflate_buf *zb, int fd, size_t *outlenp,
 			z->next_in = zb->inbuf;
 			z->avail_in = n;
 		}
+		if (zb->input_crc) {
+			crc_in = z->next_in;
+			crc_avail = z->avail_in;
+		}
 		ret = inflate(z, Z_SYNC_FLUSH);
+		if (zb->input_crc) {
+			*zb->input_crc = crc32(*zb->input_crc,
+			    crc_in, crc_avail - z->avail_in);
+		}
 	} while (ret == Z_OK && z->avail_out > 0);
 
 	if (ret == Z_OK || ret == Z_BUF_ERROR) {
@@ -185,7 +209,10 @@ got_inflate_read_mmap(struct got_inflate_buf *zb, uint8_t *map, size_t offset,
 	*consumed = 0;
 
 	do {
+		char *crc_in = NULL;
+		size_t crc_avail = 0;
 		size_t last_total_in = zb->z.total_in;
+
 		if (z->avail_in == 0) {
 			if (len == 0) {
 				/* EOF */
@@ -195,7 +222,15 @@ got_inflate_read_mmap(struct got_inflate_buf *zb, uint8_t *map, size_t offset,
 			z->next_in = map + offset + *consumed;
 			z->avail_in = len - *consumed;
 		}
+		if (zb->input_crc) {
+			crc_in = z->next_in;
+			crc_avail = z->avail_in;
+		}
 		ret = inflate(z, Z_SYNC_FLUSH);
+		if (zb->input_crc) {
+			*zb->input_crc = crc32(*zb->input_crc,
+			    crc_in, crc_avail - z->avail_in);
+		}
 		*consumed += z->total_in - last_total_in;
 	} while (ret == Z_OK && z->avail_out > 0);
 
@@ -234,9 +269,9 @@ got_inflate_to_mem(uint8_t **outbuf, size_t *outlen,
 		*outbuf = malloc(GOT_INFLATE_BUFSIZE);
 		if (*outbuf == NULL)
 			return got_error_from_errno("malloc");
-		err = got_inflate_init(&zb, *outbuf, GOT_INFLATE_BUFSIZE);
+		err = got_inflate_init(&zb, *outbuf, GOT_INFLATE_BUFSIZE, NULL);
 	} else
-		err = got_inflate_init(&zb, NULL, GOT_INFLATE_BUFSIZE);
+		err = got_inflate_init(&zb, NULL, GOT_INFLATE_BUFSIZE, NULL);
 	if (err)
 		return err;
 
@@ -276,7 +311,7 @@ done:
 
 const struct got_error *
 got_inflate_to_mem_fd(uint8_t **outbuf, size_t *outlen,
-    size_t *consumed_total, int infd)
+    size_t *consumed_total, uint32_t *input_crc, int infd)
 {
 	const struct got_error *err;
 	size_t avail, consumed;
@@ -288,9 +323,11 @@ got_inflate_to_mem_fd(uint8_t **outbuf, size_t *outlen,
 		*outbuf = malloc(GOT_INFLATE_BUFSIZE);
 		if (*outbuf == NULL)
 			return got_error_from_errno("malloc");
-		err = got_inflate_init(&zb, *outbuf, GOT_INFLATE_BUFSIZE);
+		err = got_inflate_init(&zb, *outbuf, GOT_INFLATE_BUFSIZE,
+		    input_crc);
 	} else
-		err = got_inflate_init(&zb, NULL, GOT_INFLATE_BUFSIZE);
+		err = got_inflate_init(&zb, NULL, GOT_INFLATE_BUFSIZE,
+		    input_crc);
 	if (err)
 		goto done;
 
@@ -341,7 +378,7 @@ got_inflate_to_mem_mmap(uint8_t **outbuf, size_t *outlen, uint8_t *map,
 	*outbuf = malloc(GOT_INFLATE_BUFSIZE);
 	if (*outbuf == NULL)
 		return got_error_from_errno("malloc");
-	err = got_inflate_init(&zb, *outbuf, GOT_INFLATE_BUFSIZE);
+	err = got_inflate_init(&zb, *outbuf, GOT_INFLATE_BUFSIZE, NULL);
 	if (err) {
 		free(*outbuf);
 		*outbuf = NULL;
@@ -387,7 +424,7 @@ got_inflate_to_fd(size_t *outlen, FILE *infile, int outfd)
 	size_t avail;
 	struct got_inflate_buf zb;
 
-	err = got_inflate_init(&zb, NULL, GOT_INFLATE_BUFSIZE);
+	err = got_inflate_init(&zb, NULL, GOT_INFLATE_BUFSIZE, NULL);
 	if (err)
 		goto done;
 
@@ -424,7 +461,7 @@ got_inflate_to_file(size_t *outlen, FILE *infile, FILE *outfile)
 	size_t avail;
 	struct got_inflate_buf zb;
 
-	err = got_inflate_init(&zb, NULL, GOT_INFLATE_BUFSIZE);
+	err = got_inflate_init(&zb, NULL, GOT_INFLATE_BUFSIZE, NULL);
 	if (err)
 		goto done;
 
@@ -459,7 +496,7 @@ got_inflate_to_file_fd(size_t *outlen, int infd, FILE *outfile)
 	size_t avail;
 	struct got_inflate_buf zb;
 
-	err = got_inflate_init(&zb, NULL, GOT_INFLATE_BUFSIZE);
+	err = got_inflate_init(&zb, NULL, GOT_INFLATE_BUFSIZE, NULL);
 	if (err)
 		goto done;
 
@@ -496,7 +533,7 @@ got_inflate_to_file_mmap(size_t *outlen, uint8_t *map, size_t offset,
 	struct got_inflate_buf zb;
 	size_t consumed;
 
-	err = got_inflate_init(&zb, NULL, GOT_INFLATE_BUFSIZE);
+	err = got_inflate_init(&zb, NULL, GOT_INFLATE_BUFSIZE, NULL);
 	if (err)
 		goto done;
 
diff --git a/lib/object_parse.c b/lib/object_parse.c
index a4a82ed..032e903 100644
--- a/lib/object_parse.c
+++ b/lib/object_parse.c
@@ -221,7 +221,7 @@ got_object_read_header(struct got_object **obj, int fd)
 	if (buf == NULL)
 		return got_error_from_errno("malloc");
 
-	err = got_inflate_init(&zb, buf, zbsize);
+	err = got_inflate_init(&zb, buf, zbsize, NULL);
 	if (err)
 		return err;
 
diff --git a/lib/pack.c b/lib/pack.c
index 7f264cd..28dd529 100644
--- a/lib/pack.c
+++ b/lib/pack.c
@@ -720,7 +720,7 @@ read_delta_data(uint8_t **delta_buf, size_t *delta_len,
 		if (lseek(pack->fd, delta_data_offset, SEEK_SET) == -1)
 			return got_error_from_errno("lseek");
 		err = got_inflate_to_mem_fd(delta_buf, delta_len, NULL,
-		    pack->fd);
+		    NULL, pack->fd);
 	}
 	return err;
 }
@@ -1101,7 +1101,7 @@ dump_delta_chain_to_file(size_t *result_size, struct got_delta_chain *deltas,
 					    pack->filesize - mapoff);
 				} else
 					err = got_inflate_to_mem_fd(&base_buf,
-					    &base_bufsz, NULL, pack->fd);
+					    &base_bufsz, NULL, NULL, pack->fd);
 			}
 			if (err)
 				goto done;
@@ -1248,7 +1248,7 @@ got_pack_dump_delta_chain_to_mem(uint8_t **outbuf, size_t *outlen,
 					goto done;
 				}
 				err = got_inflate_to_mem_fd(&base_buf,
-				    &base_bufsz, NULL, pack->fd);
+				    &base_bufsz, NULL, NULL, pack->fd);
 			}
 			if (err)
 				goto done;
@@ -1368,7 +1368,8 @@ got_packfile_extract_object_to_mem(uint8_t **buf, size_t *len,
 		} else {
 			if (lseek(pack->fd, obj->pack_offset, SEEK_SET) == -1)
 				return got_error_from_errno("lseek");
-			err = got_inflate_to_mem_fd(buf, len, NULL, pack->fd);
+			err = got_inflate_to_mem_fd(buf, len, NULL, NULL,
+			    pack->fd);
 		}
 	} else
 		err = got_pack_dump_delta_chain_to_mem(buf, len, &obj->deltas,
diff --git a/libexec/got-index-pack/got-index-pack.c b/libexec/got-index-pack/got-index-pack.c
index 580ee43..4593055 100644
--- a/libexec/got-index-pack/got-index-pack.c
+++ b/libexec/got-index-pack/got-index-pack.c
@@ -130,6 +130,24 @@ get_obj_type_label(const char **label, int obj_type)
 	return err;
 }
 
+/* Add len bytes read from fd to the running *crc; early EOF is an error. */
+static const struct got_error *
+read_crc(uint32_t *crc, int fd, size_t len)
+{
+	uint8_t buf[8192];
+	size_t n;
+	ssize_t r;
+
+	for (n = len; n > 0; n -= r) {
+		r = read(fd, buf, n > sizeof(buf) ? sizeof(buf) : n);
+		if (r == -1)
+			return got_error_from_errno("read");
+		if (r == 0) /* EOF with n bytes still owed: CRC would be short */
+			return got_error(GOT_ERR_BAD_PACKFILE);
+		*crc = crc32(*crc, buf, r);
+	}
+	return NULL;
+}
 
 static const struct got_error *
 read_packed_object(struct got_pack *pack, struct got_indexed_object *obj)
@@ -148,13 +166,21 @@ read_packed_object(struct got_pack *pack, struct got_indexed_object *obj)
 	if (err)
 		return err;
 
+	/* XXX Seek back and get the CRC of on-disk type+size bytes. */
+	if (lseek(pack->fd, obj->off, SEEK_SET) == -1)
+		return got_error_from_errno("lseek");
+	err = read_crc(&obj->crc, pack->fd, obj->tslen);
+	if (err)
+		return err;
+
 	switch (obj->type) {
 	case GOT_OBJ_TYPE_BLOB:
 	case GOT_OBJ_TYPE_COMMIT:
 	case GOT_OBJ_TYPE_TREE:
 	case GOT_OBJ_TYPE_TAG:
 		/* XXX TODO reading large objects into memory is bad! */
-		err = got_inflate_to_mem_fd(&data, &datalen, &obj->len, pack->fd);
+		err = got_inflate_to_mem_fd(&data, &datalen, &obj->len,
+		    &obj->crc, pack->fd);
 		if (err)
 			break;
 		SHA1Init(&ctx);
@@ -184,7 +210,10 @@ read_packed_object(struct got_pack *pack, struct got_indexed_object *obj)
 			err = got_error(GOT_ERR_BAD_PACKFILE);
 			break;
 		}
-		err = got_inflate_to_mem_fd(NULL, &datalen, &obj->len, pack->fd);
+		obj->crc = crc32(obj->crc, obj->ref_id.sha1,
+		    SHA1_DIGEST_LENGTH);
+		err = got_inflate_to_mem_fd(NULL, &datalen, &obj->len,
+		    &obj->crc, pack->fd);
 		if (err)
 			break;
 		obj->len += SHA1_DIGEST_LENGTH;
@@ -195,7 +224,18 @@ read_packed_object(struct got_pack *pack, struct got_indexed_object *obj)
 		    &obj->base_offsetlen, pack, obj->off, obj->tslen);
 		if (err)
 			break;
-		err = got_inflate_to_mem_fd(NULL, &datalen, &obj->len, pack->fd);
+
+		/* XXX Seek back and get the CRC of on-disk offset bytes. */
+		if (lseek(pack->fd, obj->off + obj->tslen, SEEK_SET) == -1) {
+			err = got_error_from_errno("lseek");
+			break;
+		}
+		err = read_crc(&obj->crc, pack->fd, obj->base_offsetlen);
+		if (err)
+			break;
+
+		err = got_inflate_to_mem_fd(NULL, &datalen, &obj->len,
+		    &obj->crc, pack->fd);
 		if (err)
 			break;
 		obj->len += obj->base_offsetlen;
@@ -225,28 +265,6 @@ hwrite(int fd, void *buf, int len, SHA1_CTX *ctx)
 }
 
 static const struct got_error *
-object_crc(int packfd, struct got_indexed_object *obj)
-{
-	char buf[8096];
-	size_t n;
-	ssize_t r;
-
-	if (lseek(packfd, obj->off, SEEK_SET) == -1)
-		return got_error_from_errno("lseek");
-
-	obj->crc = crc32(0L, NULL, 0);
-	for (n = obj->tslen + obj->len; n > 0; n -= r){
-		r = read(packfd, buf, n > sizeof(buf) ? sizeof(buf) : n);
-		if (r == -1)
-			return got_error_from_errno("read");
-		if (r == 0)
-			return NULL;
-		obj->crc = crc32(obj->crc, buf, r);
-	}
-	return 0;
-}
-
-static const struct got_error *
 resolve_deltified_object(struct got_pack *pack, struct got_packidx *packidx,
     struct got_indexed_object *obj)
 {
@@ -517,6 +535,7 @@ index_pack(struct got_pack *pack, int idxfd, uint8_t *pack_hash,
 			err = got_error_from_errno("calloc");
 			goto done;
 		}
+		obj->crc = crc32(0L, NULL, 0);
 
 		/* Store offset to type+size information for this object. */
 		obj->off = lseek(pack->fd, 0, SEEK_CUR);
@@ -531,9 +550,11 @@ index_pack(struct got_pack *pack, int idxfd, uint8_t *pack_hash,
 
 		objects[i] = obj;
 
-		err = object_crc(pack->fd, obj);
-		if (err)
+		if (lseek(pack->fd, obj->off + obj->tslen + obj->len,
+		    SEEK_SET) == -1) {
+			err = got_error_from_errno("lseek");
 			goto done;
+		}
 
 		if (obj->type == GOT_OBJ_TYPE_BLOB ||
 		    obj->type == GOT_OBJ_TYPE_TREE ||