commit: faster parsing The current code issues a lot of strncmp() calls in order to check for the end of the header, simply in order to copy it and start going through it again. These are a lot of calls for something we can check as we go along. Knowing the amount of parents beforehand to reduce allocations in extreme cases does not make up for them. Instead start parsing immediately and check for the double-newline after each header field, leaving the raw_header allocation for the end, which lets us go through the header once and reduces the amount of strncmp() calls significantly. In unscientific testing, this has reduced a shortlog-like usage (walking though the whole history of a branch and extracting data from the commits) of git.git from ~830ms to ~700ms and makes the time we spend in strncmp() negligible.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
diff --git a/src/commit.c b/src/commit.c
index da7c499..730fa64 100644
--- a/src/commit.c
+++ b/src/commit.c
@@ -164,33 +164,15 @@ int git_commit__parse(void *_commit, git_odb_object *odb_obj)
const char *buffer_start = git_odb_object_data(odb_obj), *buffer;
const char *buffer_end = buffer_start + git_odb_object_size(odb_obj);
git_oid parent_id;
- uint32_t parent_count = 0;
size_t header_len;
- /* find end-of-header (counting parents as we go) */
- for (buffer = buffer_start; buffer < buffer_end; ++buffer) {
- if (!strncmp("\n\n", buffer, 2)) {
- ++buffer;
- break;
- }
- if (!strncmp("\nparent ", buffer, strlen("\nparent ")))
- ++parent_count;
- }
-
- header_len = buffer - buffer_start;
- commit->raw_header = git__strndup(buffer_start, header_len);
- GITERR_CHECK_ALLOC(commit->raw_header);
+ buffer = buffer_start;
- /* point "buffer" to header data */
- buffer = commit->raw_header;
- buffer_end = commit->raw_header + header_len;
-
- if (parent_count < 1)
- parent_count = 1;
-
- git_array_init_to_size(commit->parent_ids, parent_count);
+ /* Allocate for one, which will allow not to realloc 90% of the time */
+ git_array_init_to_size(commit->parent_ids, 1);
GITERR_CHECK_ARRAY(commit->parent_ids);
+ /* The tree is always the first field */
if (git_oid__parse(&commit->tree_id, &buffer, buffer_end, "tree ") < 0)
goto bad_buffer;
@@ -221,6 +203,9 @@ int git_commit__parse(void *_commit, git_odb_object *odb_obj)
/* Parse add'l header entries */
while (buffer < buffer_end) {
const char *eoln = buffer;
+ if (buffer[-1] == '\n' && buffer[0] == '\n')
+ break;
+
while (eoln < buffer_end && *eoln != '\n')
++eoln;
@@ -236,13 +221,12 @@ int git_commit__parse(void *_commit, git_odb_object *odb_obj)
buffer = eoln;
}
- /* point "buffer" to data after header */
- buffer = git_odb_object_data(odb_obj);
- buffer_end = buffer + git_odb_object_size(odb_obj);
+ header_len = buffer - buffer_start;
+ commit->raw_header = git__strndup(buffer_start, header_len);
+ GITERR_CHECK_ALLOC(commit->raw_header);
- buffer += header_len;
- if (*buffer == '\n')
- ++buffer;
+ /* point "buffer" to data after header, +1 for the final LF */
+ buffer = buffer_start + header_len + 1;
/* extract commit message */
if (buffer <= buffer_end) {