Commit f6ed5e2d055fec0ed6cc36d613362f733860ccdd

Philip Kelley 2013-01-07T09:53:43

Revert changes from git/git diff-delta.c by dak@gnu.org, proski@gnu.org

diff --git a/src/delta.c b/src/delta.c
index 2514dcc..3db319c 100644
--- a/src/delta.c
+++ b/src/delta.c
@@ -1,16 +1,8 @@
 /*
- * diff-delta.c: generate a delta between two buffers
+ * Copyright (C) 2009-2012 the libgit2 contributors
  *
- * This code was greatly inspired by parts of LibXDiff from Davide Libenzi
- * http://www.xmailserver.org/xdiff-lib.html
- *
- * Rewritten for GIT by Nicolas Pitre <nico@fluxnic.net>, (C) 2005-2007
- *
- * Modified for libgit2 by Michael Schubert <schu@schu.io>, (C) 2012
- *
- * This code is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
+ * This file is part of libgit2, distributed under the GNU GPL v2 with
+ * a Linking Exception. For full terms see the included COPYING file.
  */
 
 #include "delta.h"
@@ -116,11 +108,7 @@ static const unsigned int U[256] = {
 struct index_entry {
 	const unsigned char *ptr;
 	unsigned int val;
-};
-
-struct unpacked_index_entry {
-	struct index_entry entry;
-	struct unpacked_index_entry *next;
+	struct index_entry *next;
 };
 
 struct git_delta_index {
@@ -137,8 +125,7 @@ git_delta_create_index(const void *buf, unsigned long bufsize)
 	unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
 	const unsigned char *data, *buffer = buf;
 	struct git_delta_index *index;
-	struct unpacked_index_entry *entry, **hash;
-	struct index_entry *packed_entry, **packed_hash;
+	struct index_entry *entry, **hash;
 	void *mem;
 	unsigned long memsize;
 
@@ -146,7 +133,7 @@ git_delta_create_index(const void *buf, unsigned long bufsize)
 		return NULL;
 
 	/* Determine index hash size.  Note that indexing skips the
-	   first byte to allow for optimizing the Rabin's polynomial
+	   first byte to allow for optimizing the rabin polynomial
 	   initialization in create_delta(). */
 	entries = (unsigned int)(bufsize - 1) / RABIN_WINDOW;
 	if (bufsize >= 0xffffffffUL) {
@@ -162,21 +149,28 @@ git_delta_create_index(const void *buf, unsigned long bufsize)
 	hmask = hsize - 1;
 
 	/* allocate lookup index */
-	memsize = sizeof(*hash) * hsize +
+	memsize = sizeof(*index) +
+		  sizeof(*hash) * hsize +
 		  sizeof(*entry) * entries;
 	mem = git__malloc(memsize);
 	if (!mem)
 		return NULL;
+	index = mem;
+	mem = index->hash;
 	hash = mem;
 	mem = hash + hsize;
 	entry = mem;
 
+	index->memsize = memsize;
+	index->src_buf = buf;
+	index->src_size = bufsize;
+	index->hash_mask = hmask;
 	memset(hash, 0, hsize * sizeof(*hash));
 
 	/* allocate an array to count hash entries */
 	hash_count = calloc(hsize, sizeof(*hash_count));
 	if (!hash_count) {
-		git__free(hash);
+		git__free(index);
 		return NULL;
 	}
 
@@ -190,13 +184,12 @@ git_delta_create_index(const void *buf, unsigned long bufsize)
 			val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
 		if (val == prev_val) {
 			/* keep the lowest of consecutive identical blocks */
-			entry[-1].entry.ptr = data + RABIN_WINDOW;
-			--entries;
+			entry[-1].ptr = data + RABIN_WINDOW;
 		} else {
 			prev_val = val;
 			i = val & hmask;
-			entry->entry.ptr = data + RABIN_WINDOW;
-			entry->entry.val = val;
+			entry->ptr = data + RABIN_WINDOW;
+			entry->val = val;
 			entry->next = hash[i];
 			hash[i] = entry++;
 			hash_count[i]++;
@@ -205,7 +198,7 @@ git_delta_create_index(const void *buf, unsigned long bufsize)
 
 	/*
 	 * Determine a limit on the number of entries in the same hash
-	 * bucket.  This guards us against pathological data sets causing
+	 * bucket.  This guard us against patological data sets causing
 	 * really bad hash distribution with most entries in the same hash
 	 * bucket that would bring us to O(m*n) computing costs (m and n
 	 * corresponding to reference and target buffer sizes).
@@ -216,84 +209,21 @@ git_delta_create_index(const void *buf, unsigned long bufsize)
 	 * the reference buffer.
 	 */
 	for (i = 0; i < hsize; i++) {
-		int acc;
-
-		if (hash_count[i] <= HASH_LIMIT)
+		if (hash_count[i] < HASH_LIMIT)
 			continue;
 
-		/* We leave exactly HASH_LIMIT entries in the bucket */
-		entries -= hash_count[i] - HASH_LIMIT;
-
 		entry = hash[i];
-		acc = 0;
-
-		/*
-		 * Assume that this loop is gone through exactly
-		 * HASH_LIMIT times and is entered and left with
-		 * acc==0.  So the first statement in the loop
-		 * contributes (hash_count[i]-HASH_LIMIT)*HASH_LIMIT
-		 * to the accumulator, and the inner loop consequently
-		 * is run (hash_count[i]-HASH_LIMIT) times, removing
-		 * one element from the list each time.  Since acc
-		 * balances out to 0 at the final run, the inner loop
-		 * body can't be left with entry==NULL.  So we indeed
-		 * encounter entry==NULL in the outer loop only.
-		 */
 		do {
-			acc += hash_count[i] - HASH_LIMIT;
-			if (acc > 0) {
-				struct unpacked_index_entry *keep = entry;
-				do {
-					entry = entry->next;
-					acc -= HASH_LIMIT;
-				} while (acc > 0);
-				keep->next = entry->next;
-			}
-			entry = entry->next;
+			struct index_entry *keep = entry;
+			int skip = hash_count[i] / HASH_LIMIT / 2;
+			do {
+				entry = entry->next;
+			} while(--skip && entry);
+			keep->next = entry;
 		} while (entry);
 	}
 	git__free(hash_count);
 
-	/*
-	 * Now create the packed index in array form
-	 * rather than linked lists.
-	 */
-	memsize = sizeof(*index)
-		+ sizeof(*packed_hash) * (hsize+1)
-		+ sizeof(*packed_entry) * entries;
-	mem = git__malloc(memsize);
-	if (!mem) {
-		git__free(hash);
-		return NULL;
-	}
-
-	index = mem;
-	index->memsize = memsize;
-	index->src_buf = buf;
-	index->src_size = bufsize;
-	index->hash_mask = hmask;
-
-	mem = index->hash;
-	packed_hash = mem;
-	mem = packed_hash + (hsize+1);
-	packed_entry = mem;
-
-	for (i = 0; i < hsize; i++) {
-		/*
-		 * Coalesce all entries belonging to one linked list
-		 * into consecutive array entries.
-		 */
-		packed_hash[i] = packed_entry;
-		for (entry = hash[i]; entry; entry = entry->next)
-			*packed_entry++ = entry->entry;
-	}
-
-	/* Sentinel value to indicate the length of the last hash bucket */
-	packed_hash[hsize] = packed_entry;
-
-	assert(packed_entry - (struct index_entry *)mem == entries);
-	git__free(hash);
-
 	return index;
 }
 
@@ -312,7 +242,7 @@ unsigned long git_delta_sizeof_index(struct git_delta_index *index)
 
 /*
  * The maximum size for any opcode sequence, including the initial header
- * plus Rabin window plus biggest copy.
+ * plus rabin window plus biggest copy.
  */
 #define MAX_OP_SIZE	(5 + 5 + 1 + RABIN_WINDOW + 7)
 
@@ -377,7 +307,7 @@ git_delta_create(
 			val ^= U[data[-RABIN_WINDOW]];
 			val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
 			i = val & index->hash_mask;
-			for (entry = index->hash[i]; entry < index->hash[i+1]; entry++) {
+			for (entry = index->hash[i]; entry; entry = entry->next) {
 				const unsigned char *ref = entry->ptr;
 				const unsigned char *src = data;
 				unsigned int ref_size = (unsigned int)(ref_top - ref);