Reduce content hash function collisions

The hash code returned by RawTextComparator (or that is used by the SimilarityIndex) plays an important role in the speed of any algorithm that is based upon it. The lower the number of collisions produced by the hash function, the shorter the hash chains within hash tables will be, and the less likely we are to fall into O(N^2) runtime behaviors for algorithms like PatienceDiff.

Our prior hash function was absolutely horrid, so replace it with the proper definition of the DJB hash that was originally published by Professor Daniel J. Bernstein.

To support this assertion, below is a table listing the maximum number of collisions that result when hashing the unique lines in each source code file of 3 randomly chosen projects:

  test_jgit: 931 files; 122 avg. unique lines/file
   Algorithm    | Collisions
   -------------+-----------
   prior_hash           418
   djb                    5
   sha1                   6
   string_hash31         11

  test_linux26: 30198 files; 258 avg. unique lines/file
   Algorithm    | Collisions
   -------------+-----------
   prior_hash          8675
   djb                   32
   sha1                   8
   string_hash31         32

  test_frameworks_base: 8381 files; 184 avg. unique lines/file
   Algorithm    | Collisions
   -------------+-----------
   prior_hash          4615
   djb                   10
   sha1                   6
   string_hash31         13

We can clearly see that prior_hash performed very poorly, resulting in 8,675 collisions (elements in the same hash bucket) for at least one file in the Linux kernel repository. This leads to some very bad O(N) style insertion and lookup performance, even though the hash table was sized to be the next power-of-2 larger than the total number of unique lines in the file.

The djb hash we are replacing prior_hash with performs closer to SHA-1 in terms of having very few collisions. This indicates it provides a reasonably distributed output for this type of input, despite being a much simpler algorithm (and therefore will be much faster to execute).

The string_hash31 function is provided just to compare results with; it is the algorithm commonly used by java.lang.String hashCode().
However, life isn't quite this simple. djb produces a 32-bit hash code, but our hash tables are always smaller than 2^32 buckets. Mashing the 32-bit code into an array index used to be done by simply taking the lower bits of the hash code with a bitwise AND operator. This unfortunately still produces many collisions, e.g. 32 on the linux-2.6 repository files.

From [1] we can apply a final "cleanup" step to the hash code to mix the bits together a little better, and give priority to the higher order bits as they include data from more bytes of input:

  test_jgit: 931 files; 122 avg. unique lines/file
   Algorithm    | Collisions
   -------------+-----------
   prior_hash           418
   djb                    5
   djb + cleanup          6

  test_linux26: 30198 files; 258 avg. unique lines/file
   Algorithm    | Collisions
   -------------+-----------
   prior_hash          8675
   djb                   32
   djb + cleanup          7

  test_frameworks_base: 8381 files; 184 avg. unique lines/file
   Algorithm    | Collisions
   -------------+-----------
   prior_hash          4615
   djb                   10
   djb + cleanup          7

This is a massive improvement, as the number of collisions for common inputs drops to acceptable levels, and we haven't really made the hash functions any more complex than they were before.

[1] http://lkml.org/lkml/2009/10/27/404

Change-Id: Ia753b695de9526a157ddba265824240bd05dead1
Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
14 years ago · 11f99fecfd
3 changed files with 45 additions and 46 deletions
--- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/PatienceDiffIndex.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/PatienceDiffIndex.java
@ -91,10 +91,11 @@ final class PatienceDiffIndex<S extends Sequence> {
 	/** 1 past the last valid entry in {@link #pCommon}. */
 	private final int pEnd;
-	/** Keyed by {@code cmp.hash() & tableMask} to yield an entry offset. */
+	/** Keyed by {@link #hash(HashedSequence, int)} to get an entry offset. */
 	private final int[] table;
-	private final int tableMask;
+	/** Number of low bits to discard from a key to index {@link #table}. */
 	private final int keyShift;
 	// To save memory the buckets for hash chains are stored in correlated
 	// arrays. This permits us to get 3 values per entry, without paying
@ -158,8 +159,9 @@ final class PatienceDiffIndex<S extends Sequence> {
 		this.pEnd = pCnt;
 		final int sz = region.getLengthB();
-		table = new int[tableSize(sz)];
+		final int tableBits = tableBits(sz);
-		tableMask = table.length - 1;
+		table = new int[1 << tableBits];
 		keyShift = 32 - tableBits;
 		// As we insert elements we preincrement so that 0 is never a
 		// valid entry. Therefore we have to allocate one extra space.
@ -187,7 +189,7 @@ final class PatienceDiffIndex<S extends Sequence> {
 		final int end = region.endB;
 		int pIdx = pBegin;
 		SCAN: while (ptr < end) {
-			final int tIdx = cmp.hash(b, ptr) & tableMask;
+			final int tIdx = hash(b, ptr);
 			if (pIdx < pEnd) {
 				final long priorRec = pCommon[pIdx];
@ -244,7 +246,7 @@ final class PatienceDiffIndex<S extends Sequence> {
 		final int end = region.endA;
 		int pLast = pBegin - 1;
 		SCAN: while (ptr < end) {
-			final int tIdx = cmp.hash(a, ptr) & tableMask;
+			final int tIdx = hash(a, ptr);
 			for (int eIdx = table[tIdx]; eIdx != 0; eIdx = next[eIdx]) {
 				final long rec = ptrs[eIdx];
@ -391,6 +393,10 @@ final class PatienceDiffIndex<S extends Sequence> {
 		return lcs;
 	}
 	private int hash(HashedSequence<S> s, int idx) {
 		return (cmp.hash(s, idx) * 0x9e370001 /* mix bits */) >>> keyShift;
 	}
 	private static boolean isDuplicate(long rec) {
 		return (((int) rec) & DUPLICATE_MASK) != 0;
 	}
@ -407,11 +413,12 @@ final class PatienceDiffIndex<S extends Sequence> {
 		return (int) (rec >>> B_SHIFT);
 	}
-	private static int tableSize(final int worstCaseBlockCnt) {
+	private static int tableBits(final int sz) {
-		int shift = 32 - Integer.numberOfLeadingZeros(worstCaseBlockCnt);
+		int bits = 31 - Integer.numberOfLeadingZeros(sz);
-		int sz = 1 << (shift - 1);
+		if (bits == 0)
-		if (sz < worstCaseBlockCnt)
+			bits = 1;
-			sz <<= 1;
+		if (1 << bits < sz)
-		return sz;
+			bits++;
 		return bits;
 	}
 }
--- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawTextComparator.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawTextComparator.java
@ -78,7 +78,7 @@ public abstract class RawTextComparator extends SequenceComparator<RawText> {
 		protected int hashRegion(final byte[] raw, int ptr, final int end) {
 			int hash = 5381;
 			for (; ptr < end; ptr++)
-				hash = (hash << 5) ^ (raw[ptr] & 0xff);
+				hash = ((hash << 5) + hash) + (raw[ptr] & 0xff);
 			return hash;
 		}
 	};
@ -128,7 +128,7 @@ public abstract class RawTextComparator extends SequenceComparator<RawText> {
 			for (; ptr < end; ptr++) {
 				byte c = raw[ptr];
 				if (!isWhitespace(c))
-					hash = (hash << 5) ^ (c & 0xff);
+					hash = ((hash << 5) + hash) + (c & 0xff);
 			}
 			return hash;
 		}
@ -163,9 +163,8 @@ public abstract class RawTextComparator extends SequenceComparator<RawText> {
 		protected int hashRegion(final byte[] raw, int ptr, int end) {
 			int hash = 5381;
 			ptr = trimLeadingWhitespace(raw, ptr, end);
-			for (; ptr < end; ptr++) {
+			for (; ptr < end; ptr++)
-				hash = (hash << 5) ^ (raw[ptr] & 0xff);
+				hash = ((hash << 5) + hash) + (raw[ptr] & 0xff);
 			}
 			return hash;
 		}
 	};
@ -199,9 +198,8 @@ public abstract class RawTextComparator extends SequenceComparator<RawText> {
 		protected int hashRegion(final byte[] raw, int ptr, int end) {
 			int hash = 5381;
 			end = trimTrailingWhitespace(raw, ptr, end);
-			for (; ptr < end; ptr++) {
+			for (; ptr < end; ptr++)
-				hash = (hash << 5) ^ (raw[ptr] & 0xff);
+				hash = ((hash << 5) + hash) + (raw[ptr] & 0xff);
 			}
 			return hash;
 		}
 	};
@ -247,7 +245,7 @@ public abstract class RawTextComparator extends SequenceComparator<RawText> {
 			end = trimTrailingWhitespace(raw, ptr, end);
 			while (ptr < end) {
 				byte c = raw[ptr];
-				hash = (hash << 5) ^ (c & 0xff);
+				hash = ((hash << 5) + hash) + (c & 0xff);
 				if (isWhitespace(c))
 					ptr = trimLeadingWhitespace(raw, ptr, end);
 				else
--- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java
@ -68,20 +68,13 @@ class SimilarityIndex {
 	/** The {@link #idHash} table stops growing at {@code 1 << MAX_HASH_BITS}. */
 	private static final int MAX_HASH_BITS = 17;
 	/** The {@link #idHash} table will not grow bigger than this, ever. */
 	private static final int MAX_HASH_SIZE = 1 << MAX_HASH_BITS;
 	/** Prime just before {@link #MAX_HASH_SIZE}. */
 	private static final int P = 131071;
 	/**
 	 * Shift to apply before storing a key.
 	 * <p>
 	 * Within the 64 bit table record space, we leave the highest bit unset so
-	 * all values are positive, and we need {@link #MAX_HASH_BITS} bits for the
+	 * all values are positive. The lower 32 bits to count bytes.
 	 * keys. The lower 32 bits are used to count bytes impacted.
 	 */
-	private static final int KEY_SHIFT = 64 - 1 - MAX_HASH_BITS;
+	private static final int KEY_SHIFT = 32;
 	/** Total size of the file we hashed into the structure. */
 	private long fileSize;
@ -100,8 +93,12 @@ class SimilarityIndex {
 	 */
 	private long[] idHash;
 	/** {@code idHash.length == 1 << idHashBits}. */
 	private int idHashBits;
 	SimilarityIndex() {
-		idHash = new long[256];
+		idHashBits = 8;
 		idHash = new long[1 << idHashBits];
 	}
 	long getFileSize() {
@ -138,7 +135,7 @@ class SimilarityIndex {
 				int c = raw[ptr++] & 0xff;
 				if (c == '\n')
 					break;
-				hash = (hash << 5) ^ c;
+				hash = (hash << 5) + hash + c;
 			} while (ptr < end && ptr - start < 64);
 			add(hash, ptr - start);
 		}
@ -166,7 +163,7 @@ class SimilarityIndex {
 				int c = buf[ptr++] & 0xff;
 				if (c == '\n')
 					break;
-				hash = (hash << 5) ^ c;
+				hash = (hash << 5) + hash + c;
 			} while (n < 64 && n < remaining);
 			add(hash, n);
 			remaining -= n;
@ -272,7 +269,8 @@ class SimilarityIndex {
 	}
 	void add(int key, int cnt) {
-		key = hash(key);
+		key = (key * 0x9e370001) >>> 1; // Mix bits and ensure not negative.
 		int j = slot(key);
 		for (;;) {
 			long v = idHash[j];
@ -298,28 +296,24 @@ class SimilarityIndex {
 		}
 	}
 	private static int hash(int key) {
 		// Make the key fit into our table. Since we have a maximum size
 		// that we cap the table at, all keys get squashed before going
 		// into the table. This prevents overflow.
 		//
 		return (key >>> 1) % P;
 	}
 	private int slot(int key) {
-		return key % idHash.length;
+		// We use 31 - idHashBits because the upper bit was already forced
 		// to be 0 and we want the remaining high bits to be used as the
 		// table slot.
 		//
 		return key >>> (31 - idHashBits);
 	}
 	private boolean shouldGrow() {
-		int n = idHash.length;
+		return idHashBits < MAX_HASH_BITS && idHash.length <= idSize * 2;
 		return n < MAX_HASH_SIZE && n <= idSize * 2;
 	}
 	private void grow() {
 		long[] oldHash = idHash;
 		int oldSize = idHash.length;
-		idHash = new long[2 * oldSize];
+		idHashBits++;
 		idHash = new long[1 << idHashBits];
 		for (int i = 0; i < oldSize; i++) {
 			long v = oldHash[i];
 			if (v != 0) {