Merge branch 'rename-detection'

* rename-detection:
  RenameDetector: Only scan deletes if adds exist
  SimilarityRenameDetector: Initialize sizes to 0
  SimilarityRenameDetector: Avoid allocating source index
  SimilarityRenameDetector: Only attempt to index large files once
  SimilarityIndex: Don't overflow internal counter fields
  SimilarityIndex: Accept files larger than 8 MB
  SimilarityIndex: Correct comment explaining the logic
14 years ago · 51bf8ea2a4
4 changed files with 141 additions and 65 deletions
--- a/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java
+++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java
@ -48,10 +48,11 @@ import java.io.IOException;

 import junit.framework.TestCase;

+import org.eclipse.jgit.diff.SimilarityIndex.TableFullException;
 import org.eclipse.jgit.lib.Constants;

 public class SimilarityIndexTest extends TestCase {
-	public void testIndexingSmallObject() {
+	public void testIndexingSmallObject() throws TableFullException {
 		SimilarityIndex si = hash("" //
 				+ "A\n" //
 				+ "B\n" //
@ -70,7 +71,8 @@ public class SimilarityIndexTest extends TestCase {
 		assertEquals(2, si.count(si.findIndex(key_D)));
 	}

-	public void testIndexingLargeObject() throws IOException {
+	public void testIndexingLargeObject() throws IOException,
+			TableFullException {
 		byte[] in = ("" //
 				+ "A\n" //
 				+ "B\n" //
@ -81,7 +83,7 @@ public class SimilarityIndexTest extends TestCase {
 		assertEquals(2, si.size());
 	}

-	public void testCommonScore_SameFiles() {
+	public void testCommonScore_SameFiles() throws TableFullException {
 		String text = "" //
 				+ "A\n" //
 				+ "B\n" //
@ -96,21 +98,22 @@ public class SimilarityIndexTest extends TestCase {
 		assertEquals(100, dst.score(src, 100));
 	}

-	public void testCommonScore_EmptyFiles() {
+	public void testCommonScore_EmptyFiles() throws TableFullException {
 		SimilarityIndex src = hash("");
 		SimilarityIndex dst = hash("");
 		assertEquals(0, src.common(dst));
 		assertEquals(0, dst.common(src));
 	}

-	public void testCommonScore_TotallyDifferentFiles() {
+	public void testCommonScore_TotallyDifferentFiles()
+			throws TableFullException {
 		SimilarityIndex src = hash("A\n");
 		SimilarityIndex dst = hash("D\n");
 		assertEquals(0, src.common(dst));
 		assertEquals(0, dst.common(src));
 	}

-	public void testCommonScore_SimiliarBy75() {
+	public void testCommonScore_SimiliarBy75() throws TableFullException {
 		SimilarityIndex src = hash("A\nB\nC\nD\n");
 		SimilarityIndex dst = hash("A\nB\nC\nQ\n");
 		assertEquals(6, src.common(dst));
@ -120,10 +123,11 @@ public class SimilarityIndexTest extends TestCase {
 		assertEquals(75, dst.score(src, 100));
 	}

-	private static SimilarityIndex hash(String text) {
+	private static SimilarityIndex hash(String text) throws TableFullException {
 		SimilarityIndex src = new SimilarityIndex() {
 			@Override
-			void hash(byte[] raw, int ptr, final int end) {
+			void hash(byte[] raw, int ptr, final int end)
+					throws TableFullException {
 				while (ptr < end) {
 					int hash = raw[ptr] & 0xff;
 					int start = ptr;
@ -143,7 +147,7 @@ public class SimilarityIndexTest extends TestCase {
 		return src;
 	}

-	private static int keyFor(String line) {
+	private static int keyFor(String line) throws TableFullException {
 		SimilarityIndex si = hash(line);
 		assertEquals("single line scored", 1, si.size());
 		return si.key(0);
--- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java
@ -57,6 +57,7 @@ import java.util.List;

 import org.eclipse.jgit.JGitText;
 import org.eclipse.jgit.diff.DiffEntry.ChangeType;
+import org.eclipse.jgit.diff.SimilarityIndex.TableFullException;
 import org.eclipse.jgit.lib.AbbreviatedObjectId;
 import org.eclipse.jgit.lib.FileMode;
 import org.eclipse.jgit.lib.NullProgressMonitor;
@ -356,9 +357,17 @@ public class RenameDetector {

 			if (pm == null)
 				pm = NullProgressMonitor.INSTANCE;
+
+			if (0 < breakScore)
 				breakModifies(reader, pm);
+
+			if (!added.isEmpty() && !deleted.isEmpty())
 				findExactRenames(pm);
+
+			if (!added.isEmpty() && !deleted.isEmpty())
 				findContentRenames(reader, pm);
+
+			if (0 < breakScore && !added.isEmpty() && !deleted.isEmpty())
 				rejoinModifies(pm);

 			entries.addAll(added);
@ -382,9 +391,6 @@ public class RenameDetector {

 	private void breakModifies(ContentSource.Pair reader, ProgressMonitor pm)
 			throws IOException {
-		if (breakScore <= 0)
-			return;
-
 		ArrayList<DiffEntry> newEntries = new ArrayList<DiffEntry>(entries.size());

 		pm.beginTask(JGitText.get().renamesBreakingModifies, entries.size());
@ -445,6 +451,7 @@ public class RenameDetector {

 	private int calculateModifyScore(ContentSource.Pair reader, DiffEntry d)
 			throws IOException {
+		try {
 			SimilarityIndex src = new SimilarityIndex();
 			src.hash(reader.open(OLD, d));
 			src.sort();
@ -453,21 +460,27 @@ public class RenameDetector {
 			dst.hash(reader.open(NEW, d));
 			dst.sort();
 			return src.score(dst, 100);
+		} catch (TableFullException tableFull) {
+			// If either table overflowed while being constructed, don't allow
+			// the pair to be broken. Returning 1 higher than breakScore will
+			// ensure it's not similar, but not quite dissimilar enough to break.
+			//
+			overRenameLimit = true;
+			return breakScore + 1;
+		}
 	}

 	private void findContentRenames(ContentSource.Pair reader,
 			ProgressMonitor pm)
 			throws IOException {
 		int cnt = Math.max(added.size(), deleted.size());
-		if (cnt == 0)
-			return;
-
 		if (getRenameLimit() == 0 || cnt <= getRenameLimit()) {
 			SimilarityRenameDetector d;

 			d = new SimilarityRenameDetector(reader, deleted, added);
 			d.setRenameScore(getRenameScore());
 			d.compute(pm);
+			overRenameLimit |= d.isTableOverflow();
 			deleted = d.getLeftOverSources();
 			added = d.getLeftOverDestinations();
 			entries.addAll(d.getMatches());
@ -478,9 +491,6 @@ public class RenameDetector {

 	@SuppressWarnings("unchecked")
 	private void findExactRenames(ProgressMonitor pm) {
-		if (added.isEmpty() || deleted.isEmpty())
-			return;
-
 		pm.beginTask(JGitText.get().renamesFindingExact, //
 				added.size() + added.size() + deleted.size()
 						+ added.size() * deleted.size());
--- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java
@ -65,8 +65,8 @@ import org.eclipse.jgit.lib.ObjectStream;
 * file are discovered.
 */
 class SimilarityIndex {
-	/** The {@link #idHash} table stops growing at {@code 1 << MAX_HASH_BITS}. */
-	private static final int MAX_HASH_BITS = 17;
+	/** A special {@link TableFullException} used in place of OutOfMemoryError. */
+	private static final TableFullException TABLE_FULL_OUT_OF_MEMORY = new TableFullException();

 	/**
 	 * Shift to apply before storing a key.
@ -76,20 +76,26 @@ class SimilarityIndex {
 	 */
 	private static final int KEY_SHIFT = 32;

+	/** Maximum value of the count field, also mask to extract the count. */
+	private static final long MAX_COUNT = (1L << KEY_SHIFT) - 1;
+
 	/** Total size of the file we hashed into the structure. */
 	private long fileSize;

 	/** Number of non-zero entries in {@link #idHash}. */
 	private int idSize;

+	/** {@link #idSize} that triggers {@link #idHash} to double in size. */
+	private int idGrowAt;
+
 	/**
 	 * Pairings of content keys and counters.
 	 * <p>
 	 * Slots in the table are actually two ints wedged into a single long. The
-	 * upper {@link #MAX_HASH_BITS} bits stores the content key, and the
-	 * remaining lower bits stores the number of bytes associated with that key.
-	 * Empty slots are denoted by 0, which cannot occur because the count cannot
-	 * be 0. Values can only be positive, which we enforce during key addition.
+	 * upper 32 bits store the content key, and the remaining lower bits store
+	 * the number of bytes associated with that key. Empty slots are denoted by
+	 * 0, which cannot occur because the count cannot be 0. Values can only be
+	 * positive, which we enforce during key addition.
 	 */
 	private long[] idHash;

@ -99,6 +105,7 @@ class SimilarityIndex {
 	SimilarityIndex() {
 		idHashBits = 8;
 		idHash = new long[1 << idHashBits];
+		idGrowAt = growAt(idHashBits);
 	}

 	long getFileSize() {
@ -109,7 +116,8 @@ class SimilarityIndex {
 		fileSize = size;
 	}

-	void hash(ObjectLoader obj) throws MissingObjectException, IOException {
+	void hash(ObjectLoader obj) throws MissingObjectException, IOException,
+			TableFullException {
 		if (obj.isLarge()) {
 			ObjectStream in = obj.openStream();
 			try {
@ -125,7 +133,7 @@ class SimilarityIndex {
 		}
 	}

-	void hash(byte[] raw, int ptr, final int end) {
+	void hash(byte[] raw, int ptr, final int end) throws TableFullException {
 		while (ptr < end) {
 			int hash = 5381;
 			int start = ptr;
@ -141,7 +149,8 @@ class SimilarityIndex {
 		}
 	}

-	void hash(InputStream in, long remaining) throws IOException {
+	void hash(InputStream in, long remaining) throws IOException,
+			TableFullException {
 		byte[] buf = new byte[4096];
 		int ptr = 0;
 		int cnt = 0;
@ -190,11 +199,11 @@ class SimilarityIndex {
 		return (int) ((common(dst) * maxScore) / max);
 	}

-	int common(SimilarityIndex dst) {
+	long common(SimilarityIndex dst) {
 		return common(this, dst);
 	}

-	private static int common(SimilarityIndex src, SimilarityIndex dst) {
+	private static long common(SimilarityIndex src, SimilarityIndex dst) {
 		int srcIdx = src.packedIndex(0);
 		int dstIdx = dst.packedIndex(0);
 		long[] srcHash = src.idHash;
@ -202,12 +211,12 @@ class SimilarityIndex {
 		return common(srcHash, srcIdx, dstHash, dstIdx);
 	}

-	private static int common(long[] srcHash, int srcIdx, //
+	private static long common(long[] srcHash, int srcIdx, //
 			long[] dstHash, int dstIdx) {
 		if (srcIdx == srcHash.length || dstIdx == dstHash.length)
 			return 0;

-		int common = 0;
+		long common = 0;
 		int srcKey = keyOf(srcHash[srcIdx]);
 		int dstKey = keyOf(dstHash[dstIdx]);

@ -230,8 +239,8 @@ class SimilarityIndex {
 					break;
 				srcKey = keyOf(srcHash[srcIdx]);

-			} else /* if (srcKey > dstKey) */{
-				// Regions of dst which do not appear in dst.
+			} else /* if (dstKey < srcKey) */{
+				// Regions of dst which do not appear in src.
 				if (++dstIdx == dstHash.length)
 					break;
 				dstKey = keyOf(dstHash[dstIdx]);
@ -268,7 +277,7 @@ class SimilarityIndex {
 		return (idHash.length - idSize) + idx;
 	}

-	void add(int key, int cnt) {
+	void add(int key, int cnt) throws TableFullException {
 		key = (key * 0x9e370001) >>> 1; // Mix bits and ensure not negative.

 		int j = slot(key);
@ -276,18 +285,20 @@ class SimilarityIndex {
 			long v = idHash[j];
 			if (v == 0) {
 				// Empty slot in the table, store here.
-				if (shouldGrow()) {
+				if (idGrowAt <= idSize) {
 					grow();
 					j = slot(key);
 					continue;
 				}
-				idHash[j] = (((long) key) << KEY_SHIFT) | cnt;
+				idHash[j] = pair(key, cnt);
 				idSize++;
 				return;

 			} else if (keyOf(v) == key) {
-				// Same key, increment the counter.
-				idHash[j] = v + cnt;
+				// Same key, increment the counter. If it overflows, fail
+				// indexing to prevent the key from being impacted.
+				//
+				idHash[j] = pair(key, countOf(v) + cnt);
 				return;

 			} else if (++j >= idHash.length) {
@ -296,6 +307,12 @@ class SimilarityIndex {
 		}
 	}

+	private static long pair(int key, long cnt) throws TableFullException {
+		if (MAX_COUNT < cnt)
+			throw new TableFullException();
+		return (((long) key) << KEY_SHIFT) | cnt;
+	}
+
 	private int slot(int key) {
 		// We use 31 - idHashBits because the upper bit was already forced
 		// to be 0 and we want the remaining high bits to be used as the
@ -304,16 +321,26 @@ class SimilarityIndex {
 		return key >>> (31 - idHashBits);
 	}

-	private boolean shouldGrow() {
-		return idHashBits < MAX_HASH_BITS && idHash.length <= idSize * 2;
+	private static int growAt(int idHashBits) {
+		return (1 << idHashBits) * (idHashBits - 3) / idHashBits;
 	}

-	private void grow() {
+	private void grow() throws TableFullException {
+		if (idHashBits == 30)
+			throw new TableFullException();
+
 		long[] oldHash = idHash;
 		int oldSize = idHash.length;

 		idHashBits++;
+		idGrowAt = growAt(idHashBits);
+
+		try {
 			idHash = new long[1 << idHashBits];
+		} catch (OutOfMemoryError noMemory) {
+			throw TABLE_FULL_OUT_OF_MEMORY;
+		}
+
 		for (int i = 0; i < oldSize; i++) {
 			long v = oldHash[i];
 			if (v != 0) {
@ -330,7 +357,11 @@ class SimilarityIndex {
 		return (int) (v >>> KEY_SHIFT);
 	}

-	private static int countOf(long v) {
-		return (int) v;
+	private static long countOf(long v) {
+		return v & MAX_COUNT;
+	}
+
+	static class TableFullException extends Exception {
+		private static final long serialVersionUID = 1L;
 	}
 }
--- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java
@ -49,10 +49,12 @@ import static org.eclipse.jgit.diff.DiffEntry.Side.OLD;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.BitSet;
 import java.util.List;

 import org.eclipse.jgit.JGitText;
 import org.eclipse.jgit.diff.DiffEntry.ChangeType;
+import org.eclipse.jgit.diff.SimilarityIndex.TableFullException;
 import org.eclipse.jgit.lib.FileMode;
 import org.eclipse.jgit.lib.NullProgressMonitor;
 import org.eclipse.jgit.lib.ProgressMonitor;
@ -110,6 +112,9 @@ class SimilarityRenameDetector {
 	/** Score a pair must exceed to be considered a rename. */
 	private int renameScore = 60;

+	/** Set if any {@link SimilarityIndex.TableFullException} occurs. */
+	private boolean tableOverflow;
+
 	private List<DiffEntry> out;

 	SimilarityRenameDetector(ContentSource.Pair reader, List<DiffEntry> srcs,
@ -182,6 +187,10 @@ class SimilarityRenameDetector {
 		return dsts;
 	}

+	boolean isTableOverflow() {
+		return tableOverflow;
+	}
+
 	private static List<DiffEntry> compactSrcList(List<DiffEntry> in) {
 		ArrayList<DiffEntry> r = new ArrayList<DiffEntry>(in.size());
 		for (DiffEntry e : in) {
@ -208,25 +217,22 @@ class SimilarityRenameDetector {

 		long[] srcSizes = new long[srcs.size()];
 		long[] dstSizes = new long[dsts.size()];
-
-		// Init the size arrays to some value that indicates that we haven't
-		// calculated the size yet. Since sizes cannot be negative, -1 will work
-		Arrays.fill(srcSizes, -1);
-		Arrays.fill(dstSizes, -1);
+		BitSet dstTooLarge = null;

 		// Consider each pair of files, if the score is above the minimum
 		// threshold we need record that scoring in the matrix so we can
 		// later find the best matches.
 		//
 		int mNext = 0;
-		for (int srcIdx = 0; srcIdx < srcs.size(); srcIdx++) {
+		SRC: for (int srcIdx = 0; srcIdx < srcs.size(); srcIdx++) {
 			DiffEntry srcEnt = srcs.get(srcIdx);
 			if (!isFile(srcEnt.oldMode)) {
 				pm.update(dsts.size());
 				continue;
 			}

-			SimilarityIndex s = hash(OLD, srcEnt);
+			SimilarityIndex s = null;
+
 			for (int dstIdx = 0; dstIdx < dsts.size(); dstIdx++) {
 				DiffEntry dstEnt = dsts.get(dstIdx);

@ -240,15 +246,20 @@ class SimilarityRenameDetector {
 					continue;
 				}

+				if (dstTooLarge != null && dstTooLarge.get(dstIdx)) {
+					pm.update(1);
+					continue;
+				}
+
 				long srcSize = srcSizes[srcIdx];
-				if (srcSize < 0) {
-					srcSize = size(OLD, srcEnt);
+				if (srcSize == 0) {
+					srcSize = size(OLD, srcEnt) + 1;
 					srcSizes[srcIdx] = srcSize;
 				}

 				long dstSize = dstSizes[dstIdx];
-				if (dstSize < 0) {
-					dstSize = size(NEW, dstEnt);
+				if (dstSize == 0) {
+					dstSize = size(NEW, dstEnt) + 1;
 					dstSizes[dstIdx] = dstSize;
 				}

@ -260,7 +271,27 @@ class SimilarityRenameDetector {
 					continue;
 				}

-				SimilarityIndex d = hash(NEW, dstEnt);
+				if (s == null) {
+					try {
+						s = hash(OLD, srcEnt);
+					} catch (TableFullException tableFull) {
+						tableOverflow = true;
+						continue SRC;
+					}
+				}
+
+				SimilarityIndex d;
+				try {
+					d = hash(NEW, dstEnt);
+				} catch (TableFullException tableFull) {
+					if (dstTooLarge == null)
+						dstTooLarge = new BitSet(dsts.size());
+					dstTooLarge.set(dstIdx);
+					tableOverflow = true;
+					pm.update(1);
+					continue;
+				}
+
 				int contentScore = s.score(d, 10000);

 				// nameScore returns a value between 0 and 100, but we want it
@ -336,7 +367,7 @@ class SimilarityRenameDetector {
 	}

 	private SimilarityIndex hash(DiffEntry.Side side, DiffEntry ent)
-			throws IOException {
+			throws IOException, TableFullException {
 		SimilarityIndex r = new SimilarityIndex();
 		r.hash(reader.open(side, ent));
 		r.sort();