diff --git a/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java index 7e42e5358..1da5828b3 100644 --- a/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java +++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java @@ -48,10 +48,11 @@ import java.io.IOException; import junit.framework.TestCase; +import org.eclipse.jgit.diff.SimilarityIndex.TableFullException; import org.eclipse.jgit.lib.Constants; public class SimilarityIndexTest extends TestCase { - public void testIndexingSmallObject() { + public void testIndexingSmallObject() throws TableFullException { SimilarityIndex si = hash("" // + "A\n" // + "B\n" // @@ -70,7 +71,8 @@ public class SimilarityIndexTest extends TestCase { assertEquals(2, si.count(si.findIndex(key_D))); } - public void testIndexingLargeObject() throws IOException { + public void testIndexingLargeObject() throws IOException, + TableFullException { byte[] in = ("" // + "A\n" // + "B\n" // @@ -81,7 +83,7 @@ public class SimilarityIndexTest extends TestCase { assertEquals(2, si.size()); } - public void testCommonScore_SameFiles() { + public void testCommonScore_SameFiles() throws TableFullException { String text = "" // + "A\n" // + "B\n" // @@ -96,21 +98,22 @@ public class SimilarityIndexTest extends TestCase { assertEquals(100, dst.score(src, 100)); } - public void testCommonScore_EmptyFiles() { + public void testCommonScore_EmptyFiles() throws TableFullException { SimilarityIndex src = hash(""); SimilarityIndex dst = hash(""); assertEquals(0, src.common(dst)); assertEquals(0, dst.common(src)); } - public void testCommonScore_TotallyDifferentFiles() { + public void testCommonScore_TotallyDifferentFiles() + throws TableFullException { SimilarityIndex src = hash("A\n"); SimilarityIndex dst = hash("D\n"); assertEquals(0, src.common(dst)); assertEquals(0, dst.common(src)); } - public void testCommonScore_SimiliarBy75() { + public void testCommonScore_SimiliarBy75() throws TableFullException { SimilarityIndex src = hash("A\nB\nC\nD\n"); SimilarityIndex dst = hash("A\nB\nC\nQ\n"); assertEquals(6, src.common(dst)); @@ -120,10 +123,11 @@ public class SimilarityIndexTest extends TestCase { assertEquals(75, dst.score(src, 100)); } - private static SimilarityIndex hash(String text) { + private static SimilarityIndex hash(String text) throws TableFullException { SimilarityIndex src = new SimilarityIndex() { @Override - void hash(byte[] raw, int ptr, final int end) { + void hash(byte[] raw, int ptr, final int end) + throws TableFullException { while (ptr < end) { int hash = raw[ptr] & 0xff; int start = ptr; @@ -143,7 +147,7 @@ public class SimilarityIndexTest extends TestCase { return src; } - private static int keyFor(String line) { + private static int keyFor(String line) throws TableFullException { SimilarityIndex si = hash(line); assertEquals("single line scored", 1, si.size()); return si.key(0); diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java index 66218f640..dfaf5886e 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java @@ -57,6 +57,7 @@ import java.util.List; import org.eclipse.jgit.JGitText; import org.eclipse.jgit.diff.DiffEntry.ChangeType; +import org.eclipse.jgit.diff.SimilarityIndex.TableFullException; import org.eclipse.jgit.lib.AbbreviatedObjectId; import org.eclipse.jgit.lib.FileMode; import org.eclipse.jgit.lib.NullProgressMonitor; @@ -356,9 +357,17 @@ public class RenameDetector { if (pm == null) pm = NullProgressMonitor.INSTANCE; + + if (0 < breakScore) breakModifies(reader, pm); + + if (!added.isEmpty() && !deleted.isEmpty()) findExactRenames(pm); + + if (!added.isEmpty() && !deleted.isEmpty()) findContentRenames(reader, pm); + + if (0 < breakScore && !added.isEmpty() && !deleted.isEmpty()) rejoinModifies(pm); entries.addAll(added); @@ -382,9 +391,6 @@ public class RenameDetector { private void breakModifies(ContentSource.Pair reader, ProgressMonitor pm) throws IOException { - if (breakScore <= 0) - return; - ArrayList newEntries = new ArrayList(entries.size()); pm.beginTask(JGitText.get().renamesBreakingModifies, entries.size()); @@ -445,29 +451,36 @@ public class RenameDetector { private int calculateModifyScore(ContentSource.Pair reader, DiffEntry d) throws IOException { - SimilarityIndex src = new SimilarityIndex(); - src.hash(reader.open(OLD, d)); - src.sort(); - - SimilarityIndex dst = new SimilarityIndex(); - dst.hash(reader.open(NEW, d)); - dst.sort(); - return src.score(dst, 100); + try { + SimilarityIndex src = new SimilarityIndex(); + src.hash(reader.open(OLD, d)); + src.sort(); + + SimilarityIndex dst = new SimilarityIndex(); + dst.hash(reader.open(NEW, d)); + dst.sort(); + return src.score(dst, 100); + } catch (TableFullException tableFull) { + // If either table overflowed while being constructed, don't allow + // the pair to be broken. Returning 1 higher than breakScore will + // ensure its not similar, but not quite dissimilar enough to break. + // + overRenameLimit = true; + return breakScore + 1; + } } private void findContentRenames(ContentSource.Pair reader, ProgressMonitor pm) throws IOException { int cnt = Math.max(added.size(), deleted.size()); - if (cnt == 0) - return; - if (getRenameLimit() == 0 || cnt <= getRenameLimit()) { SimilarityRenameDetector d; d = new SimilarityRenameDetector(reader, deleted, added); d.setRenameScore(getRenameScore()); d.compute(pm); + overRenameLimit |= d.isTableOverflow(); deleted = d.getLeftOverSources(); added = d.getLeftOverDestinations(); entries.addAll(d.getMatches()); @@ -478,9 +491,6 @@ public class RenameDetector { @SuppressWarnings("unchecked") private void findExactRenames(ProgressMonitor pm) { - if (added.isEmpty() || deleted.isEmpty()) - return; - pm.beginTask(JGitText.get().renamesFindingExact, // added.size() + added.size() + deleted.size() + added.size() * deleted.size()); diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java index 6627268e4..17ccb9726 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java @@ -65,8 +65,8 @@ import org.eclipse.jgit.lib.ObjectStream; * file are discovered. */ class SimilarityIndex { - /** The {@link #idHash} table stops growing at {@code 1 << MAX_HASH_BITS}. */ - private static final int MAX_HASH_BITS = 17; + /** A special {@link TableFullException} used in place of OutOfMemoryError. */ + private static final TableFullException TABLE_FULL_OUT_OF_MEMORY = new TableFullException(); /** * Shift to apply before storing a key. @@ -76,20 +76,26 @@ class SimilarityIndex { */ private static final int KEY_SHIFT = 32; + /** Maximum value of the count field, also mask to extract the count. */ + private static final long MAX_COUNT = (1L << KEY_SHIFT) - 1; + /** Total size of the file we hashed into the structure. */ private long fileSize; /** Number of non-zero entries in {@link #idHash}. */ private int idSize; + /** {@link #idSize} that triggers {@link #idHash} to double in size. */ + private int idGrowAt; + /** * Pairings of content keys and counters. *

* Slots in the table are actually two ints wedged into a single long. The - * upper {@link #MAX_HASH_BITS} bits stores the content key, and the - * remaining lower bits stores the number of bytes associated with that key. - * Empty slots are denoted by 0, which cannot occur because the count cannot - * be 0. Values can only be positive, which we enforce during key addition. + * upper 32 bits stores the content key, and the remaining lower bits stores + * the number of bytes associated with that key. Empty slots are denoted by + * 0, which cannot occur because the count cannot be 0. Values can only be + * positive, which we enforce during key addition. */ private long[] idHash; @@ -99,6 +105,7 @@ class SimilarityIndex { SimilarityIndex() { idHashBits = 8; idHash = new long[1 << idHashBits]; + idGrowAt = growAt(idHashBits); } long getFileSize() { @@ -109,7 +116,8 @@ class SimilarityIndex { fileSize = size; } - void hash(ObjectLoader obj) throws MissingObjectException, IOException { + void hash(ObjectLoader obj) throws MissingObjectException, IOException, + TableFullException { if (obj.isLarge()) { ObjectStream in = obj.openStream(); try { @@ -125,7 +133,7 @@ class SimilarityIndex { } } - void hash(byte[] raw, int ptr, final int end) { + void hash(byte[] raw, int ptr, final int end) throws TableFullException { while (ptr < end) { int hash = 5381; int start = ptr; @@ -141,7 +149,8 @@ class SimilarityIndex { } } - void hash(InputStream in, long remaining) throws IOException { + void hash(InputStream in, long remaining) throws IOException, + TableFullException { byte[] buf = new byte[4096]; int ptr = 0; int cnt = 0; @@ -190,11 +199,11 @@ class SimilarityIndex { return (int) ((common(dst) * maxScore) / max); } - int common(SimilarityIndex dst) { + long common(SimilarityIndex dst) { return common(this, dst); } - private static int common(SimilarityIndex src, SimilarityIndex dst) { + private static long common(SimilarityIndex src, SimilarityIndex dst) { int srcIdx = src.packedIndex(0); int dstIdx = dst.packedIndex(0); long[] srcHash = src.idHash; @@ -202,12 +211,12 @@ class SimilarityIndex { return common(srcHash, srcIdx, dstHash, dstIdx); } - private static int common(long[] srcHash, int srcIdx, // + private static long common(long[] srcHash, int srcIdx, // long[] dstHash, int dstIdx) { if (srcIdx == srcHash.length || dstIdx == dstHash.length) return 0; - int common = 0; + long common = 0; int srcKey = keyOf(srcHash[srcIdx]); int dstKey = keyOf(dstHash[dstIdx]); @@ -230,8 +239,8 @@ class SimilarityIndex { break; srcKey = keyOf(srcHash[srcIdx]); - } else /* if (srcKey > dstKey) */{ - // Regions of dst which do not appear in dst. + } else /* if (dstKey < srcKey) */{ + // Regions of dst which do not appear in src. if (++dstIdx == dstHash.length) break; dstKey = keyOf(dstHash[dstIdx]); @@ -268,7 +277,7 @@ class SimilarityIndex { return (idHash.length - idSize) + idx; } - void add(int key, int cnt) { + void add(int key, int cnt) throws TableFullException { key = (key * 0x9e370001) >>> 1; // Mix bits and ensure not negative. int j = slot(key); @@ -276,18 +285,20 @@ class SimilarityIndex { long v = idHash[j]; if (v == 0) { // Empty slot in the table, store here. - if (shouldGrow()) { + if (idGrowAt <= idSize) { grow(); j = slot(key); continue; } - idHash[j] = (((long) key) << KEY_SHIFT) | cnt; + idHash[j] = pair(key, cnt); idSize++; return; } else if (keyOf(v) == key) { - // Same key, increment the counter. - idHash[j] = v + cnt; + // Same key, increment the counter. If it overflows, fail + // indexing to prevent the key from being impacted. + // + idHash[j] = pair(key, countOf(v) + cnt); return; } else if (++j >= idHash.length) { @@ -296,6 +307,12 @@ class SimilarityIndex { } } + private static long pair(int key, long cnt) throws TableFullException { + if (MAX_COUNT < cnt) + throw new TableFullException(); + return (((long) key) << KEY_SHIFT) | cnt; + } + private int slot(int key) { // We use 31 - idHashBits because the upper bit was already forced // to be 0 and we want the remaining high bits to be used as the @@ -304,16 +321,26 @@ class SimilarityIndex { return key >>> (31 - idHashBits); } - private boolean shouldGrow() { - return idHashBits < MAX_HASH_BITS && idHash.length <= idSize * 2; + private static int growAt(int idHashBits) { + return (1 << idHashBits) * (idHashBits - 3) / idHashBits; } - private void grow() { + private void grow() throws TableFullException { + if (idHashBits == 30) + throw new TableFullException(); + long[] oldHash = idHash; int oldSize = idHash.length; idHashBits++; - idHash = new long[1 << idHashBits]; + idGrowAt = growAt(idHashBits); + + try { + idHash = new long[1 << idHashBits]; + } catch (OutOfMemoryError noMemory) { + throw TABLE_FULL_OUT_OF_MEMORY; + } + for (int i = 0; i < oldSize; i++) { long v = oldHash[i]; if (v != 0) { @@ -330,7 +357,11 @@ class SimilarityIndex { return (int) (v >>> KEY_SHIFT); } - private static int countOf(long v) { - return (int) v; + private static long countOf(long v) { + return v & MAX_COUNT; + } + + static class TableFullException extends Exception { + private static final long serialVersionUID = 1L; } } diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java index 3075c223a..3a9847545 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java @@ -49,10 +49,12 @@ import static org.eclipse.jgit.diff.DiffEntry.Side.OLD; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; +import java.util.BitSet; import java.util.List; import org.eclipse.jgit.JGitText; import org.eclipse.jgit.diff.DiffEntry.ChangeType; +import org.eclipse.jgit.diff.SimilarityIndex.TableFullException; import org.eclipse.jgit.lib.FileMode; import org.eclipse.jgit.lib.NullProgressMonitor; import org.eclipse.jgit.lib.ProgressMonitor; @@ -110,6 +112,9 @@ class SimilarityRenameDetector { /** Score a pair must exceed to be considered a rename. */ private int renameScore = 60; + /** Set if any {@link SimilarityIndex.TableFullException} occurs. */ + private boolean tableOverflow; + private List out; SimilarityRenameDetector(ContentSource.Pair reader, List srcs, @@ -182,6 +187,10 @@ class SimilarityRenameDetector { return dsts; } + boolean isTableOverflow() { + return tableOverflow; + } + private static List compactSrcList(List in) { ArrayList r = new ArrayList(in.size()); for (DiffEntry e : in) { @@ -208,25 +217,22 @@ class SimilarityRenameDetector { long[] srcSizes = new long[srcs.size()]; long[] dstSizes = new long[dsts.size()]; - - // Init the size arrays to some value that indicates that we haven't - // calculated the size yet. Since sizes cannot be negative, -1 will work - Arrays.fill(srcSizes, -1); - Arrays.fill(dstSizes, -1); + BitSet dstTooLarge = null; // Consider each pair of files, if the score is above the minimum // threshold we need record that scoring in the matrix so we can // later find the best matches. // int mNext = 0; - for (int srcIdx = 0; srcIdx < srcs.size(); srcIdx++) { + SRC: for (int srcIdx = 0; srcIdx < srcs.size(); srcIdx++) { DiffEntry srcEnt = srcs.get(srcIdx); if (!isFile(srcEnt.oldMode)) { pm.update(dsts.size()); continue; } - SimilarityIndex s = hash(OLD, srcEnt); + SimilarityIndex s = null; + for (int dstIdx = 0; dstIdx < dsts.size(); dstIdx++) { DiffEntry dstEnt = dsts.get(dstIdx); @@ -240,15 +246,20 @@ class SimilarityRenameDetector { continue; } + if (dstTooLarge != null && dstTooLarge.get(dstIdx)) { + pm.update(1); + continue; + } + long srcSize = srcSizes[srcIdx]; - if (srcSize < 0) { - srcSize = size(OLD, srcEnt); + if (srcSize == 0) { + srcSize = size(OLD, srcEnt) + 1; srcSizes[srcIdx] = srcSize; } long dstSize = dstSizes[dstIdx]; - if (dstSize < 0) { - dstSize = size(NEW, dstEnt); + if (dstSize == 0) { + dstSize = size(NEW, dstEnt) + 1; dstSizes[dstIdx] = dstSize; } @@ -260,7 +271,27 @@ class SimilarityRenameDetector { continue; } - SimilarityIndex d = hash(NEW, dstEnt); + if (s == null) { + try { + s = hash(OLD, srcEnt); + } catch (TableFullException tableFull) { + tableOverflow = true; + continue SRC; + } + } + + SimilarityIndex d; + try { + d = hash(NEW, dstEnt); + } catch (TableFullException tableFull) { + if (dstTooLarge == null) + dstTooLarge = new BitSet(dsts.size()); + dstTooLarge.set(dstIdx); + tableOverflow = true; + pm.update(1); + continue; + } + int contentScore = s.score(d, 10000); // nameScore returns a value between 0 and 100, but we want it @@ -336,7 +367,7 @@ class SimilarityRenameDetector { } private SimilarityIndex hash(DiffEntry.Side side, DiffEntry ent) - throws IOException { + throws IOException, TableFullException { SimilarityIndex r = new SimilarityIndex(); r.hash(reader.open(side, ent)); r.sort();