Browse Source

Merge branch 'rename-detection'

* rename-detection:
  RenameDetector: Only scan deletes if adds exist
  SimilarityRenameDetector: Initialize sizes to 0
  SimilarityRenameDetector: Avoid allocating source index
  SimilarityRenameDetector: Only attempt to index large files once
  SimilarityIndex: Don't overflow internal counter fields
  SimilarityIndex: Accept files larger than 8 MB
  SimilarityIndex: Correct comment explaining the logic
stable-0.10
Shawn O. Pearce 14 years ago
parent
commit
51bf8ea2a4
  1. 22
      org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java
  2. 28
      org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java
  3. 81
      org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java
  4. 57
      org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java

22
org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java

@ -48,10 +48,11 @@ import java.io.IOException;
import junit.framework.TestCase; import junit.framework.TestCase;
import org.eclipse.jgit.diff.SimilarityIndex.TableFullException;
import org.eclipse.jgit.lib.Constants; import org.eclipse.jgit.lib.Constants;
public class SimilarityIndexTest extends TestCase { public class SimilarityIndexTest extends TestCase {
public void testIndexingSmallObject() { public void testIndexingSmallObject() throws TableFullException {
SimilarityIndex si = hash("" // SimilarityIndex si = hash("" //
+ "A\n" // + "A\n" //
+ "B\n" // + "B\n" //
@ -70,7 +71,8 @@ public class SimilarityIndexTest extends TestCase {
assertEquals(2, si.count(si.findIndex(key_D))); assertEquals(2, si.count(si.findIndex(key_D)));
} }
public void testIndexingLargeObject() throws IOException { public void testIndexingLargeObject() throws IOException,
TableFullException {
byte[] in = ("" // byte[] in = ("" //
+ "A\n" // + "A\n" //
+ "B\n" // + "B\n" //
@ -81,7 +83,7 @@ public class SimilarityIndexTest extends TestCase {
assertEquals(2, si.size()); assertEquals(2, si.size());
} }
public void testCommonScore_SameFiles() { public void testCommonScore_SameFiles() throws TableFullException {
String text = "" // String text = "" //
+ "A\n" // + "A\n" //
+ "B\n" // + "B\n" //
@ -96,21 +98,22 @@ public class SimilarityIndexTest extends TestCase {
assertEquals(100, dst.score(src, 100)); assertEquals(100, dst.score(src, 100));
} }
public void testCommonScore_EmptyFiles() { public void testCommonScore_EmptyFiles() throws TableFullException {
SimilarityIndex src = hash(""); SimilarityIndex src = hash("");
SimilarityIndex dst = hash(""); SimilarityIndex dst = hash("");
assertEquals(0, src.common(dst)); assertEquals(0, src.common(dst));
assertEquals(0, dst.common(src)); assertEquals(0, dst.common(src));
} }
public void testCommonScore_TotallyDifferentFiles() { public void testCommonScore_TotallyDifferentFiles()
throws TableFullException {
SimilarityIndex src = hash("A\n"); SimilarityIndex src = hash("A\n");
SimilarityIndex dst = hash("D\n"); SimilarityIndex dst = hash("D\n");
assertEquals(0, src.common(dst)); assertEquals(0, src.common(dst));
assertEquals(0, dst.common(src)); assertEquals(0, dst.common(src));
} }
public void testCommonScore_SimiliarBy75() { public void testCommonScore_SimiliarBy75() throws TableFullException {
SimilarityIndex src = hash("A\nB\nC\nD\n"); SimilarityIndex src = hash("A\nB\nC\nD\n");
SimilarityIndex dst = hash("A\nB\nC\nQ\n"); SimilarityIndex dst = hash("A\nB\nC\nQ\n");
assertEquals(6, src.common(dst)); assertEquals(6, src.common(dst));
@ -120,10 +123,11 @@ public class SimilarityIndexTest extends TestCase {
assertEquals(75, dst.score(src, 100)); assertEquals(75, dst.score(src, 100));
} }
private static SimilarityIndex hash(String text) { private static SimilarityIndex hash(String text) throws TableFullException {
SimilarityIndex src = new SimilarityIndex() { SimilarityIndex src = new SimilarityIndex() {
@Override @Override
void hash(byte[] raw, int ptr, final int end) { void hash(byte[] raw, int ptr, final int end)
throws TableFullException {
while (ptr < end) { while (ptr < end) {
int hash = raw[ptr] & 0xff; int hash = raw[ptr] & 0xff;
int start = ptr; int start = ptr;
@ -143,7 +147,7 @@ public class SimilarityIndexTest extends TestCase {
return src; return src;
} }
private static int keyFor(String line) { private static int keyFor(String line) throws TableFullException {
SimilarityIndex si = hash(line); SimilarityIndex si = hash(line);
assertEquals("single line scored", 1, si.size()); assertEquals("single line scored", 1, si.size());
return si.key(0); return si.key(0);

28
org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java

@ -57,6 +57,7 @@ import java.util.List;
import org.eclipse.jgit.JGitText; import org.eclipse.jgit.JGitText;
import org.eclipse.jgit.diff.DiffEntry.ChangeType; import org.eclipse.jgit.diff.DiffEntry.ChangeType;
import org.eclipse.jgit.diff.SimilarityIndex.TableFullException;
import org.eclipse.jgit.lib.AbbreviatedObjectId; import org.eclipse.jgit.lib.AbbreviatedObjectId;
import org.eclipse.jgit.lib.FileMode; import org.eclipse.jgit.lib.FileMode;
import org.eclipse.jgit.lib.NullProgressMonitor; import org.eclipse.jgit.lib.NullProgressMonitor;
@ -356,9 +357,17 @@ public class RenameDetector {
if (pm == null) if (pm == null)
pm = NullProgressMonitor.INSTANCE; pm = NullProgressMonitor.INSTANCE;
if (0 < breakScore)
breakModifies(reader, pm); breakModifies(reader, pm);
if (!added.isEmpty() && !deleted.isEmpty())
findExactRenames(pm); findExactRenames(pm);
if (!added.isEmpty() && !deleted.isEmpty())
findContentRenames(reader, pm); findContentRenames(reader, pm);
if (0 < breakScore && !added.isEmpty() && !deleted.isEmpty())
rejoinModifies(pm); rejoinModifies(pm);
entries.addAll(added); entries.addAll(added);
@ -382,9 +391,6 @@ public class RenameDetector {
private void breakModifies(ContentSource.Pair reader, ProgressMonitor pm) private void breakModifies(ContentSource.Pair reader, ProgressMonitor pm)
throws IOException { throws IOException {
if (breakScore <= 0)
return;
ArrayList<DiffEntry> newEntries = new ArrayList<DiffEntry>(entries.size()); ArrayList<DiffEntry> newEntries = new ArrayList<DiffEntry>(entries.size());
pm.beginTask(JGitText.get().renamesBreakingModifies, entries.size()); pm.beginTask(JGitText.get().renamesBreakingModifies, entries.size());
@ -445,6 +451,7 @@ public class RenameDetector {
private int calculateModifyScore(ContentSource.Pair reader, DiffEntry d) private int calculateModifyScore(ContentSource.Pair reader, DiffEntry d)
throws IOException { throws IOException {
try {
SimilarityIndex src = new SimilarityIndex(); SimilarityIndex src = new SimilarityIndex();
src.hash(reader.open(OLD, d)); src.hash(reader.open(OLD, d));
src.sort(); src.sort();
@ -453,21 +460,27 @@ public class RenameDetector {
dst.hash(reader.open(NEW, d)); dst.hash(reader.open(NEW, d));
dst.sort(); dst.sort();
return src.score(dst, 100); return src.score(dst, 100);
} catch (TableFullException tableFull) {
// If either table overflowed while being constructed, don't allow
// the pair to be broken. Returning 1 higher than breakScore will
// ensure its not similar, but not quite dissimilar enough to break.
//
overRenameLimit = true;
return breakScore + 1;
}
} }
private void findContentRenames(ContentSource.Pair reader, private void findContentRenames(ContentSource.Pair reader,
ProgressMonitor pm) ProgressMonitor pm)
throws IOException { throws IOException {
int cnt = Math.max(added.size(), deleted.size()); int cnt = Math.max(added.size(), deleted.size());
if (cnt == 0)
return;
if (getRenameLimit() == 0 || cnt <= getRenameLimit()) { if (getRenameLimit() == 0 || cnt <= getRenameLimit()) {
SimilarityRenameDetector d; SimilarityRenameDetector d;
d = new SimilarityRenameDetector(reader, deleted, added); d = new SimilarityRenameDetector(reader, deleted, added);
d.setRenameScore(getRenameScore()); d.setRenameScore(getRenameScore());
d.compute(pm); d.compute(pm);
overRenameLimit |= d.isTableOverflow();
deleted = d.getLeftOverSources(); deleted = d.getLeftOverSources();
added = d.getLeftOverDestinations(); added = d.getLeftOverDestinations();
entries.addAll(d.getMatches()); entries.addAll(d.getMatches());
@ -478,9 +491,6 @@ public class RenameDetector {
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
private void findExactRenames(ProgressMonitor pm) { private void findExactRenames(ProgressMonitor pm) {
if (added.isEmpty() || deleted.isEmpty())
return;
pm.beginTask(JGitText.get().renamesFindingExact, // pm.beginTask(JGitText.get().renamesFindingExact, //
added.size() + added.size() + deleted.size() added.size() + added.size() + deleted.size()
+ added.size() * deleted.size()); + added.size() * deleted.size());

81
org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java

@ -65,8 +65,8 @@ import org.eclipse.jgit.lib.ObjectStream;
* file are discovered. * file are discovered.
*/ */
class SimilarityIndex { class SimilarityIndex {
/** The {@link #idHash} table stops growing at {@code 1 << MAX_HASH_BITS}. */ /** A special {@link TableFullException} used in place of OutOfMemoryError. */
private static final int MAX_HASH_BITS = 17; private static final TableFullException TABLE_FULL_OUT_OF_MEMORY = new TableFullException();
/** /**
* Shift to apply before storing a key. * Shift to apply before storing a key.
@ -76,20 +76,26 @@ class SimilarityIndex {
*/ */
private static final int KEY_SHIFT = 32; private static final int KEY_SHIFT = 32;
/** Maximum value of the count field, also mask to extract the count. */
private static final long MAX_COUNT = (1L << KEY_SHIFT) - 1;
/** Total size of the file we hashed into the structure. */ /** Total size of the file we hashed into the structure. */
private long fileSize; private long fileSize;
/** Number of non-zero entries in {@link #idHash}. */ /** Number of non-zero entries in {@link #idHash}. */
private int idSize; private int idSize;
/** {@link #idSize} that triggers {@link #idHash} to double in size. */
private int idGrowAt;
/** /**
* Pairings of content keys and counters. * Pairings of content keys and counters.
* <p> * <p>
* Slots in the table are actually two ints wedged into a single long. The * Slots in the table are actually two ints wedged into a single long. The
* upper {@link #MAX_HASH_BITS} bits stores the content key, and the * upper 32 bits stores the content key, and the remaining lower bits stores
* remaining lower bits stores the number of bytes associated with that key. * the number of bytes associated with that key. Empty slots are denoted by
* Empty slots are denoted by 0, which cannot occur because the count cannot * 0, which cannot occur because the count cannot be 0. Values can only be
* be 0. Values can only be positive, which we enforce during key addition. * positive, which we enforce during key addition.
*/ */
private long[] idHash; private long[] idHash;
@ -99,6 +105,7 @@ class SimilarityIndex {
SimilarityIndex() { SimilarityIndex() {
idHashBits = 8; idHashBits = 8;
idHash = new long[1 << idHashBits]; idHash = new long[1 << idHashBits];
idGrowAt = growAt(idHashBits);
} }
long getFileSize() { long getFileSize() {
@ -109,7 +116,8 @@ class SimilarityIndex {
fileSize = size; fileSize = size;
} }
void hash(ObjectLoader obj) throws MissingObjectException, IOException { void hash(ObjectLoader obj) throws MissingObjectException, IOException,
TableFullException {
if (obj.isLarge()) { if (obj.isLarge()) {
ObjectStream in = obj.openStream(); ObjectStream in = obj.openStream();
try { try {
@ -125,7 +133,7 @@ class SimilarityIndex {
} }
} }
void hash(byte[] raw, int ptr, final int end) { void hash(byte[] raw, int ptr, final int end) throws TableFullException {
while (ptr < end) { while (ptr < end) {
int hash = 5381; int hash = 5381;
int start = ptr; int start = ptr;
@ -141,7 +149,8 @@ class SimilarityIndex {
} }
} }
void hash(InputStream in, long remaining) throws IOException { void hash(InputStream in, long remaining) throws IOException,
TableFullException {
byte[] buf = new byte[4096]; byte[] buf = new byte[4096];
int ptr = 0; int ptr = 0;
int cnt = 0; int cnt = 0;
@ -190,11 +199,11 @@ class SimilarityIndex {
return (int) ((common(dst) * maxScore) / max); return (int) ((common(dst) * maxScore) / max);
} }
int common(SimilarityIndex dst) { long common(SimilarityIndex dst) {
return common(this, dst); return common(this, dst);
} }
private static int common(SimilarityIndex src, SimilarityIndex dst) { private static long common(SimilarityIndex src, SimilarityIndex dst) {
int srcIdx = src.packedIndex(0); int srcIdx = src.packedIndex(0);
int dstIdx = dst.packedIndex(0); int dstIdx = dst.packedIndex(0);
long[] srcHash = src.idHash; long[] srcHash = src.idHash;
@ -202,12 +211,12 @@ class SimilarityIndex {
return common(srcHash, srcIdx, dstHash, dstIdx); return common(srcHash, srcIdx, dstHash, dstIdx);
} }
private static int common(long[] srcHash, int srcIdx, // private static long common(long[] srcHash, int srcIdx, //
long[] dstHash, int dstIdx) { long[] dstHash, int dstIdx) {
if (srcIdx == srcHash.length || dstIdx == dstHash.length) if (srcIdx == srcHash.length || dstIdx == dstHash.length)
return 0; return 0;
int common = 0; long common = 0;
int srcKey = keyOf(srcHash[srcIdx]); int srcKey = keyOf(srcHash[srcIdx]);
int dstKey = keyOf(dstHash[dstIdx]); int dstKey = keyOf(dstHash[dstIdx]);
@ -230,8 +239,8 @@ class SimilarityIndex {
break; break;
srcKey = keyOf(srcHash[srcIdx]); srcKey = keyOf(srcHash[srcIdx]);
} else /* if (srcKey > dstKey) */{ } else /* if (dstKey < srcKey) */{
// Regions of dst which do not appear in dst. // Regions of dst which do not appear in src.
if (++dstIdx == dstHash.length) if (++dstIdx == dstHash.length)
break; break;
dstKey = keyOf(dstHash[dstIdx]); dstKey = keyOf(dstHash[dstIdx]);
@ -268,7 +277,7 @@ class SimilarityIndex {
return (idHash.length - idSize) + idx; return (idHash.length - idSize) + idx;
} }
void add(int key, int cnt) { void add(int key, int cnt) throws TableFullException {
key = (key * 0x9e370001) >>> 1; // Mix bits and ensure not negative. key = (key * 0x9e370001) >>> 1; // Mix bits and ensure not negative.
int j = slot(key); int j = slot(key);
@ -276,18 +285,20 @@ class SimilarityIndex {
long v = idHash[j]; long v = idHash[j];
if (v == 0) { if (v == 0) {
// Empty slot in the table, store here. // Empty slot in the table, store here.
if (shouldGrow()) { if (idGrowAt <= idSize) {
grow(); grow();
j = slot(key); j = slot(key);
continue; continue;
} }
idHash[j] = (((long) key) << KEY_SHIFT) | cnt; idHash[j] = pair(key, cnt);
idSize++; idSize++;
return; return;
} else if (keyOf(v) == key) { } else if (keyOf(v) == key) {
// Same key, increment the counter. // Same key, increment the counter. If it overflows, fail
idHash[j] = v + cnt; // indexing to prevent the key from being impacted.
//
idHash[j] = pair(key, countOf(v) + cnt);
return; return;
} else if (++j >= idHash.length) { } else if (++j >= idHash.length) {
@ -296,6 +307,12 @@ class SimilarityIndex {
} }
} }
private static long pair(int key, long cnt) throws TableFullException {
if (MAX_COUNT < cnt)
throw new TableFullException();
return (((long) key) << KEY_SHIFT) | cnt;
}
private int slot(int key) { private int slot(int key) {
// We use 31 - idHashBits because the upper bit was already forced // We use 31 - idHashBits because the upper bit was already forced
// to be 0 and we want the remaining high bits to be used as the // to be 0 and we want the remaining high bits to be used as the
@ -304,16 +321,26 @@ class SimilarityIndex {
return key >>> (31 - idHashBits); return key >>> (31 - idHashBits);
} }
private boolean shouldGrow() { private static int growAt(int idHashBits) {
return idHashBits < MAX_HASH_BITS && idHash.length <= idSize * 2; return (1 << idHashBits) * (idHashBits - 3) / idHashBits;
} }
private void grow() { private void grow() throws TableFullException {
if (idHashBits == 30)
throw new TableFullException();
long[] oldHash = idHash; long[] oldHash = idHash;
int oldSize = idHash.length; int oldSize = idHash.length;
idHashBits++; idHashBits++;
idGrowAt = growAt(idHashBits);
try {
idHash = new long[1 << idHashBits]; idHash = new long[1 << idHashBits];
} catch (OutOfMemoryError noMemory) {
throw TABLE_FULL_OUT_OF_MEMORY;
}
for (int i = 0; i < oldSize; i++) { for (int i = 0; i < oldSize; i++) {
long v = oldHash[i]; long v = oldHash[i];
if (v != 0) { if (v != 0) {
@ -330,7 +357,11 @@ class SimilarityIndex {
return (int) (v >>> KEY_SHIFT); return (int) (v >>> KEY_SHIFT);
} }
private static int countOf(long v) { private static long countOf(long v) {
return (int) v; return v & MAX_COUNT;
}
static class TableFullException extends Exception {
private static final long serialVersionUID = 1L;
} }
} }

57
org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java

@ -49,10 +49,12 @@ import static org.eclipse.jgit.diff.DiffEntry.Side.OLD;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.BitSet;
import java.util.List; import java.util.List;
import org.eclipse.jgit.JGitText; import org.eclipse.jgit.JGitText;
import org.eclipse.jgit.diff.DiffEntry.ChangeType; import org.eclipse.jgit.diff.DiffEntry.ChangeType;
import org.eclipse.jgit.diff.SimilarityIndex.TableFullException;
import org.eclipse.jgit.lib.FileMode; import org.eclipse.jgit.lib.FileMode;
import org.eclipse.jgit.lib.NullProgressMonitor; import org.eclipse.jgit.lib.NullProgressMonitor;
import org.eclipse.jgit.lib.ProgressMonitor; import org.eclipse.jgit.lib.ProgressMonitor;
@ -110,6 +112,9 @@ class SimilarityRenameDetector {
/** Score a pair must exceed to be considered a rename. */ /** Score a pair must exceed to be considered a rename. */
private int renameScore = 60; private int renameScore = 60;
/** Set if any {@link SimilarityIndex.TableFullException} occurs. */
private boolean tableOverflow;
private List<DiffEntry> out; private List<DiffEntry> out;
SimilarityRenameDetector(ContentSource.Pair reader, List<DiffEntry> srcs, SimilarityRenameDetector(ContentSource.Pair reader, List<DiffEntry> srcs,
@ -182,6 +187,10 @@ class SimilarityRenameDetector {
return dsts; return dsts;
} }
boolean isTableOverflow() {
return tableOverflow;
}
private static List<DiffEntry> compactSrcList(List<DiffEntry> in) { private static List<DiffEntry> compactSrcList(List<DiffEntry> in) {
ArrayList<DiffEntry> r = new ArrayList<DiffEntry>(in.size()); ArrayList<DiffEntry> r = new ArrayList<DiffEntry>(in.size());
for (DiffEntry e : in) { for (DiffEntry e : in) {
@ -208,25 +217,22 @@ class SimilarityRenameDetector {
long[] srcSizes = new long[srcs.size()]; long[] srcSizes = new long[srcs.size()];
long[] dstSizes = new long[dsts.size()]; long[] dstSizes = new long[dsts.size()];
BitSet dstTooLarge = null;
// Init the size arrays to some value that indicates that we haven't
// calculated the size yet. Since sizes cannot be negative, -1 will work
Arrays.fill(srcSizes, -1);
Arrays.fill(dstSizes, -1);
// Consider each pair of files, if the score is above the minimum // Consider each pair of files, if the score is above the minimum
// threshold we need record that scoring in the matrix so we can // threshold we need record that scoring in the matrix so we can
// later find the best matches. // later find the best matches.
// //
int mNext = 0; int mNext = 0;
for (int srcIdx = 0; srcIdx < srcs.size(); srcIdx++) { SRC: for (int srcIdx = 0; srcIdx < srcs.size(); srcIdx++) {
DiffEntry srcEnt = srcs.get(srcIdx); DiffEntry srcEnt = srcs.get(srcIdx);
if (!isFile(srcEnt.oldMode)) { if (!isFile(srcEnt.oldMode)) {
pm.update(dsts.size()); pm.update(dsts.size());
continue; continue;
} }
SimilarityIndex s = hash(OLD, srcEnt); SimilarityIndex s = null;
for (int dstIdx = 0; dstIdx < dsts.size(); dstIdx++) { for (int dstIdx = 0; dstIdx < dsts.size(); dstIdx++) {
DiffEntry dstEnt = dsts.get(dstIdx); DiffEntry dstEnt = dsts.get(dstIdx);
@ -240,15 +246,20 @@ class SimilarityRenameDetector {
continue; continue;
} }
if (dstTooLarge != null && dstTooLarge.get(dstIdx)) {
pm.update(1);
continue;
}
long srcSize = srcSizes[srcIdx]; long srcSize = srcSizes[srcIdx];
if (srcSize < 0) { if (srcSize == 0) {
srcSize = size(OLD, srcEnt); srcSize = size(OLD, srcEnt) + 1;
srcSizes[srcIdx] = srcSize; srcSizes[srcIdx] = srcSize;
} }
long dstSize = dstSizes[dstIdx]; long dstSize = dstSizes[dstIdx];
if (dstSize < 0) { if (dstSize == 0) {
dstSize = size(NEW, dstEnt); dstSize = size(NEW, dstEnt) + 1;
dstSizes[dstIdx] = dstSize; dstSizes[dstIdx] = dstSize;
} }
@ -260,7 +271,27 @@ class SimilarityRenameDetector {
continue; continue;
} }
SimilarityIndex d = hash(NEW, dstEnt); if (s == null) {
try {
s = hash(OLD, srcEnt);
} catch (TableFullException tableFull) {
tableOverflow = true;
continue SRC;
}
}
SimilarityIndex d;
try {
d = hash(NEW, dstEnt);
} catch (TableFullException tableFull) {
if (dstTooLarge == null)
dstTooLarge = new BitSet(dsts.size());
dstTooLarge.set(dstIdx);
tableOverflow = true;
pm.update(1);
continue;
}
int contentScore = s.score(d, 10000); int contentScore = s.score(d, 10000);
// nameScore returns a value between 0 and 100, but we want it // nameScore returns a value between 0 and 100, but we want it
@ -336,7 +367,7 @@ class SimilarityRenameDetector {
} }
private SimilarityIndex hash(DiffEntry.Side side, DiffEntry ent) private SimilarityIndex hash(DiffEntry.Side side, DiffEntry ent)
throws IOException { throws IOException, TableFullException {
SimilarityIndex r = new SimilarityIndex(); SimilarityIndex r = new SimilarityIndex();
r.hash(reader.open(side, ent)); r.hash(reader.open(side, ent));
r.sort(); r.sort();

Loading…
Cancel
Save