Browse Source

Fix broken HistogramDiff

HistogramDiff failed on cases where the initial element for the LCS
was actually very common (e.g. has 20 occurrences), and the first
element of the inserted region after the LCS was also common but
had fewer occurrences (e.g. 10), while the LCS also contained a
unique element (1 occurrence).

This happens often in Java source code.  The initial element for
the LCS might be the empty line ("\n"), and the inserted but common
element might be "\t/**\n", with the LCS being a large span of
lines that contains unique method declarations.  Even though "/**"
occurs less often than the empty line, it's not a better LCS if the
LCS we already have contains a unique element.

The logic in HistogramDiff would normally have worked fine, except I
tried to optimize scanning of B by making tryLongestCommonSequence
return the end of the region when there are matching elements
found in A.  This allows us to skip over the current LCS region,
as it has already been examined, but caused us to fail to identify
an element that had a lower occurrence count within the region.

The solution used here is to trade space-for-time by keeping a
table of A positions to their occurrence counts.  This allows the
matching logic to always use the smallest count for this region,
even if the smallest count doesn't appear on the initial element.

The new unit test testEdit_LcsContainsUnique() verifies this new
behavior works as expected.

Bug: 328895
Change-Id: Id170783b891f645b6a8cf6f133c6682b8de40aaf
Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
stable-0.10
Shawn O. Pearce 14 years ago
parent
commit
b88b693a3d
  1. 8
      org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/HistogramDiffTest.java
  2. 42
      org.eclipse.jgit/src/org/eclipse/jgit/diff/HistogramDiffIndex.java

8
org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/HistogramDiffTest.java

@ -74,6 +74,14 @@ public class HistogramDiffTest extends AbstractDiffTestCase {
assertEquals(new Edit(3, 3, 2, 5), r.get(1)); // INSERT "SRR"
}
public void testEdit_LcsContainsUnique() {
EditList r = diff(t("nqnjrnjsnm"), t("AnqnjrnjsnjTnmZ"));
assertEquals(new Edit(0, 0, 0, 1), r.get(0)); // INSERT "A";
assertEquals(new Edit(9, 9, 10, 13), r.get(1)); // INSERT "jTn";
assertEquals(new Edit(10, 10, 14, 15), r.get(2)); // INSERT "Z";
assertEquals(3, r.size());
}
public void testExceedsChainLength_DuringScanOfA() {
HistogramDiff hd = new HistogramDiff();
hd.setFallbackAlgorithm(null);

42
org.eclipse.jgit/src/org/eclipse/jgit/diff/HistogramDiffIndex.java

@ -106,10 +106,10 @@ final class HistogramDiffIndex<S extends Sequence> {
private int recCnt;
/**
* For {@code ptr}, {@code next[ptr - nextShift]} has subsequent index.
* For {@code ptr}, {@code next[ptr - ptrShift]} has subsequent index.
*
* For the sequence element {@code ptr}, the value stored at location
* {@code next[ptr - nextShift]} is the next occurrence of the exact same
* {@code next[ptr - ptrShift]} is the next occurrence of the exact same
* element in the sequence.
*
* Chains always run from the lowest index to the largest index. Therefore
@ -118,13 +118,25 @@ final class HistogramDiffIndex<S extends Sequence> {
* be a valid next element.
*
* The array is sized to be {@code region.getLengthA()} and element indexes
* are converted to array indexes by subtracting {@link #nextShift}, which
* is just a cached version of {@code region.beginA}.
* are converted to array indexes by subtracting {@link #ptrShift}, which is
* just a cached version of {@code region.beginA}.
*/
private int[] next;
/**
* For element {@code ptr} in A, index of the record in {@link #recs} array.
*
* The record at {@code recs[recIdx[ptr - ptrShift]]} is the record
* describing all occurrences of the element appearing in sequence A at
* position {@code ptr}. The record is needed to get the occurrence count of
* the element, or to locate all other occurrences of that element within
* sequence A. This index provides constant-time access to the record, and
* avoids needing to scan the hash chain.
*/
private int[] recIdx;
/** Value to subtract from element indexes to key {@link #next} array. */
private int nextShift;
private int ptrShift;
private Edit lcs;
@ -148,10 +160,11 @@ final class HistogramDiffIndex<S extends Sequence> {
final int tableBits = tableBits(sz);
table = new int[1 << tableBits];
keyShift = 32 - tableBits;
nextShift = r.beginA;
ptrShift = r.beginA;
recs = new long[Math.max(4, sz >>> 3)];
next = new int[sz];
recIdx = new int[sz];
}
Edit findLongestCommonSequence() {
@ -187,7 +200,8 @@ final class HistogramDiffIndex<S extends Sequence> {
if (MAX_CNT < newCnt)
newCnt = MAX_CNT;
recs[rIdx] = recCreate(recNext(rec), ptr, newCnt);
next[ptr - nextShift] = recPtr(rec);
next[ptr - ptrShift] = recPtr(rec);
recIdx[ptr - ptrShift] = rIdx;
continue SCAN;
}
@ -210,6 +224,7 @@ final class HistogramDiffIndex<S extends Sequence> {
}
recs[rIdx] = recCreate(table[tIdx], ptr, 1);
recIdx[ptr - ptrShift] = rIdx;
table[tIdx] = rIdx;
}
return true;
@ -234,25 +249,30 @@ final class HistogramDiffIndex<S extends Sequence> {
hasCommon = true;
TRY_LOCATIONS: for (;;) {
int np = next[as - nextShift];
int np = next[as - ptrShift];
int bs = bPtr;
int ae = as + 1;
int be = bs + 1;
int rc = recCnt(rec);
while (region.beginA < as && region.beginB < bs
&& cmp.equals(a, as - 1, b, bs - 1)) {
as--;
bs--;
if (1 < rc)
rc = Math.min(rc, recCnt(recs[recIdx[as - ptrShift]]));
}
while (ae < region.endA && be < region.endB
&& cmp.equals(a, ae, b, be)) {
if (1 < rc)
rc = Math.min(rc, recCnt(recs[recIdx[ae - ptrShift]]));
ae++;
be++;
}
if (bNext < be)
bNext = be;
if (lcs.getLengthA() < ae - as || recCnt(rec) < cnt) {
if (lcs.getLengthA() < ae - as || rc < cnt) {
// If this region is the longest, or there are less
// occurrences of it in A, its now our LCS.
//
@ -260,7 +280,7 @@ final class HistogramDiffIndex<S extends Sequence> {
lcs.beginB = bs;
lcs.endA = ae;
lcs.endB = be;
cnt = recCnt(rec);
cnt = rc;
}
// Because we added elements in reverse order index 0
@ -275,7 +295,7 @@ final class HistogramDiffIndex<S extends Sequence> {
// The next location to consider was actually within
// the LCS we examined above. Don't reconsider it.
//
np = next[np - nextShift];
np = next[np - ptrShift];
if (np == 0)
break TRY_LOCATIONS;
}

Loading…
Cancel
Save