Browse Source
HistogramDiff is an alternative implementation of patience diff, performing a search over all matching locations and picking the longest common subsequence that has the lowest occurrence count. If there are unique common elements, its behavior is identical to that of patience diff. Actual performance on real-world source files usually beats MyersDiff, sometimes by a factor of 3, especially for complex comparators that ignore whitespace. Change-Id: I1806cd708087e36d144fb824a0e5ab7cdd579d73 Signed-off-by: Shawn O. Pearce <spearce@spearce.org>stable-0.10
Shawn O. Pearce
14 years ago
5 changed files with 679 additions and 0 deletions
@ -0,0 +1,154 @@ |
|||||||
|
/* |
||||||
|
* Copyright (C) 2010, Google Inc. |
||||||
|
* and other copyright owners as documented in the project's IP log. |
||||||
|
* |
||||||
|
* This program and the accompanying materials are made available |
||||||
|
* under the terms of the Eclipse Distribution License v1.0 which |
||||||
|
* accompanies this distribution, is reproduced below, and is |
||||||
|
* available at http://www.eclipse.org/org/documents/edl-v10.php
|
||||||
|
* |
||||||
|
* All rights reserved. |
||||||
|
* |
||||||
|
* Redistribution and use in source and binary forms, with or |
||||||
|
* without modification, are permitted provided that the following |
||||||
|
* conditions are met: |
||||||
|
* |
||||||
|
* - Redistributions of source code must retain the above copyright |
||||||
|
* notice, this list of conditions and the following disclaimer. |
||||||
|
* |
||||||
|
* - Redistributions in binary form must reproduce the above |
||||||
|
* copyright notice, this list of conditions and the following |
||||||
|
* disclaimer in the documentation and/or other materials provided |
||||||
|
* with the distribution. |
||||||
|
* |
||||||
|
* - Neither the name of the Eclipse Foundation, Inc. nor the |
||||||
|
* names of its contributors may be used to endorse or promote |
||||||
|
* products derived from this software without specific prior |
||||||
|
* written permission. |
||||||
|
* |
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND |
||||||
|
* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, |
||||||
|
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
||||||
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR |
||||||
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
||||||
|
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
||||||
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
||||||
|
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, |
||||||
|
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF |
||||||
|
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
||||||
|
*/ |
||||||
|
|
||||||
|
package org.eclipse.jgit.diff; |
||||||
|
|
||||||
|
import org.eclipse.jgit.diff.DiffPerformanceTest.CharArray; |
||||||
|
import org.eclipse.jgit.diff.DiffPerformanceTest.CharCmp; |
||||||
|
|
||||||
|
public class HistogramDiffTest extends AbstractDiffTestCase { |
||||||
|
@Override |
||||||
|
protected HistogramDiff algorithm() { |
||||||
|
HistogramDiff hd = new HistogramDiff(); |
||||||
|
hd.setFallbackAlgorithm(null); |
||||||
|
return hd; |
||||||
|
} |
||||||
|
|
||||||
|
public void testEdit_NoUniqueMiddleSide_FlipBlocks() { |
||||||
|
EditList r = diff(t("aRRSSz"), t("aSSRRz")); |
||||||
|
assertEquals(2, r.size()); |
||||||
|
assertEquals(new Edit(1, 3, 1, 1), r.get(0)); // DELETE "RR"
|
||||||
|
assertEquals(new Edit(5, 5, 3, 5), r.get(1)); // INSERT "RR
|
||||||
|
} |
||||||
|
|
||||||
|
public void testEdit_NoUniqueMiddleSide_Insert2() { |
||||||
|
EditList r = diff(t("aRSz"), t("aRRSSz")); |
||||||
|
assertEquals(1, r.size()); |
||||||
|
assertEquals(new Edit(2, 2, 2, 4), r.get(0)); |
||||||
|
} |
||||||
|
|
||||||
|
public void testEdit_NoUniqueMiddleSide_FlipAndExpand() { |
||||||
|
EditList r = diff(t("aRSz"), t("aSSRRz")); |
||||||
|
assertEquals(2, r.size()); |
||||||
|
assertEquals(new Edit(1, 2, 1, 1), r.get(0)); // DELETE "R"
|
||||||
|
assertEquals(new Edit(3, 3, 2, 5), r.get(1)); // INSERT "SRR"
|
||||||
|
} |
||||||
|
|
||||||
|
public void testExceedsChainLenght_DuringScanOfA() { |
||||||
|
HistogramDiff hd = new HistogramDiff(); |
||||||
|
hd.setFallbackAlgorithm(null); |
||||||
|
hd.setMaxChainLength(3); |
||||||
|
|
||||||
|
SequenceComparator<RawText> cmp = new SequenceComparator<RawText>() { |
||||||
|
@Override |
||||||
|
public boolean equals(RawText a, int ai, RawText b, int bi) { |
||||||
|
return RawTextComparator.DEFAULT.equals(a, ai, b, bi); |
||||||
|
} |
||||||
|
|
||||||
|
@Override |
||||||
|
public int hash(RawText a, int ai) { |
||||||
|
return 1; |
||||||
|
} |
||||||
|
}; |
||||||
|
|
||||||
|
EditList r = hd.diff(cmp, t("RabS"), t("QabT")); |
||||||
|
assertEquals(1, r.size()); |
||||||
|
assertEquals(new Edit(0, 4, 0, 4), r.get(0)); |
||||||
|
} |
||||||
|
|
||||||
|
public void testExceedsChainLenght_DuringScanOfB() { |
||||||
|
HistogramDiff hd = new HistogramDiff(); |
||||||
|
hd.setFallbackAlgorithm(null); |
||||||
|
hd.setMaxChainLength(1); |
||||||
|
|
||||||
|
EditList r = hd.diff(RawTextComparator.DEFAULT, t("RaaS"), t("QaaT")); |
||||||
|
assertEquals(1, r.size()); |
||||||
|
assertEquals(new Edit(0, 4, 0, 4), r.get(0)); |
||||||
|
} |
||||||
|
|
||||||
|
public void testFallbackToMyersDiff() { |
||||||
|
HistogramDiff hd = new HistogramDiff(); |
||||||
|
hd.setMaxChainLength(64); |
||||||
|
|
||||||
|
String a = DiffTestDataGenerator.generateSequence(40000, 971, 3); |
||||||
|
String b = DiffTestDataGenerator.generateSequence(40000, 1621, 5); |
||||||
|
CharCmp cmp = new CharCmp(); |
||||||
|
CharArray ac = new CharArray(a); |
||||||
|
CharArray bc = new CharArray(b); |
||||||
|
EditList r; |
||||||
|
|
||||||
|
// Without fallback our results are limited due to collisions.
|
||||||
|
hd.setFallbackAlgorithm(null); |
||||||
|
r = hd.diff(cmp, ac, bc); |
||||||
|
assertEquals(70, r.size()); |
||||||
|
|
||||||
|
// Results go up when we add a fallback for the high collision regions.
|
||||||
|
hd.setFallbackAlgorithm(MyersDiff.INSTANCE); |
||||||
|
r = hd.diff(cmp, ac, bc); |
||||||
|
assertEquals(73, r.size()); |
||||||
|
|
||||||
|
// But they still differ from Myers due to the way we did early steps.
|
||||||
|
EditList myersResult = MyersDiff.INSTANCE.diff(cmp, ac, bc); |
||||||
|
assertFalse("Not same as Myers", myersResult.equals(r)); |
||||||
|
} |
||||||
|
|
||||||
|
public void testPerformanceTestDeltaLength() { |
||||||
|
HistogramDiff hd = new HistogramDiff(); |
||||||
|
hd.setFallbackAlgorithm(null); |
||||||
|
|
||||||
|
String a = DiffTestDataGenerator.generateSequence(40000, 971, 3); |
||||||
|
String b = DiffTestDataGenerator.generateSequence(40000, 1621, 5); |
||||||
|
CharCmp cmp = new CharCmp(); |
||||||
|
CharArray ac = new CharArray(a); |
||||||
|
CharArray bc = new CharArray(b); |
||||||
|
EditList r; |
||||||
|
|
||||||
|
hd.setMaxChainLength(64); |
||||||
|
r = hd.diff(cmp, ac, bc); |
||||||
|
assertEquals(70, r.size()); |
||||||
|
|
||||||
|
hd.setMaxChainLength(176); |
||||||
|
r = hd.diff(cmp, ac, bc); |
||||||
|
assertEquals(72, r.size()); |
||||||
|
} |
||||||
|
} |
@ -0,0 +1,204 @@ |
|||||||
|
/* |
||||||
|
* Copyright (C) 2010, Google Inc. |
||||||
|
* and other copyright owners as documented in the project's IP log. |
||||||
|
* |
||||||
|
* This program and the accompanying materials are made available |
||||||
|
* under the terms of the Eclipse Distribution License v1.0 which |
||||||
|
* accompanies this distribution, is reproduced below, and is |
||||||
|
* available at http://www.eclipse.org/org/documents/edl-v10.php
|
||||||
|
* |
||||||
|
* All rights reserved. |
||||||
|
* |
||||||
|
* Redistribution and use in source and binary forms, with or |
||||||
|
* without modification, are permitted provided that the following |
||||||
|
* conditions are met: |
||||||
|
* |
||||||
|
* - Redistributions of source code must retain the above copyright |
||||||
|
* notice, this list of conditions and the following disclaimer. |
||||||
|
* |
||||||
|
* - Redistributions in binary form must reproduce the above |
||||||
|
* copyright notice, this list of conditions and the following |
||||||
|
* disclaimer in the documentation and/or other materials provided |
||||||
|
* with the distribution. |
||||||
|
* |
||||||
|
* - Neither the name of the Eclipse Foundation, Inc. nor the |
||||||
|
* names of its contributors may be used to endorse or promote |
||||||
|
* products derived from this software without specific prior |
||||||
|
* written permission. |
||||||
|
* |
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND |
||||||
|
* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, |
||||||
|
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
||||||
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR |
||||||
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
||||||
|
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
||||||
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
||||||
|
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, |
||||||
|
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF |
||||||
|
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
||||||
|
*/ |
||||||
|
|
||||||
|
package org.eclipse.jgit.diff; |
||||||
|
|
||||||
|
/** |
||||||
|
* An extended form of Bram Cohen's patience diff algorithm. |
||||||
|
* |
||||||
|
* This implementation was derived by using the 4 rules that are outlined in |
||||||
|
* Bram Cohen's <a href="http://bramcohen.livejournal.com/73318.html">blog</a>, |
||||||
|
* and then was further extended to support low-occurrence common elements. |
||||||
|
* |
||||||
|
* The basic idea of the algorithm is to create a histogram of occurrences for |
||||||
|
* each element of sequence A. Each element of sequence B is then considered in |
||||||
|
* turn. If the element also exists in sequence A, and has a lower occurrence |
||||||
|
* count, the positions are considered as a candidate for the longest common |
||||||
|
* subsequence (LCS). After scanning of B is complete the LCS that has the |
||||||
|
* lowest number of occurrences is chosen as a split point. The region is split |
||||||
|
* around the LCS, and the algorithm is recursively applied to the sections |
||||||
|
* before and after the LCS. |
||||||
|
* |
||||||
|
* By always selecting a LCS position with the lowest occurrence count, this |
||||||
|
* algorithm behaves exactly like Bram Cohen's patience diff whenever there is a |
||||||
|
* unique common element available between the two sequences. When no unique |
||||||
|
* elements exist, the lowest occurrence element is chosen instead. This offers |
||||||
|
* more readable diffs than simply falling back on the standard Myers' O(ND) |
||||||
|
* algorithm would produce. |
||||||
|
* |
||||||
|
* To prevent the algorithm from having an O(N^2) running time, an upper limit |
||||||
|
* on the number of unique elements in a histogram bucket is configured by |
||||||
|
* {@link #setMaxChainLength(int)}. If sequence A has more than this many |
||||||
|
* elements that hash into the same hash bucket, the algorithm passes the region |
||||||
|
* to {@link #setFallbackAlgorithm(DiffAlgorithm)}. If no fallback algorithm is |
||||||
|
* configured, the region is emitted as a replace edit. |
||||||
|
* |
||||||
|
* During scanning of sequence B, any element of A that occurs more than |
||||||
|
* {@link #setMaxChainLength(int)} times is never considered for an LCS match |
||||||
|
* position, even if it is common between the two sequences. This limits the |
||||||
|
* number of locations in sequence A that must be considered to find the LCS, |
||||||
|
* and helps maintain a lower running time bound. |
||||||
|
* |
||||||
|
* So long as {@link #setMaxChainLength(int)} is a small constant (such as 64), |
||||||
|
* the algorithm runs in O(N * D) time, where N is the sum of the input lengths |
||||||
|
* and D is the number of edits in the resulting EditList. If the supplied |
||||||
|
* {@link SequenceComparator} has a good hash function, this implementation |
||||||
|
* typically out-performs {@link MyersDiff}, even though its theoretical running |
||||||
|
* time is the same. |
||||||
|
* |
||||||
|
* This implementation has an internal limitation that prevents it from handling |
||||||
|
* sequences with more than 268,435,456 (2^28) elements. |
||||||
|
*/ |
||||||
|
public class HistogramDiff extends DiffAlgorithm { |
||||||
|
/** Algorithm to use when there are too many element occurrences. */ |
||||||
|
private DiffAlgorithm fallback = MyersDiff.INSTANCE; |
||||||
|
|
||||||
|
/** |
||||||
|
* Maximum number of positions to consider for a given element hash. |
||||||
|
* |
||||||
|
* All elements with the same hash are stored into a single chain. The chain |
||||||
|
* size is capped to ensure search is linear time at O(len_A + len_B) rather |
||||||
|
* than quadratic at O(len_A * len_B). |
||||||
|
*/ |
||||||
|
private int maxChainLength = 64; |
||||||
|
|
||||||
|
/** |
||||||
|
* Set the algorithm used when there are too many element occurrences. |
||||||
|
* |
||||||
|
* @param alg |
||||||
|
* the secondary algorithm. If null the region will be denoted as |
||||||
|
* a single REPLACE block. |
||||||
|
*/ |
||||||
|
public void setFallbackAlgorithm(DiffAlgorithm alg) { |
||||||
|
fallback = alg; |
||||||
|
} |
||||||
|
|
||||||
|
/** |
||||||
|
* Maximum number of positions to consider for a given element hash. |
||||||
|
* |
||||||
|
* All elements with the same hash are stored into a single chain. The chain |
||||||
|
* size is capped to ensure search is linear time at O(len_A + len_B) rather |
||||||
|
* than quadratic at O(len_A * len_B). |
||||||
|
* |
||||||
|
* @param maxLen |
||||||
|
* new maximum length. |
||||||
|
*/ |
||||||
|
public void setMaxChainLength(int maxLen) { |
||||||
|
maxChainLength = maxLen; |
||||||
|
} |
||||||
|
|
||||||
|
public <S extends Sequence> EditList diffNonCommon( |
||||||
|
SequenceComparator<? super S> cmp, S a, S b) { |
||||||
|
State<S> s = new State<S>(new HashedSequencePair<S>(cmp, a, b)); |
||||||
|
s.diffReplace(new Edit(0, s.a.size(), 0, s.b.size())); |
||||||
|
return s.edits; |
||||||
|
} |
||||||
|
|
||||||
|
private class State<S extends Sequence> { |
||||||
|
private final HashedSequenceComparator<S> cmp; |
||||||
|
|
||||||
|
private final HashedSequence<S> a; |
||||||
|
|
||||||
|
private final HashedSequence<S> b; |
||||||
|
|
||||||
|
/** Result edits we have determined that must be made to convert a to b. */ |
||||||
|
final EditList edits; |
||||||
|
|
||||||
|
State(HashedSequencePair<S> p) { |
||||||
|
this.cmp = p.getComparator(); |
||||||
|
this.a = p.getA(); |
||||||
|
this.b = p.getB(); |
||||||
|
this.edits = new EditList(); |
||||||
|
} |
||||||
|
|
||||||
|
void diffReplace(Edit r) { |
||||||
|
Edit lcs = new HistogramDiffIndex<S>(maxChainLength, cmp, a, b, r) |
||||||
|
.findLongestCommonSequence(); |
||||||
|
if (lcs != null) { |
||||||
|
// If we were given an edit, we can prove a result here.
|
||||||
|
//
|
||||||
|
if (lcs.isEmpty()) { |
||||||
|
// An empty edit indicates there is nothing in common.
|
||||||
|
// Replace the entire region.
|
||||||
|
//
|
||||||
|
edits.add(r); |
||||||
|
} else { |
||||||
|
diff(r.before(lcs)); |
||||||
|
diff(r.after(lcs)); |
||||||
|
} |
||||||
|
|
||||||
|
} else if (fallback != null) { |
||||||
|
SubsequenceComparator<HashedSequence<S>> cs = subcmp(); |
||||||
|
Subsequence<HashedSequence<S>> as = Subsequence.a(a, r); |
||||||
|
Subsequence<HashedSequence<S>> bs = Subsequence.b(b, r); |
||||||
|
|
||||||
|
EditList res = fallback.diffNonCommon(cs, as, bs); |
||||||
|
edits.addAll(Subsequence.toBase(res, as, bs)); |
||||||
|
|
||||||
|
} else { |
||||||
|
edits.add(r); |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
private void diff(Edit r) { |
||||||
|
switch (r.getType()) { |
||||||
|
case INSERT: |
||||||
|
case DELETE: |
||||||
|
edits.add(r); |
||||||
|
break; |
||||||
|
|
||||||
|
case REPLACE: |
||||||
|
diffReplace(r); |
||||||
|
break; |
||||||
|
|
||||||
|
case EMPTY: |
||||||
|
default: |
||||||
|
throw new IllegalStateException(); |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
private SubsequenceComparator<HashedSequence<S>> subcmp() { |
||||||
|
return new SubsequenceComparator<HashedSequence<S>>(cmp); |
||||||
|
} |
||||||
|
} |
||||||
|
} |
@ -0,0 +1,319 @@ |
|||||||
|
/* |
||||||
|
* Copyright (C) 2010, Google Inc. |
||||||
|
* and other copyright owners as documented in the project's IP log. |
||||||
|
* |
||||||
|
* This program and the accompanying materials are made available |
||||||
|
* under the terms of the Eclipse Distribution License v1.0 which |
||||||
|
* accompanies this distribution, is reproduced below, and is |
||||||
|
* available at http://www.eclipse.org/org/documents/edl-v10.php
|
||||||
|
* |
||||||
|
* All rights reserved. |
||||||
|
* |
||||||
|
* Redistribution and use in source and binary forms, with or |
||||||
|
* without modification, are permitted provided that the following |
||||||
|
* conditions are met: |
||||||
|
* |
||||||
|
* - Redistributions of source code must retain the above copyright |
||||||
|
* notice, this list of conditions and the following disclaimer. |
||||||
|
* |
||||||
|
* - Redistributions in binary form must reproduce the above |
||||||
|
* copyright notice, this list of conditions and the following |
||||||
|
* disclaimer in the documentation and/or other materials provided |
||||||
|
* with the distribution. |
||||||
|
* |
||||||
|
* - Neither the name of the Eclipse Foundation, Inc. nor the |
||||||
|
* names of its contributors may be used to endorse or promote |
||||||
|
* products derived from this software without specific prior |
||||||
|
* written permission. |
||||||
|
* |
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND |
||||||
|
* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, |
||||||
|
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
||||||
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR |
||||||
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
||||||
|
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
||||||
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
||||||
|
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, |
||||||
|
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF |
||||||
|
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
||||||
|
*/ |
||||||
|
|
||||||
|
package org.eclipse.jgit.diff; |
||||||
|
|
||||||
|
import org.eclipse.jgit.JGitText; |
||||||
|
|
||||||
|
/** |
||||||
|
* Support {@link HistogramDiff} by computing occurrence counts of elements. |
||||||
|
* |
||||||
|
* Each element in the range being considered is put into a hash table, tracking |
||||||
|
* the number of times that distinct element appears in the sequence. Once all |
||||||
|
* elements have been inserted from sequence A, each element of sequence B is |
||||||
|
* probed in the hash table and the longest common subsequence with the lowest |
||||||
|
* occurrence count in A is used as the result. |
||||||
|
* |
||||||
|
* @param <S> |
||||||
|
* type of the base sequence. |
||||||
|
*/ |
||||||
|
final class HistogramDiffIndex<S extends Sequence> { |
||||||
|
private static final int REC_NEXT_SHIFT = 28 + 8; |
||||||
|
|
||||||
|
private static final int REC_PTR_SHIFT = 8; |
||||||
|
|
||||||
|
private static final int REC_PTR_MASK = (1 << 28) - 1; |
||||||
|
|
||||||
|
private static final int REC_CNT_MASK = (1 << 8) - 1; |
||||||
|
|
||||||
|
private static final int MAX_PTR = REC_PTR_MASK; |
||||||
|
|
||||||
|
private static final int MAX_CNT = (1 << 8) - 1; |
||||||
|
|
||||||
|
private final int maxChainLength; |
||||||
|
|
||||||
|
private final HashedSequenceComparator<S> cmp; |
||||||
|
|
||||||
|
private final HashedSequence<S> a; |
||||||
|
|
||||||
|
private final HashedSequence<S> b; |
||||||
|
|
||||||
|
private final Edit region; |
||||||
|
|
||||||
|
/** Keyed by {@link #hash(HashedSequence, int)} for {@link #recs} index. */ |
||||||
|
private final int[] table; |
||||||
|
|
||||||
|
/** Number of low bits to discard from a key to index {@link #table}. */ |
||||||
|
private final int keyShift; |
||||||
|
|
||||||
|
/** |
||||||
|
* Describes a unique element in sequence A. |
||||||
|
* |
||||||
|
* The records in this table are actually 3-tuples of: |
||||||
|
* <ul> |
||||||
|
* <li>index of next record in this table that has same hash code</li> |
||||||
|
* <li>index of first element in this occurrence chain</li> |
||||||
|
* <li>occurrence count for this element (length of locs list)</li> |
||||||
|
* </ul> |
||||||
|
* |
||||||
|
* The occurrence count is capped at {@link #MAX_CNT}, as the field is only |
||||||
|
* a few bits wide. Elements that occur more frequently will have their |
||||||
|
* count capped. |
||||||
|
*/ |
||||||
|
private long[] recs; |
||||||
|
|
||||||
|
/** Number of elements in {@link #recs}; also is the unique element count. */ |
||||||
|
private int recCnt; |
||||||
|
|
||||||
|
/** |
||||||
|
* For {@code ptr}, {@code next[ptr - nextShift]} has subsequent index. |
||||||
|
* |
||||||
|
* For the sequence element {@code ptr}, the value stored at location |
||||||
|
* {@code next[ptr - nextShift]} is the next occurrence of the exact same |
||||||
|
* element in the sequence. |
||||||
|
* |
||||||
|
* Chains always run from the lowest index to the largest index. Therefore |
||||||
|
* the array will store {@code next[1] = 2}, but never {@code next[2] = 1}. |
||||||
|
* This allows a chain to terminate with {@code 0}, as {@code 0} would never |
||||||
|
* be a valid next element. |
||||||
|
* |
||||||
|
* The array is sized to be {@code region.getLenghtA()} and element indexes |
||||||
|
* are converted to array indexes by subtracting {@link #nextShift}, which |
||||||
|
* is just a cached version of {@code region.beginA}. |
||||||
|
*/ |
||||||
|
private int[] next; |
||||||
|
|
||||||
|
/** Value to subtract from element indexes to key {@link #next} array. */ |
||||||
|
private int nextShift; |
||||||
|
|
||||||
|
private Edit lcs; |
||||||
|
|
||||||
|
private int cnt; |
||||||
|
|
||||||
|
private boolean hasCommon; |
||||||
|
|
||||||
|
HistogramDiffIndex(int maxChainLength, HashedSequenceComparator<S> cmp, |
||||||
|
HashedSequence<S> a, HashedSequence<S> b, Edit r) { |
||||||
|
this.maxChainLength = maxChainLength; |
||||||
|
this.cmp = cmp; |
||||||
|
this.a = a; |
||||||
|
this.b = b; |
||||||
|
this.region = r; |
||||||
|
|
||||||
|
if (region.endA >= MAX_PTR) |
||||||
|
throw new IllegalArgumentException( |
||||||
|
JGitText.get().sequenceTooLargeForDiffAlgorithm); |
||||||
|
|
||||||
|
final int sz = r.getLengthA(); |
||||||
|
final int tableBits = tableBits(sz); |
||||||
|
table = new int[1 << tableBits]; |
||||||
|
keyShift = 32 - tableBits; |
||||||
|
nextShift = r.beginA; |
||||||
|
|
||||||
|
recs = new long[Math.max(4, sz >>> 3)]; |
||||||
|
next = new int[sz]; |
||||||
|
} |
||||||
|
|
||||||
|
Edit findLongestCommonSequence() { |
||||||
|
if (!scanA()) |
||||||
|
return null; |
||||||
|
|
||||||
|
lcs = new Edit(0, 0); |
||||||
|
cnt = maxChainLength + 1; |
||||||
|
|
||||||
|
for (int bPtr = region.beginB; bPtr < region.endB;) |
||||||
|
bPtr = tryLongestCommonSequence(bPtr); |
||||||
|
|
||||||
|
return hasCommon && maxChainLength < cnt ? null : lcs; |
||||||
|
} |
||||||
|
|
||||||
|
private boolean scanA() { |
||||||
|
// Scan the elements backwards, inserting them into the hash table
|
||||||
|
// as we go. Going in reverse places the earliest occurrence of any
|
||||||
|
// element at the start of the chain, so we consider earlier matches
|
||||||
|
// before later matches.
|
||||||
|
//
|
||||||
|
SCAN: for (int ptr = region.endA - 1; region.beginA <= ptr; ptr--) { |
||||||
|
final int tIdx = hash(a, ptr); |
||||||
|
|
||||||
|
int chainLen = 0; |
||||||
|
for (int rIdx = table[tIdx]; rIdx != 0;) { |
||||||
|
final long rec = recs[rIdx]; |
||||||
|
if (cmp.equals(a, recPtr(rec), a, ptr)) { |
||||||
|
// ptr is identical to another element. Insert it onto
|
||||||
|
// the front of the existing element chain.
|
||||||
|
//
|
||||||
|
int newCnt = recCnt(rec) + 1; |
||||||
|
if (MAX_CNT < newCnt) |
||||||
|
newCnt = MAX_CNT; |
||||||
|
recs[rIdx] = recCreate(recNext(rec), ptr, newCnt); |
||||||
|
next[ptr - nextShift] = recPtr(rec); |
||||||
|
continue SCAN; |
||||||
|
} |
||||||
|
|
||||||
|
rIdx = recNext(rec); |
||||||
|
chainLen++; |
||||||
|
} |
||||||
|
|
||||||
|
if (chainLen == maxChainLength) |
||||||
|
return false; |
||||||
|
|
||||||
|
// This is the first time we have ever seen this particular
|
||||||
|
// element in the sequence. Construct a new chain for it.
|
||||||
|
//
|
||||||
|
final int rIdx = ++recCnt; |
||||||
|
if (rIdx == recs.length) { |
||||||
|
int sz = Math.min(recs.length << 1, 1 + region.getLengthA()); |
||||||
|
long[] n = new long[sz]; |
||||||
|
System.arraycopy(recs, 0, n, 0, recs.length); |
||||||
|
recs = n; |
||||||
|
} |
||||||
|
|
||||||
|
recs[rIdx] = recCreate(table[tIdx], ptr, 1); |
||||||
|
table[tIdx] = rIdx; |
||||||
|
} |
||||||
|
return true; |
||||||
|
} |
||||||
|
|
||||||
|
private int tryLongestCommonSequence(final int bPtr) { |
||||||
|
int bNext = bPtr + 1; |
||||||
|
int rIdx = table[hash(b, bPtr)]; |
||||||
|
for (long rec; rIdx != 0; rIdx = recNext(rec)) { |
||||||
|
rec = recs[rIdx]; |
||||||
|
|
||||||
|
// If there are more occurrences in A, don't use this chain.
|
||||||
|
if (recCnt(rec) > cnt) { |
||||||
|
if (!hasCommon) |
||||||
|
hasCommon = cmp.equals(a, recPtr(rec), b, bPtr); |
||||||
|
continue; |
||||||
|
} |
||||||
|
|
||||||
|
int as = recPtr(rec); |
||||||
|
if (!cmp.equals(a, as, b, bPtr)) |
||||||
|
continue; |
||||||
|
|
||||||
|
hasCommon = true; |
||||||
|
TRY_LOCATIONS: for (;;) { |
||||||
|
int np = next[as - nextShift]; |
||||||
|
int bs = bPtr; |
||||||
|
int ae = as + 1; |
||||||
|
int be = bs + 1; |
||||||
|
|
||||||
|
while (region.beginA < as && region.beginB < bs |
||||||
|
&& cmp.equals(a, as - 1, b, bs - 1)) { |
||||||
|
as--; |
||||||
|
bs--; |
||||||
|
} |
||||||
|
while (ae < region.endA && be < region.endB |
||||||
|
&& cmp.equals(a, ae, b, be)) { |
||||||
|
ae++; |
||||||
|
be++; |
||||||
|
} |
||||||
|
|
||||||
|
if (bNext < be) |
||||||
|
bNext = be; |
||||||
|
if (lcs.getLengthA() < ae - as || recCnt(rec) < cnt) { |
||||||
|
// If this region is the longest, or there are less
|
||||||
|
// occurrences of it in A, its now our LCS.
|
||||||
|
//
|
||||||
|
lcs.beginA = as; |
||||||
|
lcs.beginB = bs; |
||||||
|
lcs.endA = ae; |
||||||
|
lcs.endB = be; |
||||||
|
cnt = recCnt(rec); |
||||||
|
} |
||||||
|
|
||||||
|
// Because we added elements in reverse order index 0
|
||||||
|
// cannot possibly be the next position. Its the first
|
||||||
|
// element of the sequence and thus would have been the
|
||||||
|
// value of as at the start of the TRY_LOCATIONS loop.
|
||||||
|
//
|
||||||
|
if (np == 0) |
||||||
|
break TRY_LOCATIONS; |
||||||
|
|
||||||
|
while (np < ae) { |
||||||
|
// The next location to consider was actually within
|
||||||
|
// the LCS we examined above. Don't reconsider it.
|
||||||
|
//
|
||||||
|
np = next[np - nextShift]; |
||||||
|
if (np == 0) |
||||||
|
break TRY_LOCATIONS; |
||||||
|
} |
||||||
|
|
||||||
|
as = np; |
||||||
|
} |
||||||
|
} |
||||||
|
return bNext; |
||||||
|
} |
||||||
|
|
||||||
|
private int hash(HashedSequence<S> s, int idx) { |
||||||
|
return (cmp.hash(s, idx) * 0x9e370001 /* mix bits */) >>> keyShift; |
||||||
|
} |
||||||
|
|
||||||
|
private static long recCreate(int next, int ptr, int cnt) { |
||||||
|
return ((long) next << REC_NEXT_SHIFT) //
|
||||||
|
| ((long) ptr << REC_PTR_SHIFT) //
|
||||||
|
| cnt; |
||||||
|
} |
||||||
|
|
||||||
|
private static int recNext(long rec) { |
||||||
|
return (int) (rec >>> REC_NEXT_SHIFT); |
||||||
|
} |
||||||
|
|
||||||
|
private static int recPtr(long rec) { |
||||||
|
return ((int) (rec >>> REC_PTR_SHIFT)) & REC_PTR_MASK; |
||||||
|
} |
||||||
|
|
||||||
|
private static int recCnt(long rec) { |
||||||
|
return ((int) rec) & REC_CNT_MASK; |
||||||
|
} |
||||||
|
|
||||||
|
private static int tableBits(final int sz) { |
||||||
|
int bits = 31 - Integer.numberOfLeadingZeros(sz); |
||||||
|
if (bits == 0) |
||||||
|
bits = 1; |
||||||
|
if (1 << bits < sz) |
||||||
|
bits++; |
||||||
|
return bits; |
||||||
|
} |
||||||
|
} |
Loading…
Reference in new issue