Browse Source

Correct distribution of allowed delta size along chain length

Nicolas Pitre discovered a very simple rule for selecting between two
different delta base candidates:

  - if based on a whole object, must be <= 50% of target
  - if at end of a chain, must be <= 1/depth * 50% of target

The rule penalizes deltas near the end of the chain, requiring them to
be very small in order to be kept by the packer.  This favors deltas
that are based on a shorter chain, where the read-time unpack cost is
much lower.  Fewer bytes need to be consulted from the source pack
file, and less copying is required in memory to rebuild the object.

Junio Hamano explained Nico's rule to me today, and this commit fixes
DeltaWindow to implement it as described.

When no base has been chosen yet, the computation is simply the rule
stated above.  However, once a base with a depth of 9 has been chosen
(e.g. when pack.depth is limited to 10), a non-delta source may
create a new delta that is up to 10x larger than the delta already
selected against that base.  This reflects the intent of Nico's size
distribution rule no matter what order objects are visited in the
DeltaWindow.

With this patch and my other patches applied, repacking JGit with:

  [pack]
    reuseObjects = false
    reuseDeltas = false
    depth = 50
    window = 250
    threads = 4
    compression = 9

  CGit (all) 5,711,735 bytes; real 0m13.942s user 0m47.722s [1]
  JGit heads 5,718,295 bytes; real 0m11.880s user 0m38.177s [2]
       rest      9,809 bytes

The improved JGit result for the head pack is only 6.4 KiB larger than
CGit's resulting pack.  This patch allowed JGit to find an additional
39.7 KiB worth of space savings.  JGit now also often runs 2s faster
than CGit, despite also creating bitmaps and pruning objects after the
head pack creation.

[1] time git repack -a -d -F --window=250 --depth=50
[2] time java -Xmx128m -jar jgit debug-gc

Change-Id: I5caec31359bf7248cabdd2a3254c84d4ee3cd96b
stable-3.0
Shawn Pearce 12 years ago committed by Shawn Pearce
parent
commit
8a7c2f97d0
  1. 158
      org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/pack/DeltaWindow.java

158
org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/pack/DeltaWindow.java

@ -80,13 +80,6 @@ final class DeltaWindow {
// The object we are currently considering needs a lot of state:
/**
* Maximum delta chain depth the current object can have.
* <p>
* This can be smaller than {@link #maxDepth}.
*/
private int resMaxDepth;
/** Window entry of the object we are currently considering. */
private DeltaWindowEntry res;
@ -206,31 +199,21 @@ final class DeltaWindow {
}
private void searchInWindow() throws IOException {
// TODO(spearce) If the object is used as a base for other
// objects in this pack we should limit the depth we create
// for ourselves to be the remainder of our longest dependent
// chain and the configured maximum depth. This can happen
// when the dependents are being reused out a pack, but we
// cannot be because we are near the edge of a thin pack.
//
resMaxDepth = maxDepth;
// Loop through the window backwards, considering every entry.
// This lets us look at the bigger objects that came before.
//
for (DeltaWindowEntry src = res.prev; src != res; src = src.prev) {
if (src.empty())
break;
if (delta(src) /* == NEXT_SRC */)
continue;
bestBase = null;
bestDelta = null;
return;
}
// We couldn't find a suitable delta for this object, but it may
// still be able to act as a base for another one.
//
if (bestDelta == null) {
if (bestBase == null) {
keepInWindow();
return;
}
@ -245,77 +228,61 @@ final class DeltaWindow {
// has on hand, so we don't want to send it. We have to store
// an ObjectId and *NOT* an ObjectToPack for the base to ensure
// the base isn't included in the outgoing pack file.
//
resObj.setDeltaBase(srcObj.copy());
} else {
// The base is part of the pack we are sending, so it should be
// a direct pointer to the base.
//
resObj.setDeltaBase(srcObj);
}
resObj.setDeltaDepth(srcObj.getDeltaDepth() + 1);
int depth = srcObj.getDeltaDepth() + 1;
resObj.setDeltaDepth(depth);
resObj.clearReuseAsIs();
cacheDelta(srcObj, resObj);
// Discard the cached best result, otherwise it leaks.
//
bestDelta = null;
// If this should be the end of a chain, don't keep
// it in the window. Just move on to the next object.
//
if (resObj.getDeltaDepth() == maxDepth)
return;
if (depth < maxDepth) {
// Reorder the window so that the best base will be tested
// first for the next object, and the current object will
// be the second candidate to consider before any others.
res.makeNext(bestBase);
res = bestBase.next;
}
shuffleBaseUpInPriority();
keepInWindow();
bestBase = null;
bestDelta = null;
}
private boolean delta(final DeltaWindowEntry src)
throws IOException {
// Objects must use only the same type as their delta base.
// If we are looking at something where that isn't true we
// have exhausted everything of the correct type and should
// move on to the next thing to examine.
//
if (src.type() != res.type()) {
keepInWindow();
return NEXT_RES;
}
// Only consider a source with a short enough delta chain.
if (src.depth() > resMaxDepth)
// If the sizes are radically different, this is a bad pairing.
if (res.size() < src.size() >>> 4)
return NEXT_SRC;
// Estimate a reasonable upper limit on delta size.
int msz = deltaSizeLimit(res, resMaxDepth, src);
if (msz <= 8)
int msz = deltaSizeLimit(src);
if (msz <= 8) // Nearly impossible to fit useful delta.
return NEXT_SRC;
// If we have to insert a lot to make this work, find another.
if (res.size() - src.size() > msz)
return NEXT_SRC;
// If the sizes are radically different, this is a bad pairing.
if (res.size() < src.size() / 16)
return NEXT_SRC;
DeltaIndex srcIndex;
try {
srcIndex = index(src);
} catch (LargeObjectException tooBig) {
// If the source is too big to work on, skip it.
dropFromWindow(src);
return NEXT_SRC;
} catch (IOException notAvailable) {
if (src.object.isEdge()) {
// This is an edge that is suddenly not available.
dropFromWindow(src);
if (src.object.isEdge()) // Missing edges are OK.
return NEXT_SRC;
} else {
throw notAvailable;
}
}
byte[] resBuf;
try {
@ -325,26 +292,41 @@ final class DeltaWindow {
return NEXT_RES;
}
// If we already have a delta for the current object, abort
// encoding early if this new pairing produces a larger delta.
if (bestDelta != null && bestDelta.length() < msz)
msz = (int) bestDelta.length();
TemporaryBuffer.Heap delta = new TemporaryBuffer.Heap(msz);
try {
if (!srcIndex.encode(delta, resBuf, msz))
return NEXT_SRC;
TemporaryBuffer.Heap delta = new TemporaryBuffer.Heap(msz);
if (srcIndex.encode(delta, resBuf, msz)) {
bestBase = src;
bestDelta = delta;
}
} catch (IOException deltaTooBig) {
// This only happens when the heap overflows our limit.
// Unlikely, encoder should see limit and return false.
}
return NEXT_SRC;
}
if (isBetterDelta(src, delta)) {
bestDelta = delta;
bestBase = src;
private int deltaSizeLimit(DeltaWindowEntry src) {
if (bestBase == null) {
// Any delta should be no more than 50% of the original size
// (for text files deflate of whole form should shrink 50%).
int n = res.size() >>> 1;
// Evenly distribute delta size limits over allowed depth.
// If src is non-delta (depth = 0), delta <= 50% of original.
// If src is almost at limit (9/10), delta <= 10% of original.
return n * (maxDepth - src.depth()) / maxDepth;
}
return NEXT_SRC;
// With a delta base chosen any new delta must be "better".
// Retain the distribution described above.
int d = bestBase.depth();
int n = (int) bestDelta.length();
// If src is whole (depth=0) and base is near limit (depth=9/10)
// any delta using src can be 10x larger and still be better.
//
// If src is near limit (depth=9/10) and base is whole (depth=0)
// a new delta dependent on src must be 1/10th the size.
return n * (maxDepth - src.depth()) / (maxDepth - d);
}
private void cacheDelta(ObjectToPack srcObj, ObjectToPack resObj) {
@ -375,56 +357,10 @@ final class DeltaWindow {
return insz + ((insz + 7) >> 3) + ((insz + 63) >> 6) + 11;
}
/**
 * Promotes the chosen delta base so it becomes the current window entry.
 * <p>
 * Calls {@code res.makeNext(bestBase)} and then points {@code res} at
 * {@code bestBase}. Takes no arguments and returns nothing; its only
 * direct effect here is the reassignment of {@code res}, plus whatever
 * relinking {@code makeNext} performs.
 */
private void shuffleBaseUpInPriority() {
// Reorder the window so that the best match we just used
// is the current one, and the now current object is before.
// NOTE(review): makeNext is defined outside this view; presumably it
// relinks bestBase directly after res -- confirm against its source.
res.makeNext(bestBase);
// Must happen after makeNext, which is invoked on the old res.
res = bestBase;
}
/**
 * Advances the current window entry to its successor.
 * <p>
 * Nothing is unlinked or cleared here; only the {@code res} reference
 * moves on to {@code res.next}.
 */
private void keepInWindow() {
// Step to the following entry; the previous one is not modified.
res = res.next;
}
/**
 * Placeholder for removing an unusable source entry from the window.
 * <p>
 * The body is empty, so calling this method currently changes nothing.
 *
 * @param src
 *            the entry the caller wants dropped; it is not read, which is
 *            why the unused-parameter warning is suppressed.
 */
private void dropFromWindow(@SuppressWarnings("unused") DeltaWindowEntry src) {
// Not implemented: we should drop the current source entry from the
// window, as it is somehow invalid for us to work with, but no
// action is taken here yet.
}
/**
 * Decides whether a newly encoded delta should replace the best delta
 * found so far for the current object.
 * <p>
 * Any candidate wins when no delta has been selected yet. Otherwise the
 * shorter delta wins; when both have the same length, the candidate wins
 * only if its source has a smaller depth than the current best base.
 *
 * @param src
 *            window entry the candidate delta was encoded against.
 * @param resDelta
 *            the candidate delta.
 * @return {@code true} if the candidate should become the new best delta.
 */
private boolean isBetterDelta(DeltaWindowEntry src,
        TemporaryBuffer.Heap resDelta) {
    if (bestDelta == null)
        return true;

    final long candidateLen = resDelta.length();
    final long currentLen = bestDelta.length();
    if (candidateLen != currentLen)
        return candidateLen < currentLen;

    // Same size: prefer the source with the shorter delta chain,
    // since it would be faster to access during reads.
    return src.depth() < bestBase.depth();
}
/**
 * Estimates the largest delta size worth accepting for {@code res} when
 * it is encoded against {@code src}.
 * <p>
 * The starting budget is half of the result's size minus 20 bytes of
 * allowance for delta header overhead in the pack file. That budget is
 * then scaled by the fraction of the chain depth still available beyond
 * {@code src}, so sources deeper in a chain must yield smaller deltas.
 * The result may be zero or negative for small objects or exhausted
 * depth.
 *
 * @param res
 *            entry for the object being deltified; only its size is read.
 * @param maxDepth
 *            maximum chain depth used as the divisor; it must not be
 *            zero, or the division throws {@link ArithmeticException}.
 * @param src
 *            candidate base entry; only its depth is read.
 * @return the scaled size limit in bytes.
 */
private static int deltaSizeLimit(DeltaWindowEntry res, int maxDepth,
        DeltaWindowEntry src) {
    // Ideally the delta is at most 50% of the original size, but we
    // also want to account for delta header overhead in the pack file
    // (to point to the delta base), so subtract some header bytes.
    final int limit = res.size() / 2 - 20;

    // Distribute the delta limit over the entire chain length, so that
    // deeper items must be even smaller: they cost more to unpack.
    final int remainingDepth = maxDepth - src.depth();

    // Multiply in 64 bits. limit can approach 2^30 for very large
    // objects, and an int product with the remaining depth would wrap
    // silently and yield a bogus (possibly negative) limit.
    final long scaled = ((long) limit * remainingDepth) / maxDepth;
    return (int) scaled;
}
private DeltaIndex index(DeltaWindowEntry ent)
throws MissingObjectException, IncorrectObjectTypeException,
IOException, LargeObjectException {

Loading…
Cancel
Save